In [1]:
import pandas as pd

In [2]:
# Load the shopping data CSV into a DataFrame and preview the first rows.
df_shopping = pd.read_csv("resources/shopping_data.csv")
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


## Data Selection

Looking at the data we have available and deciding what we want to get out of it.

In [3]:
# Question: which columns are available in the dataset?

df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
# Question: what data type does each column hold?
df_shopping.dtypes

# Note: 'Card Member' is an object column, so it will need to be converted
# to a numeric type before it can be used by an ML model.

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [5]:
# Question: is any data missing?

for col in df_shopping.columns:
    # Count the null entries of this column before reporting them.
    missing = df_shopping[col].isnull().sum()
    print(f'Column {col} has {missing} missing values')

# Only a handful of rows have gaps, so we will simply remove those rows with dropna().
# If a single column were missing a large share of its values, we would have to
# consider dropping that column instead.

Column CustomerID has 0 missing values
Column Card Member has 2 missing values
Column Age has 2 missing values
Column Annual Income has 0 missing values
Column Spending Score (1-100) has 1 missing values


In [6]:
# Drop the rows that contain missing values (dropna with its default settings)
# and keep the result under the same name.

df_shopping = df_shopping.dropna()

In [7]:
# Check for duplicate entries: count the rows flagged by duplicated().

df_shopping.duplicated().sum()

0

In [8]:
# Question: which columns tell us nothing useful and can be removed?
# CustomerID looks like a plain row identifier, so it is dropped here.

# Reassign instead of using inplace=True, and tolerate an already-removed
# column, so this cell can be re-run without raising a KeyError.
df_shopping = df_shopping.drop(columns="CustomerID", errors="ignore")

df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


## Data Processing

Now that we know what we want to get out of our data, it's time to move on to what the model needs in order to work with it.

In [9]:
# Make sure null values are handled, only numerical data is used, and values are scaled.

# There are a few ways to convert text to numbers. get_dummies works, but it creates multiple columns.
# In the last module we passed a dictionary to apply the change, with each key set to the current value and each value set to the value we want in its place.
# This time we'll write a function instead.

def change_string(member):
    """Encode a card-membership label as an integer flag.

    Returns 1 when ``member`` is exactly the string 'Yes' and 0 for any
    other value.
    """
    return 1 if member == 'Yes' else 0

# Encode only while the column still holds the raw text labels. Running
# change_string over already-encoded 0/1 values would silently turn every row
# into 0, so re-executing this cell must leave the column untouched.
if not pd.api.types.is_numeric_dtype(df_shopping['Card Member']):
    df_shopping['Card Member'] = df_shopping['Card Member'].apply(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [10]:
# Annual income sits on a much larger scale than the other columns.
# Expressing it in thousands brings it to roughly the 0-100 range, so the
# remaining columns can be left as they are.
# Caution: this overwrites the column, so running the cell a second time
# divides the values by 1000 again.

df_shopping['Annual Income'] = df_shopping['Annual Income'].div(1000)

df_shopping.sample()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
129,1,40.0,71.0,95.0


In [11]:
# Give the columns shorter names without spaces or parentheses.
df_shopping = df_shopping.rename(
    columns={
        "Card Member": "CardMember",
        "Annual Income": "Annual_Inc",
        "Spending Score (1-100)": "SpendingScore",
    }
)
df_shopping

Unnamed: 0,CardMember,Age,Annual_Inc,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
...,...,...,...,...
198,0,35.0,120.0,79.0
199,0,45.0,126.0,28.0
200,1,32.0,126.0,74.0
201,1,32.0,137.0,18.0


## Data Transformation

Now that the data has been prepared for an ML model, we export the cleaned data in case we or anyone else needs to use it in the future. Why redo work that's already been done?

In [13]:
# Question: can I quickly hand off this data for others to use?
# index=False leaves the DataFrame index out of the exported file.

df_shopping.to_csv("resources/shopping_data_cleaned.csv", index=False)