In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Read the raw shopping data; the file is decoded as ISO-8859-1.
file_path = "data/shopping_data.csv"
shopping_df = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)
# Preview the first rows of the loaded frame.
shopping_df.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# With an unsupervised model for this dataset, we would want to group customers by spending habits.

# 1. What data is available?
    
# Display the column labels of the loaded frame.
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [6]:
# List the dtype pandas inferred for each column; non-numeric columns
# show up here as `object`.
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [7]:
# Count the missing values in each column
shopping_df.isna().sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

In [8]:
# Discard every row that contains at least one missing value;
# the original frame is left untouched.
new_shopping_df = shopping_df.dropna(axis="index", how="any")

In [10]:
# Confirm that no missing values remain after dropping rows
new_shopping_df.isna().sum()

CustomerID                0
Card Member               0
Age                       0
Annual Income             0
Spending Score (1-100)    0
dtype: int64

In [11]:
# Count rows that are exact repeats of an earlier row
new_shopping_df.duplicated(keep="first").sum()

0

In [13]:
# Remove the CustomerID column.
# `columns=` already targets the column axis, so no `axis` argument is
# needed, and the labels are passed as a list rather than a set literal.
clean_shopping_df = new_shopping_df.drop(columns=["CustomerID"])
clean_shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [15]:
# Transform string column
def change_string(member):
    """Encode a "Card Member" value as an integer flag.

    Args:
        member: A single cell value: either the raw string ("Yes" / "No")
            or a flag previously produced by this function (1 / 0).

    Returns:
        int: 1 for the string "Yes" or an already-encoded 1, otherwise 0.
    """
    if isinstance(member, str):
        # Exact, case-sensitive match: only "Yes" counts as a member.
        return 1 if member == "Yes" else 0
    # Already-encoded flags pass through unchanged, so applying the function
    # a second time (e.g. re-running the cell) does not reset every 1 to 0.
    return 1 if member == 1 else 0

# Replace the Yes/No strings with the integer flags from change_string.
# The column is overwritten in place on clean_shopping_df.
clean_shopping_df["Card Member"] = clean_shopping_df["Card Member"].map(change_string)
clean_shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [16]:
# Express annual income in thousands (15000 becomes 15.0).
# The column is overwritten, so running this cell twice divides twice.
clean_shopping_df["Annual Income"] = clean_shopping_df["Annual Income"].div(1000)
clean_shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [19]:
# Rename columns to snake_case.
# The renamed frame is assigned back instead of using inplace=True, so the
# cell reads as an explicit "old frame -> new frame" step.
clean_shopping_df = clean_shopping_df.rename(
    columns={
        "Card Member": "card_member",
        "Age": "age",
        "Annual Income": "annual_income",
        "Spending Score (1-100)": "spending_score",
    }
)
clean_shopping_df.head()

Unnamed: 0,card_member,age,annual_income,spending_score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [20]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
output_path = "data/shopping_data_cleaned.csv"
clean_shopping_df.to_csv(path_or_buf=output_path, index=False)