In [8]:
# Load libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn import preprocessing
import pandas as pd

# 1. Load Data

In [9]:
# Read the customer transactions CSV into the working frame and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='online_store_customer_data.csv')
df.head(n=3)

Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal,Amount_spent
0,1/1/2019,151200,Female,19.0,Single,Kansas,Basic,Unemployment,Other,1.0,2051.36
1,1/1/2019,151201,Male,49.0,Single,Illinois,Basic,self-employed,Card,0.0,544.04
2,1/1/2019,151202,Male,63.0,Married,New Mexico,Basic,workers,PayPal,1.0,1572.6


## 1.1 Data Pre-processing

In [10]:
# Report the dimensions of the data frame: the full shape tuple, then rows and columns separately.
print("Data Shape: ", df.shape)
print("Rows in Data: ", len(df))
print("Columns in Data: ", len(df.columns))

Data Shape:  (2512, 11)
Rows in Data:  2512
Columns in Data:  11


## 1.2. Missing Values

In [11]:
# Count missing values per column before any cleaning.
df.isna().sum()

Transaction_date      0
Transaction_ID        0
Gender               28
Age                  42
Marital_status        0
State_names           0
Segment               0
Employees_status     26
Payment_method        0
Referal             155
Amount_spent        242
dtype: int64

In [12]:
# Drop column Transaction_date from the main dataframe.
df = df.drop(columns='Transaction_date')
# Remove rows where Employees_status is missing.
df = df.dropna(subset=['Employees_status'])

# Imputation values, computed after the row removal above so that the
# dropped rows do not contribute to them.
# Mean of Amount_spent.
mean_AS = df['Amount_spent'].mean()
# Mean of Age.
mean_Age = df['Age'].mean()
# Mean of Referal.
# NOTE(review): the previewed rows show Referal as 0.0/1.0, so a mean fill
# produces a fractional value in an otherwise binary-looking column -
# confirm this is intended (the mode may be more appropriate).
mean_R = df['Referal'].mean()
# Mode of Employees_status. Rows missing this column were already removed,
# so this fill is a no-op and is kept only for completeness.
mode_emp = df['Employees_status'].mode().iloc[0]
# Mode of Gender.
mode_Gender = df['Gender'].mode().iloc[0]

# Fill on the frame itself and assign the result back.
# `df[col].fillna(value, inplace=True)` operates on a column selection
# (chained assignment): it emits a FutureWarning in pandas 2.x and does not
# update `df` at all once Copy-on-Write is enabled.
df = df.fillna({
    'Amount_spent': mean_AS,
    'Age': mean_Age,
    'Referal': mean_R,
    'Employees_status': mode_emp,
    'Gender': mode_Gender,
})

In [13]:
# Re-count missing values per column after the cleaning step.
df.isna().sum()

Transaction_ID      0
Gender              0
Age                 0
Marital_status      0
State_names         0
Segment             0
Employees_status    0
Payment_method      0
Referal             0
Amount_spent        0
dtype: int64

## 1.3. Inconsistent Data

In [14]:
# Rows that pair Employees_status 'Unemployment' with Payment_method 'Other'
# are treated as inconsistent and removed; the shape is printed before and after.
print('Data shape before droping rows', df.shape)
is_inconsistent = df['Employees_status'].eq('Unemployment') & df['Payment_method'].eq('Other')
df = df.drop(index=df.index[is_inconsistent.to_numpy()])
print('Data shape after droping rows', df.shape)

Data shape before droping rows (2486, 10)
Data shape after droping rows (2428, 10)


## 1.4. Encodings

In [15]:
# Print the frame summary: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2428 entries, 1 to 2511
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction_ID    2428 non-null   int64  
 1   Gender            2428 non-null   object 
 2   Age               2428 non-null   float64
 3   Marital_status    2428 non-null   object 
 4   State_names       2428 non-null   object 
 5   Segment           2428 non-null   object 
 6   Employees_status  2428 non-null   object 
 7   Payment_method    2428 non-null   object 
 8   Referal           2428 non-null   float64
 9   Amount_spent      2428 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 208.7+ KB


In [16]:
# Convert categorical data into numerical data.

# Nominal columns with a small, known set of labels get explicit integer codes.
category_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'Marital_status': {'Single': 0, 'Married': 1},
    'Employees_status': {'Employees': 0, 'workers': 1, 'self-employed': 2, 'Unemployment': 3},
    'Payment_method': {'PayPal': 0, 'Card': 1, 'Other': 2},
}
for column, codes in category_codes.items():
    # `map` yields a numeric result directly. `replace` on an object column
    # relies on silent downcasting, which is deprecated in recent pandas, and
    # would leave any unexpected label in place as a string.
    encoded = df[column].map(codes)
    unmapped_rows = encoded.isna()
    if unmapped_rows.any():
        unexpected = sorted(df.loc[unmapped_rows, column].astype(str).unique())
        raise ValueError(f"Unmapped values in column {column!r}: {unexpected}")
    df[column] = encoded.astype('int64')

# State_names has many distinct labels, so use the integer codes of its
# categorical representation instead of a hand-written mapping.
df['State_names'] = df['State_names'].astype('category').cat.codes

# One-hot encode Segment: one indicator column per distinct value, named after
# the value itself (no prefix). The dtype is pinned so the indicator columns
# are uint8 regardless of the pandas version's default.
one_hot = pd.get_dummies(df['Segment'], dtype='uint8')
# Replace the original Segment column with its indicator columns (joined on the index).
df = df.drop(columns='Segment').join(one_hot)

In [17]:
# Print the frame summary again: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2428 entries, 1 to 2511
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction_ID    2428 non-null   int64  
 1   Gender            2428 non-null   int64  
 2   Age               2428 non-null   float64
 3   Marital_status    2428 non-null   int64  
 4   State_names       2428 non-null   int8   
 5   Employees_status  2428 non-null   int64  
 6   Payment_method    2428 non-null   int64  
 7   Referal           2428 non-null   float64
 8   Amount_spent      2428 non-null   float64
 9   Basic             2428 non-null   uint8  
 10  Gold              2428 non-null   uint8  
 11  Missing           2428 non-null   uint8  
 12  Platinum          2428 non-null   uint8  
 13  Silver            2428 non-null   uint8  
dtypes: float64(3), int64(5), int8(1), uint8(5)
memory usage: 249.5 KB


# 2. Feature Selection


# 3. Splitting the data

# 4. Building Decision Tree Model

# 5. Classification Report

# 6. Feature Importance