### Data Understading and Cleaning

In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading the csv file and putting it into 'df' object.
df = pd.read_csv('ecommerce_consumers.csv')
df.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,female
1,0.93,18.2,male
2,0.84,13.6,female
3,0.19,6.0,male
4,0.89,13.2,female


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
ratio    200 non-null float64
time     200 non-null float64
label    200 non-null object
dtypes: float64(2), object(1)
memory usage: 4.8+ KB


### Data Preparation and Model Building

In [4]:
# Importing preprocessing from sklearn
from sklearn import preprocessing

In [5]:
# encode categorical variables using Label Encoder

# select all categorical variables
df_categorical = df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0,label
0,female
1,male
2,female
3,male
4,female


In [6]:
# apply Label encoder to df_categorical

le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,label
0,0
1,1
2,0
3,1
4,0


In [7]:
# concat df_categorical with original df
df = df.drop(df_categorical.columns, axis=1)
df = pd.concat([df, df_categorical], axis=1)
df.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,0
1,0.93,18.2,1
2,0.84,13.6,0
3,0.19,6.0,1
4,0.89,13.2,0


In [8]:
# look at column types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
ratio    200 non-null float64
time     200 non-null float64
label    200 non-null int32
dtypes: float64(2), int32(1)
memory usage: 4.0 KB


In [9]:
# convert target variable income to categorical
df['label'] = df['label'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
ratio    200 non-null float64
time     200 non-null float64
label    200 non-null category
dtypes: category(1), float64(2)
memory usage: 3.5 KB


In [10]:
# Importing test_train_split from sklearn library
from sklearn.model_selection import train_test_split

In [11]:
X = df.drop(['label'], axis=1)
y = df[['label']]

In [12]:
# scaling the features
from sklearn.preprocessing import scale

X_scaled = scale(X)

In [13]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

### Model Building

# SVM

### Let's fist build two basic models - linear and non-linear with default hyperparameters, and compare the accuracies.

In [16]:
# linear model
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix

model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

# predict
y_pred = model_linear.predict(X_test)


In [17]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# cm
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

accuracy: 0.7 

[[ 0 18]
 [ 0 42]]


In [27]:
# non-linear model
# using rbf kernel, C=1, default value of gamma

# model
non_linear_model = SVC(kernel='poly')

# fit
non_linear_model.fit(X_train, y_train)

# predict
y_pred = non_linear_model.predict(X_test)

In [28]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# cm
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

accuracy: 0.7 

[[ 0 18]
 [ 0 42]]


# Decision Tree


In [20]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier

# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [21]:
# Let's check the evaluation metrics of our default model

# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions
y_pred_default = dt_default.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95        18
           1       1.00      0.95      0.98        42

    accuracy                           0.97        60
   macro avg       0.95      0.98      0.96        60
weighted avg       0.97      0.97      0.97        60



In [22]:
# Printing confusion matrix and accuracy
print(confusion_matrix(y_test,y_pred_default))
print(accuracy_score(y_test,y_pred_default))

[[18  0]
 [ 2 40]]
0.9666666666666667
