In [2]:
import numpy as np
import pandas as pd

In [27]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv('ecommerce_consumers.csv')
df.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,female
1,0.93,18.2,male
2,0.84,13.6,female
3,0.19,6.0,male
4,0.89,13.2,female


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(200, 3)

In [21]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ratio   200 non-null    float64
 1   time    200 non-null    float64
 2   label   200 non-null    object 
dtypes: float64(2), object(1)
memory usage: 4.8+ KB


In [22]:
# Number of missing values in each column.
df.isna().sum()

ratio    0
time     0
label    0
dtype: int64

In [28]:
# Encode the target as integers: male -> 1, female -> 0.
label_codes = {'male': 1, 'female': 0}

# Guard against re-running this cell: mapping an already-encoded column would
# turn every 0/1 into NaN because the integers are not keys of `label_codes`.
already_encoded = df['label'].isin(list(label_codes.values())).all()
if not already_encoded:
    encoded = df['label'].map(label_codes)
    # Any value outside the mapping becomes NaN; fail loudly instead of
    # letting it reach the models silently.
    unknown = df.loc[encoded.isna(), 'label'].unique()
    if len(unknown):
        raise ValueError(f"Unexpected label values: {list(unknown)}")
    df['label'] = encoded.astype('int64')
df['label']

0      0
1      1
2      0
3      1
4      0
      ..
195    0
196    1
197    1
198    0
199    0
Name: label, Length: 200, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

In [31]:
# Feature matrix: every column of df except the target.
# NOTE(review): this is the whole dataset; no hold-out split is made in the
# visible cells, so later scores are training scores.
X_train = df.drop(columns='label')
X_train

Unnamed: 0,ratio,time
0,0.54,17.2
1,0.93,18.2
2,0.84,13.6
3,0.19,6.0
4,0.89,13.2
...,...,...
195,0.62,17.2
196,0.79,9.8
197,0.91,19.8
198,0.73,18.8


In [32]:
# Target vector: the 'label' column of df (all rows).
y_train = df['label']
y_train

0      0
1      1
2      0
3      1
4      0
      ..
195    0
196    1
197    1
198    0
199    0
Name: label, Length: 200, dtype: int64

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
# Standardise the numeric features (zero mean, unit variance).
# The scaler is fit on the untouched columns of `df` rather than on X_train,
# which this cell overwrites. Re-running the cell therefore never refits on
# already-scaled values, and the fitted mean_/scale_ always describe the raw data.
feature_cols = ['ratio', 'time']
scaler = StandardScaler()

X_train[feature_cols] = scaler.fit_transform(df[feature_cols])

X_train.head()

Unnamed: 0,ratio,time
0,0.132228,0.447943
1,1.451127,0.63663
2,1.146766,-0.23133
3,-1.051399,-1.66535
4,1.315855,-0.306805


### Logistic Regression

In [38]:
import statsmodels.api as sm

In [39]:
# Logistic regression expressed as a binomial GLM in statsmodels.
# add_constant prepends the intercept column, which statsmodels does not add itself.
X_train_sm = sm.add_constant(X_train)
binomial_family = sm.families.Binomial()
logm1 = sm.GLM(endog=y_train, exog=X_train_sm, family=binomial_family)
logm1_fit = logm1.fit()
logm1_fit.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,200.0
Model:,GLM,Df Residuals:,197.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-126.09
Date:,"Fri, 05 May 2023",Deviance:,252.18
Time:,22:24:43,Pearson chi2:,200.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.0002802
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.7311,0.151,4.842,0.000,0.435,1.027
ratio,-0.0078,0.151,-0.052,0.959,-0.304,0.288
time,-0.0350,0.151,-0.232,0.816,-0.331,0.261


In [40]:
# scikit-learn logistic regression with its default settings.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [41]:
# Fit the logistic regression on the scaled features and encoded labels.
logreg.fit(X_train, y_train)

In [42]:
# Predicted classes for the same rows the model was fit on.
y_train_pred = logreg.predict(X_train)
y_train_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [43]:
from sklearn import metrics

In [44]:
# Accuracy of the logistic-regression predictions, measured on the same rows the
# model was fit on (X_train), so this is a training score rather than a
# hold-out estimate.
# Keyword arguments keep ground truth and predictions in scikit-learn's
# (y_true, y_pred) order.
print(metrics.accuracy_score(y_true=y_train, y_pred=y_train_pred))

0.675


### Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [50]:
# Shallow decision tree (depth capped at 2).
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits could otherwise be broken
# differently from run to run.
dt = DecisionTreeClassifier(max_depth=2, random_state=42)
dt.fit(X_train, y_train)

In [51]:
# Predict the training rows with the current tree (rebinds y_train_pred).
y_train_pred = dt.predict(X_train)

In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [53]:
# Accuracy of the current y_train_pred against y_train (training-set score).
print(accuracy_score(y_train, y_train_pred))

0.91


In [64]:
# Tree regularised by leaf size instead of depth: every leaf must hold at least
# 20 training samples.
# random_state is pinned so tie-breaking between equally good splits is
# repeatable across runs.
dt = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt.fit(X_train, y_train)

In [65]:
# Predict the training rows with the current tree (rebinds y_train_pred).
y_train_pred = dt.predict(X_train)

In [66]:
# Accuracy of the current y_train_pred against y_train (training-set score).
print(accuracy_score(y_train, y_train_pred))

0.935


### Hyperparameter Tuning

In [54]:
# Base estimator for the grid search below; rebinds `dt`. The fixed seed makes
# tree construction repeatable.
dt = DecisionTreeClassifier(random_state=42)

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
# Parameter grid for the exhaustive grid search:
# 3 depths x 4 minimum leaf sizes x 2 split criteria = 24 candidates.
params = {
    'max_depth': [2, 3, 5],
    'min_samples_leaf': [5, 10, 20, 50],
    'criterion': ["gini", "entropy"]
}

In [57]:
# Grid search over `params` for the base tree: 4-fold cross-validation,
# scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=params,
    cv=4,
    n_jobs=-1,
    verbose=1,
    scoring="accuracy",
)

In [58]:
%%time
# Cross-validate every candidate in the grid on X_train / y_train and time the run.
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 24 candidates, totalling 96 fits
CPU times: total: 125 ms
Wall time: 3.47 s


In [59]:
# Tabulate the cross-validation results (one row per parameter combination)
# and preview the first rows.
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005817,0.000317,0.002871,0.0002264733,gini,2,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.94,0.88,0.92,0.88,0.905,0.025981,9
1,0.00475,0.000434,0.002633,0.0009664251,gini,2,10,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.94,0.88,0.92,0.88,0.905,0.025981,9
2,0.003253,0.000432,0.002,4.915125e-07,gini,2,20,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.94,0.88,0.92,0.88,0.905,0.025981,9
3,0.002006,6e-06,0.001252,0.000432122,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.66,0.68,0.68,0.68,0.675,0.00866,19
4,0.00426,0.000439,0.00263,0.0006575799,gini,3,5,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",0.98,0.98,0.98,0.94,0.97,0.017321,3


In [60]:
# Show the estimator built from the best-scoring parameter combination.
grid_search.best_estimator_

In [61]:
# Keep a handle on the best estimator for evaluation.
dt_best = grid_search.best_estimator_

In [62]:
from sklearn.metrics import classification_report

In [63]:
# Per-class precision / recall / F1 of the tuned tree.
# NOTE(review): predictions are made on X_train, the same data used for the grid
# search, so these figures are training scores, not a hold-out estimate.
print(classification_report(y_train, dt_best.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        65
           1       1.00      1.00      1.00       135

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



### SVM

In [67]:
from sklearn.svm import SVC

In [68]:
# Linear-kernel SVM: fit on the training data (fit returns the estimator itself),
# then predict the same rows.
model_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred = model_linear.predict(X_train)

In [69]:
# Accuracy of the current y_pred against y_train (training-set score).
print("accuracy:", metrics.accuracy_score(y_true=y_train, y_pred=y_pred), "\n")

accuracy: 0.675 



In [71]:
# RBF-kernel SVM: fit on the training data (fit returns the estimator itself),
# then predict the same rows.
non_linear_model = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = non_linear_model.predict(X_train)

In [72]:
# Accuracy of the current y_pred against y_train (training-set score).
print("accuracy:", metrics.accuracy_score(y_true=y_train, y_pred=y_pred), "\n")

accuracy: 0.955 



In [76]:
# Polynomial-kernel SVM: fit on the training data (fit returns the estimator
# itself), then predict the same rows. Rebinds non_linear_model and y_pred.
non_linear_model = SVC(kernel='poly').fit(X_train, y_train)
y_pred = non_linear_model.predict(X_train)

In [77]:
# Accuracy of the current y_pred against y_train (training-set score).
print("accuracy:", metrics.accuracy_score(y_true=y_train, y_pred=y_pred), "\n")

accuracy: 0.675 



In [78]:
# Sigmoid-kernel SVM: fit on the training data (fit returns the estimator
# itself), then predict the same rows. Rebinds non_linear_model and y_pred.
non_linear_model = SVC(kernel='sigmoid').fit(X_train, y_train)
y_pred = non_linear_model.predict(X_train)

In [79]:
# Accuracy of the current y_pred against y_train (training-set score).
print("accuracy:", metrics.accuracy_score(y_true=y_train, y_pred=y_pred), "\n")

accuracy: 0.415 

