In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Reading the data
import os

# Workbook location. The original machine-specific path stays the default so the
# notebook behaves as before; set ECOMMERCE_DATASET_PATH to run it elsewhere.
DATA_PATH = os.environ.get(
    'ECOMMERCE_DATASET_PATH',
    r'C:\Users\Radhika K J\Downloads\Project6\Project Group 6\ECommerceDataset.xlsx',
)
data = pd.read_excel(DATA_PATH, sheet_name='E Comm')
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Collapse labels that represent the same category under different spellings
# into a single canonical value, column by column.
label_fixes = {
    'PreferredLoginDevice': {'Phone': 'Mobile Phone'},
    'PreferredPaymentMode': {'CC': 'Credit Card', 'COD': 'Cash on Delivery'},
    'PreferedOrderCat': {'Mobile': 'Mobile Phone'},
}
for column_name, mapping in label_fixes.items():
    data[column_name] = data[column_name].replace(mapping)

In [None]:
# Order counts must be whole numbers, so round the column to zero decimals.
data['OrderCount'] = data['OrderCount'].round()

In [None]:
# Store these numerically coded columns as 'object' so the later dtype-based
# selection treats them as categorical rather than numerical.
for coded_col in ('CityTier', 'SatisfactionScore', 'Complain'):
    data[coded_col] = data[coded_col].astype('object')

In [None]:
# Fill the missing values using KNN.
# Only the columns that actually contain nulls are passed to the imputer.
# NOTE(review): assumes every column with missing values is numeric -- confirm.
missing_cols = [col for col in data.columns if data[col].isnull().any()]
from sklearn.impute import KNNImputer
impute_knn = KNNImputer(n_neighbors=5)
data_missing = data[missing_cols]
if missing_cols:
    imputed_data = impute_knn.fit_transform(data_missing)
    # fit_transform returns a bare array; reuse data's index so the column-wise
    # concat lines rows up by label even if data does not have a default RangeIndex.
    imputed_frame = pd.DataFrame(imputed_data, columns=missing_cols, index=data.index)
    # Untouched columns first, imputed columns appended at the end.
    data_imputed = pd.concat([data.drop(missing_cols, axis=1), imputed_frame], axis=1)
else:
    # Nothing to impute: keep an independent copy so later in-place edits leave data alone.
    data_imputed = data.copy()

In [None]:
# Winsorize the values above 99th percentile to 99th percentile
# (upper tail only: limits=(0, 0.01) leaves the low end untouched).
from scipy.stats.mstats import winsorize
# Assign the result back explicitly. The previous inplace=True call only worked if
# winsorize happened to receive a writable view of the column's memory, which pandas
# does not guarantee and which Copy-on-Write mode forbids.
data_imputed['Tenure'] = np.asarray(
    winsorize(data_imputed['Tenure'].to_numpy(), limits=(0, 0.01))
)

In [None]:
# Winsorize the values above 95th percentile to 95th percentile
# (upper tail only: limits=(0, 0.05) leaves the low end untouched).
# Assign the result back explicitly instead of relying on inplace=True writing
# through a view of the DataFrame column, which pandas does not guarantee.
data_imputed['NumberOfAddress'] = np.asarray(
    winsorize(data_imputed['NumberOfAddress'].to_numpy(), limits=(0, 0.05))
)

In [None]:
# Conducting Two Sample T-Test to see which numerical columns to select
from scipy.stats import ttest_ind
def num_stats(num_col, df=None):
    """Print a two-sample t-test of a numerical column between churn groups.

    The rows are split on ``Churn == 0`` and ``Churn == 1`` and the two samples
    of ``num_col`` are compared with ``ttest_ind(..., equal_var=False)``
    (unequal variances, i.e. Welch's test). The p-value is printed, followed by
    the decision at the 0.05 significance level.

    Args:
        num_col: Name of the numerical column to test.
        df: DataFrame holding ``num_col`` and a ``Churn`` column. Defaults to
            the notebook-level ``data_imputed`` so existing one-argument calls
            keep working; mirrors the ``df`` argument of ``chisq_test``.

    Returns:
        None. Results are printed.
    """
    if df is None:
        df = data_imputed

    churn_flag = df['Churn']
    group_0 = df.loc[churn_flag == 0, num_col]
    group_1 = df.loc[churn_flag == 1, num_col]

    # Only the p-value is used; the t statistic is discarded.
    _, p_value = ttest_ind(group_0, group_1, equal_var=False)

    print('P-value : ', p_value)
    if p_value < 0.05:
        print('Reject null hypothesis')
    else:
        print('Do not reject null hypothesis')

In [None]:
# Conducting chi-square test of independence on categorical columns.
from scipy.stats import chi2_contingency
def chisq_test(cat_col, df):
    """Print a chi-square test of independence between a categorical column and Churn.

    A contingency table of ``df['Churn']`` against ``df[cat_col]`` is built with
    ``pd.crosstab`` and passed to ``chi2_contingency``. The p-value (second
    element of the result) is printed, followed by the decision at the 0.05
    significance level.

    Args:
        cat_col: Name of the categorical column to test.
        df: DataFrame holding ``cat_col`` and a ``Churn`` column.

    Returns:
        None. Results are printed.
    """
    contingency = pd.crosstab(index=df['Churn'], columns=df[cat_col])
    p_value = chi2_contingency(contingency)[1]

    print('P-Value :', p_value)
    verdict = 'Reject null hypothesis' if p_value < 0.05 else 'Do not reject null hypothesis'
    print(verdict)

In [None]:
# Partition the feature names by dtype: int/float columns are numerical
# (the target 'Churn' is taken out), object columns are categorical.
numerical_cols = list(data_imputed.select_dtypes(include=['int', 'float']).columns)
numerical_cols.remove('Churn')
categorical_cols = list(data_imputed.select_dtypes(include=['object']).columns)

In [None]:
# Print the two-sample t-test result for every numerical feature,
# followed by a divider line so the per-column reports are easy to tell apart.
for col in numerical_cols:
    print('Column Name : ', col)
    num_stats(col)
    print('---------------------------------------------')

In [None]:
# Print the chi-square independence test result for every categorical feature,
# followed by a divider line so the per-column reports are easy to tell apart.
for col in categorical_cols:
    print('Column Name : ', col)
    chisq_test(col, data_imputed)
    print('---------------------------------------------')

In [None]:
# Drop the columns judged not to affect the target 'Churn' -- per the author's note,
# those for which the null hypothesis was not rejected in the tests above.
# NOTE(review): the list is hard-coded from a previous run's output; re-check it if the data changes.
columns_to_drop = [
    'CustomerID',
    'HourSpendOnApp',
    'OrderCount',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
]
data_imputed.drop(columns=columns_to_drop, inplace=True)

In [None]:
# One-hot encode the categorical columns: cast each to 'category' first,
# then expand them into indicator columns.
for cat_name in categorical_cols:
    data_imputed[cat_name] = data_imputed[cat_name].astype('category')
df_encoded = pd.get_dummies(data=data_imputed, columns=categorical_cols)

In [None]:
# The data is imbalanced. So we use an oversampling method of SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=33)
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']
x_smote, y_smote = smote.fit_resample(X, y)
df_encoded = pd.DataFrame(x_smote, columns=df_encoded.drop('Churn', axis=1).columns)
df_encoded['Churn'] = y_smote

In [None]:
from sklearn.preprocessing import StandardScaler
X = df_encoded.drop('Churn', axis=1)
cols = X.columns
y= df_encoded.Churn
# Using StandarScaler to scale the values
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X,columns=cols)

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state= 42)

In [None]:
import optuna
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

def objective(trial):
    """Score one Optuna trial: accuracy of an AdaBoost ensemble built from sampled hyperparameters.

    The trial samples the ensemble size, learning rate, boosting algorithm and the
    kind of weak learner (decision tree, logistic regression or SVM) together with
    that learner's own hyperparameters.

    The model is fitted on one part of the notebook-level ``X_train``/``y_train``
    and scored on the remaining validation part. ``X_test``/``y_test`` are
    deliberately not used here: choosing hyperparameters on the test set and then
    reporting metrics on that same set gives optimistic results.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Accuracy on the validation part of the training data.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0)
    algorithm = trial.suggest_categorical("algorithm", ["SAMME"])
    estimator_kind = trial.suggest_categorical("base_estimator", ["decision_tree", "logistic_regression", "svm"])

    if estimator_kind == "decision_tree":
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
        max_depth = trial.suggest_int("max_depth", 2, 32)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

        base_estimator = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split
        )
    elif estimator_kind == "logistic_regression":
        C = trial.suggest_float("C", 0.1, 10.0)
        penalty = trial.suggest_categorical("penalty", ["l2",None])
        solver = trial.suggest_categorical("solver", ["lbfgs"])

        base_estimator = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver
        )
    elif estimator_kind == "svm":
        C = trial.suggest_float("C", 0.1, 10.0)
        kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"])
        gamma = trial.suggest_categorical("gamma", ["scale", "auto"])

        base_estimator = SVC(
            C=C,
            kernel=kernel,
            gamma=gamma
        )

    adaboost = AdaBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        algorithm=algorithm,
        estimator=base_estimator,
        random_state=42  # fixed seed so a trial's score is repeatable
    )

    # Fixed random_state: every trial is scored on the same validation rows.
    # NOTE(review): if X_train was oversampled beforehand, the validation part may
    # contain synthetic rows; the score is then only a relative ranking signal.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
    )

    adaboost.fit(X_fit, y_fit)

    y_val_pred = adaboost.predict(X_val)

    return accuracy_score(y_val, y_val_pred)

# Seed the sampler so the search proposes the same sequence of trials on every run;
# without it the reported best parameters change from one execution to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_accuracy = study.best_value
print("Best parameters:", best_params)
print("Best accuracy:", best_accuracy)

In [None]:
# Re-read the winning trial from the finished study and show it again
# (lets the result be inspected without re-running the search).
best_params, best_accuracy = study.best_params, study.best_value
print("Best parameters:", best_params)
print("Best accuracy:", best_accuracy)

In [None]:
# Rebuild the weak learner that the best trial actually used. The search can pick a
# decision tree, logistic regression or SVM, and each stores different keys in
# best_params; reading the decision-tree keys unconditionally raises KeyError
# whenever another learner wins.
# Values recorded from the author's run: n_estimators=147,
# learning_rate=0.42329791924956006, algorithm=SAMME, decision tree with
# criterion=gini, max_depth=27, min_samples_split=7.
best_estimator_kind = best_params['base_estimator']
if best_estimator_kind == 'decision_tree':
    best_base_estimator = DecisionTreeClassifier(
        criterion=best_params['criterion'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
    )
elif best_estimator_kind == 'logistic_regression':
    best_base_estimator = LogisticRegression(
        C=best_params['C'],
        penalty=best_params['penalty'],
        solver=best_params['solver'],
    )
elif best_estimator_kind == 'svm':
    best_base_estimator = SVC(
        C=best_params['C'],
        kernel=best_params['kernel'],
        gamma=best_params['gamma'],
    )
else:
    raise ValueError(f"Unexpected base_estimator in best_params: {best_estimator_kind!r}")

adaboost_best = AdaBoostClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    algorithm=best_params['algorithm'],
    estimator=best_base_estimator,
    random_state=42,  # fixed seed so the final model is reproducible
)

In [None]:
# Train the final ensemble on the training split; the value returned by fit()
# is bound back to the same name.
adaboost_best = adaboost_best.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the four summary metrics.
y_pred = adaboost_best.predict(X_test)

metric_report = (
    ("Accuracy = ", accuracy_score),
    ("Precision = ", precision_score),
    ("Recall = ", recall_score),
    ("F1 Score", f1_score),
)
for metric_label, metric_fn in metric_report:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Draw the confusion matrix of the test predictions as an annotated heatmap
# (rows: true labels, columns: predicted labels, cells: integer counts).
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Churn Confusion Matrix Heatmap')
plt.show()

In [None]:
import pickle

In [None]:
# Persist the fitted scaler. The context manager closes the file handle (and flushes
# the data to disk) even if pickling fails; a bare open() left it open.
with open("scaler_raw.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Persist the trained model. The context manager closes the file handle (and flushes
# the data to disk) even if pickling fails; a bare open() left it open.
with open("adaboost_best_raw.pkl", "wb") as model_file:
    pickle.dump(adaboost_best, model_file)