# 3_4choose_learning_machine_models with SMOTE (Filtered_version)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#load the file 
data = pd.read_csv('../data/cleaned_data/cleaned_data.csv')
data.head()


Unnamed: 0,senior_citizen,partner,dependents,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,...,internet_service_Fiber optic,internet_service_No,contract_Month-to-month,contract_One year,contract_Two year,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,churn
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,No
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,No
2,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,Yes
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,0,0,No
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,0,1,0,Yes


In [3]:
columns_to_keep = ['total_charges', 'online_security', 'streaming_movies', 'dependents', 'tech_support',
'multiple_lines' , 'partner', 'online_backup', 'senior_citizen', 'contract_Two year', 
'internet_service_No', 'monthly_charges','churn'] 

# Drop columns not in the list
filtered_df = data.loc[:, columns_to_keep]
filtered_df

Unnamed: 0,total_charges,online_security,streaming_movies,dependents,tech_support,multiple_lines,partner,online_backup,senior_citizen,contract_Two year,internet_service_No,monthly_charges,churn
0,0.032379,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.123197,No
1,0.592184,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.399669,No
2,0.147354,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0.368614,Yes
3,0.586480,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0.251741,No
4,0.184582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.536070,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0.603662,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0,0,0.674013,No
7028,0.947862,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0,0,0.851641,No
7029,0.290455,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0,0.120582,No
7030,0.273348,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0,0,0.572441,Yes


In [5]:
data = filtered_df
data.shape

(7032, 13)

### x/y split

In [6]:
#separate the features from the labels
y = data['churn']
X = data.drop(['churn'], axis=1)

In [7]:
# change the type of 'y'
y.dtypes

dtype('O')

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
display(y_train.value_counts())
display(y_test.value_counts())

churn
No     4130
Yes    1495
Name: count, dtype: int64

churn
No     1033
Yes     374
Name: count, dtype: int64

#### It is imbalanced,so it needs up/down sampling or SMOTE

In [10]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
# Fit and apply SMOTE to generate synthetic samples for the minority class
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [11]:
X_resampled.shape

(8260, 12)

In [12]:
class_counts = y_resampled.value_counts()
print(class_counts)

churn
Yes    4130
No     4130
Name: count, dtype: int64


In [13]:
# save 
X_resampled.to_csv('../data/x_y_data/SMOTE/filtered2_X_resampled.csv', index=False)
y_resampled.to_csv('../data/x_y_data/SMOTE/filtered2_y_resampled.csv', index=False)
X_test.to_csv('../data/x_y_data/SMOTE/filtered2_X_test.csv', index=False)
y_test.to_csv('../data/x_y_data/SMOTE/filtered2_y_test.csv', index=False)

### Choosing models

In [14]:
X_resampled.shape

(8260, 12)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


models = []
models.append(('LR', LogisticRegression(random_state = 12345)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(random_state = 12345)))
models.append(('GB', GradientBoostingClassifier(random_state = 12345)))

#### Kappa

In [16]:
# evaluate each model in turn
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer

import warnings
warnings.filterwarnings('ignore')

X_resampled2 = np.ascontiguousarray(X_resampled)
y_resampled2 = np.ascontiguousarray(y_resampled)


kappa_scorer = make_scorer(cohen_kappa_score)

names = []
results = []

for name, model in models:
    kfold = KFold(n_splits=10, shuffle=False)
    cv_results = cross_val_score(model, X_resampled2, y_resampled2, cv=kfold, scoring=kappa_scorer)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean()} ({cv_results.std()})"
    print(msg)


LR: 0.3155964583831451 (0.2134543406307721)
KNN: 0.31465314036829106 (0.21200437123853103)
RF: 0.3700482658170136 (0.24727795170663966)
GB: 0.33279391115904666 (0.2230088795544664)


In [None]:
# boxplot algorithm comparison
fig = plt.figure(figsize=(15,10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
results_dict = {name: result for name, result in zip(names, results)}

# Create the DataFrame using the dictionary
score = pd.DataFrame(results_dict)

# Print the DataFrame
display(score)