# Classification Sampling Problem

Data in real world scenarios will never be perfectly balanced. This is one of the core problems to solve. It is more complex than just optimizing one parameter for "overfitting" or "underfitting".

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv('../../data_sets/data.csv')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Prepare data

In [17]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.dropna(how='any', inplace=True)
X=df.drop(['customerID', 'Churn'], axis=1)
y=df.Churn.values

## Feature Encoding and Feature Scaling

In [18]:
# Convert categorical features into numerical > Feature Encoding > Dummy Encoding

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 
'OnlineSecurity','OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

In [19]:
X.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [21]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [22]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Call the Decision Tree model

In [28]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_sc, y_train)

y_pred_dt = model_dt.predict(X_test_sc)

print(accuracy_score(y_test, y_pred_dt)*100)

72.86689419795222


In [30]:
print(df.Churn.value_counts()/len(df)*100)

Churn
No     73.421502
Yes    26.578498
Name: count, dtype: float64


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_dt))

              precision    recall  f1-score   support

          No       0.81      0.81      0.81      1284
         Yes       0.49      0.49      0.49       474

    accuracy                           0.73      1758
   macro avg       0.65      0.65      0.65      1758
weighted avg       0.73      0.73      0.73      1758



## Fix the data imbalance by UPSAMPLING

In [None]:
%pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.1-py3-none-any.whl.metadata (8.9 kB)
Collecting sklearn-compat<0.2,>=0.1.5 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.5-py3-none-any.whl.metadata (20 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.1-py3-none-any.whl (235 kB)
Downloading sklearn_compat-0.1.5-py3-none-any.whl (20 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [imblearn]
[1A[2KSuccessfully installed imbalanced-learn-0.14.1 imblearn-0.0 sklearn-compat-0.1.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpi

In [None]:
# OVERSAMPLER

from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler()
X_train_upsampled, y_train_upsampled = oversampler.fit_resample(X_train_sc, y_train)

# Build the model
model_dt2 = DecisionTreeClassifier()
model_dt2.fit(X_train_upsampled, y_train_upsampled)

# Predict with the test set
y_pred_dt2 = model_dt2.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt2)*100
print("Accuracy: ", accuracy)

Accuracy:  72.07053469852104


In [None]:
# accuracy stayed the same... precision and recall got worse...
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt2))

              precision    recall  f1-score   support

          No       0.81      0.81      0.81      1284
         Yes       0.48      0.48      0.48       474

    accuracy                           0.72      1758
   macro avg       0.65      0.64      0.64      1758
weighted avg       0.72      0.72      0.72      1758



In [37]:
# UNDERSAMPLER

from imblearn.under_sampling import RandomUnderSampler

#downsample the majority class
downsampler = RandomUnderSampler()
X_train_downsampled, y_train_downsampled = downsampler.fit_resample(X_train_sc, y_train)

# Build the model
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_downsampled, y_train_downsampled)

# Predict with the test set
y_pred_dt3 = model_dt3.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt3)*100
print("Accuracy: ", accuracy)

Accuracy:  67.57679180887372


In [40]:
# accuracy got worse... recall and f1 improved
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt3))

              precision    recall  f1-score   support

          No       0.85      0.67      0.75      1284
         Yes       0.44      0.69      0.53       474

    accuracy                           0.68      1758
   macro avg       0.64      0.68      0.64      1758
weighted avg       0.74      0.68      0.69      1758



There are several tools in imblearn to fix over/underfitting:
- SMOTE (synthetic minority oversampling technique)
- SMOTENN (... edited nearest neighbors)
- SMOTETOMOER
- ADASYN
- ROS/RUS

In [None]:
# SMOTEENN

from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, classification_report

# Create SMOTEENN object
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_sc, y_train)

# Build the model
model_dt4 = DecisionTreeClassifier()
model_dt4.fit(X_train_resampled, y_train_resampled)

# Predict with the test set
y_pred_dt4 = model_dt4.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt4)*100
print("Accuracy: ", accuracy)

Accuracy:  72.07053469852104


In [44]:
# Everything improved!
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt4))

              precision    recall  f1-score   support

          No       0.90      0.70      0.78      1284
         Yes       0.49      0.79      0.60       474

    accuracy                           0.72      1758
   macro avg       0.69      0.74      0.69      1758
weighted avg       0.79      0.72      0.74      1758



In [48]:
# SMOTE

from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Create SMOTEENN object
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sc, y_train)

# Build the model
model_dt5 = DecisionTreeClassifier()
model_dt5.fit(X_train_resampled, y_train_resampled)

# Predict with the test set
y_pred_dt5 = model_dt5.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt5)*100
print("Accuracy: ", accuracy)

Accuracy:  72.46871444823664


In [49]:
# Everything improved!
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt5))

              precision    recall  f1-score   support

          No       0.84      0.78      0.80      1284
         Yes       0.49      0.59      0.53       474

    accuracy                           0.72      1758
   macro avg       0.66      0.68      0.67      1758
weighted avg       0.74      0.72      0.73      1758



In [51]:
# ADASYN

from imblearn.over_sampling import ADASYN
from sklearn.metrics import accuracy_score, classification_report

# Create ADASYN object
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sc, y_train)

# Build the model
model_dt6 = DecisionTreeClassifier()
model_dt6.fit(X_train_resampled, y_train_resampled)

# Predict with the test set
y_pred_dt6 = model_dt6.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt6)*100
print("Accuracy: ", accuracy)

Accuracy:  72.2980659840728


In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt6))

              precision    recall  f1-score   support

          No       0.84      0.77      0.80      1284
         Yes       0.49      0.59      0.53       474

    accuracy                           0.72      1758
   macro avg       0.66      0.68      0.67      1758
weighted avg       0.74      0.72      0.73      1758

