In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# uploading dataset

data = pd.read_csv('Customer-Churn.csv')
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')

In [5]:
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
data['TotalCharges'][488]

nan

In [7]:
data.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
data = data.dropna()

In [9]:
data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
data2 = data[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges','Churn']]

In [11]:
data2

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,1,0,29.85,29.85,No
1,34,0,56.95,1889.50,No
2,2,0,53.85,108.15,Yes
3,45,0,42.30,1840.75,No
4,2,0,70.70,151.65,Yes
...,...,...,...,...,...
7038,24,0,84.80,1990.50,No
7039,72,0,103.20,7362.90,No
7040,11,0,29.60,346.45,No
7041,4,1,74.40,306.60,Yes


In [12]:
# splitting our data

X = data2.drop('Churn', axis=1)
y = data2['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
transformer = StandardScaler()
transformer.fit(X_train)

StandardScaler()

In [15]:
x_test= transformer.transform(X_test)
x_train= transformer.transform(X_train)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(x_train, y_train)

LogisticRegression(max_iter=10000, random_state=42)

In [18]:
classification.score(x_test, y_test)

0.781042654028436

In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
predictions = classification.predict(X_test)
confusion_matrix(y_test, predictions)

array([[   0, 1549],
       [   0,  561]])

In [21]:
# Cross validation lab. Applying SMOTE...

In [25]:
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

smote = SMOTE()

X_sm, y_sm = smote.fit_resample(np.array(X_train), y_train)
y_sm.value_counts()

No     3614
Yes    3614
Name: Churn, dtype: int64

In [26]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=42)

In [27]:
classification.score(X_test, y_test)

0.7843601895734598

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rfr= RandomForestClassifier()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.7649289099526067

In [35]:
from sklearn.metrics import classification_report

In [36]:
preds = rfr.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.81      0.88      0.85      1549
         Yes       0.57      0.45      0.50       561

    accuracy                           0.76      2110
   macro avg       0.69      0.66      0.67      2110
weighted avg       0.75      0.76      0.75      2110



In [37]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X), y)

y_tl.value_counts()

No     4609
Yes    1869
Name: Churn, dtype: int64

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=42)

In [41]:
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=42)

In [42]:
classification.score(X_test, y_test)

0.7829218106995884

In [43]:
rfr= RandomForestClassifier()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.7885802469135802

In [44]:
preds = rfr.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.83      0.89      0.86      1384
         Yes       0.66      0.55      0.60       560

    accuracy                           0.79      1944
   macro avg       0.74      0.72      0.73      1944
weighted avg       0.78      0.79      0.78      1944

