In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [2]:
#Load the dataset and explore the variables.
data = pd.read_csv("customer_churn.csv")

In [3]:
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
#We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.
X = data[['SeniorCitizen', 'tenure', 'MonthlyCharges']]
y = data['Churn']

In [5]:
#Check for null values
X.isna().sum()
y.isna().sum()

0

In [6]:
#Test train split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
X_train_num = X_train.select_dtypes(include = np.number)

In [8]:
#Scaling
transformer = MinMaxScaler().fit(X_train_num)
X_train_normalized = transformer.transform(X_train_num)
X_train_norm = pd.DataFrame(X_train_normalized)

X_train_norm

Unnamed: 0,0,1,2
0,0.0,0.333333,0.363184
1,0.0,0.347222,0.560697
2,0.0,0.013889,0.501493
3,0.0,0.916667,0.863184
4,0.0,0.041667,0.467662
...,...,...,...
5277,0.0,0.583333,0.267164
5278,0.0,0.152778,0.026368
5279,0.0,0.972222,0.978109
5280,0.0,0.013889,0.314925


In [9]:
#Logistic Regression Model
classification = LogisticRegression(solver='lbfgs',
                  multi_class='auto').fit(X_train_norm, y_train) 

classification


In [10]:
X_test_normalized = transformer.transform(X_test)
X_test_norm = pd.DataFrame(X_test_normalized)
X_test_norm 

Unnamed: 0,0,1,2
0,1.0,0.694444,0.900498
1,1.0,0.013889,0.558706
2,0.0,0.250000,0.774129
3,0.0,0.041667,0.307463
4,0.0,0.013889,0.010448
...,...,...,...
1756,0.0,0.847222,0.852239
1757,0.0,0.194444,0.370647
1758,1.0,0.013889,0.701990
1759,0.0,0.708333,0.408458


In [11]:
#Evaluate the model.
predictions = classification.predict(X_test_norm)
classification.score(X_test_norm, y_test)

0.7955706984667802

In [12]:

pd.Series(predictions).value_counts()

No     1453
Yes     308
dtype: int64

In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[1195,  102],
       [ 258,  206]])

In [14]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = classification.predict(X_test_norm)

print("precision: ",precision_score(y_test,pred, pos_label="Yes"))
print("recall: ",recall_score(y_test,pred, average="binary", pos_label="Yes"))
print("f1: ",f1_score(y_test,pred, pos_label="Yes"))

precision:  0.6688311688311688
recall:  0.44396551724137934
f1:  0.533678756476684


In [15]:
#Even a simple model will give us more than 70% accuracy. Why?
#beacuse there is imbalance in the target

In [16]:
data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [17]:
#Resetting Index
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 


In [18]:
#SMOTE
sm = SMOTE(k_neighbors=5)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_norm,y_train)

In [21]:
LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_norm)

print(LR.score(X_train_norm, y_train))
print(LR.score(X_test_norm, y_test))
print("precision: ",precision_score(y_test,pred, average='binary', pos_label = 'Yes'))
print("recall: ",recall_score(y_test,pred, average='binary', pos_label = 'Yes'))
print("f1: ",f1_score(y_test,pred, average='binary', pos_label = 'Yes'))

0.7264293828095418
0.740488358886996
precision:  0.5051094890510949
recall:  0.7456896551724138
f1:  0.6022628372497825


In [19]:
#Using Oversampling instead of SMOTE
train = pd.concat([X_train_norm, y_train],axis=1)
train.head

<bound method NDFrame.head of         0         1         2 Churn
0     0.0  0.333333  0.363184   Yes
1     0.0  0.347222  0.560697    No
2     0.0  0.013889  0.501493   Yes
3     0.0  0.916667  0.863184   Yes
4     0.0  0.041667  0.467662    No
...   ...       ...       ...   ...
5277  0.0  0.583333  0.267164    No
5278  0.0  0.152778  0.026368    No
5279  0.0  0.972222  0.978109    No
5280  0.0  0.013889  0.314925    No
5281  0.0  0.027778  0.518408    No

[5282 rows x 4 columns]>

In [23]:
# separate majority/minority classes
no_churn = train[train['Churn']=='No']
yes_churn = train[train['Churn']=='Yes']


In [24]:
display(no_churn.shape)
display(yes_churn.shape)

(3877, 4)

(1405, 4)

In [27]:
yes_churn_oversampled = resample(yes_churn, #<- sample from here
                                    replace=True, #<- we need replacement, since we don't have enough data otherwise
                                    n_samples = len(no_churn),#<- make both sets the same size
                                    random_state=0)


In [28]:
display(no_churn.shape)
display(yes_churn_oversampled.shape)
yes_churn_oversampled.head(20)

(3877, 4)

(3877, 4)

Unnamed: 0,0,1,2,Churn
2391,0.0,0.083333,0.91194,Yes
1955,1.0,0.444444,0.876617,Yes
4595,1.0,0.375,0.666667,Yes
3006,0.0,0.041667,0.728856,Yes
2752,0.0,0.208333,0.657214,Yes
5205,0.0,0.486111,0.365672,Yes
3834,1.0,0.25,0.768159,Yes
940,0.0,0.013889,0.660199,Yes
2098,0.0,0.013889,0.014925,Yes
4137,0.0,0.083333,0.812935,Yes


In [29]:
train_oversampled = pd.concat([no_churn,yes_churn_oversampled],axis=0)
train_oversampled.head()
train_oversampled.isna().sum()


0        0
1        0
2        0
Churn    0
dtype: int64

In [31]:
#Building new model, test and evaluate it
y_train_over = train_oversampled['Churn'].copy()
X_train_over = train_oversampled.drop('Churn',axis = 1).copy()
y_test.isna().sum()

0

In [32]:
LR_over = LogisticRegression(solver='lbfgs')
LR_over.fit(X_train_over, y_train_over)
display(LR_over.score(X_train_over, y_train_over))
display(LR_over.score(X_train_over, y_train_over))

0.7267216920299201

0.7267216920299201

In [None]:
#The values is kind of same 