In [139]:
import numpy as np
import pandas as pd
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

- Read that data into Python and call the dataframe `churnData`.
- Check the datatypes of all the columns in the data. You would see that the column `TotalCharges` is object type. Convert this column into numeric type using `pd.to_numeric` function.
- Check for null values in the dataframe. Replace the null values.
- Use the following features: `tenure`, `SeniorCitizen`, `MonthlyCharges` and `TotalCharges`:
  - Scale the features either by using normalizer or a standard scaler.
  - Split the data into a training set and a test set.
  - Fit a logistic regression model on the training data.
  - Check the accuracy on the test data.

In [7]:
churn_data = pd.read_csv('files_for_lab/Customer-Churn.csv')

In [21]:
def to_zero(x):
    if x == ' ':
        return 0
    else:
        return float(x)

churn_data['TotalCharges'] = churn_data.TotalCharges.apply(to_zero)

In [24]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [26]:
churn_data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [44]:
y = churn_data['Churn']
X = churn_data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]

### Model 1

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44)
scaler = StandardScaler().fit(X_train)
X_train_t = scaler.transform(X_train)
X_train_t = pd.DataFrame(X_train_t, columns=X.columns)

lgr = LogisticRegression().fit(X_train_t, y_train)
prediction = lgr.predict(X_test)

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

          No       0.00      0.00      0.00      1721
         Yes       0.26      1.00      0.41       604

    accuracy                           0.26      2325
   macro avg       0.13      0.50      0.21      2325
weighted avg       0.07      0.26      0.11      2325



In [64]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

Ok, that is something new. I got the problem that I thought I would get, but the other way around, instead of getting a full recall on the data that had the more counts I got a full recall of the other data.

### Model 2

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=50)
scaler = StandardScaler().fit(X_train)
X_train_t = scaler.transform(X_train)
X_train_t = pd.DataFrame(X_train_t, columns=X.columns)

lgr = LogisticRegression().fit(X_train_t, y_train)
prediction = lgr.predict(X_test)

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

          No       0.00      0.00      0.00      1709
         Yes       0.26      1.00      0.42       616

    accuracy                           0.26      2325
   macro avg       0.13      0.50      0.21      2325
weighted avg       0.07      0.26      0.11      2325



In [181]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=50)

## Concat the data
full_train = pd.concat([X_train, y_train],axis=1)

## Balance the data
train_no = full_train[full_train.Churn == 'No']
length = len(C_no)
train_yes = full_train[full_train.Churn == 'Yes'].sample(length, replace=True)
full_train = pd.concat([train_no, train_yes])
print(full_train.Churn.value_counts())


## Split data again
X_train = full_train.drop(['Churn'],axis=1)
y_train = full_train[['Churn']]

## Scale data
scaler = StandardScaler().fit(X_train)
scaler.transform(X_train)
X_train = pd.DataFrame(X_train, columns=X.columns)


## Run model
lgr = LogisticRegression().fit(X_train, y_train)
prediction = lgr.predict(X_test)
classification_1 = classification_report(y_test, prediction)
print(classification_1)

No     3465
Yes    3465
Name: Churn, dtype: int64
              precision    recall  f1-score   support

          No       0.90      0.69      0.78      1709
         Yes       0.47      0.78      0.59       616

    accuracy                           0.71      2325
   macro avg       0.68      0.73      0.68      2325
weighted avg       0.78      0.71      0.73      2325



In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=50)

## Concat the data
full_train = pd.concat([X_train, y_train],axis=1)

## Balance the data
C_yes = full_train[full_train.Churn == 'Yes']
length = len(C_no)
C_no = full_train[full_train.Churn == 'No'].sample(length, replace=True)
full_train = pd.concat([C_yes, C_no])

## Split data again
X_train = full_train.drop(['Churn'],axis=1)
y_train = full_train[['Churn']]

## Scale data
scaler = StandardScaler().fit(X_train)
scaler.transform(X_train)
X_train = pd.DataFrame(X_train, columns=X.columns)

## Run model
lgr = LogisticRegression().fit(X_train, y_train)
prediction = lgr.predict(X_test)

classification_2 = classification_report(y_test, prediction)
print(classification_2)

              precision    recall  f1-score   support

          No       0.83      0.89      0.86      1709
         Yes       0.62      0.49      0.55       616

    accuracy                           0.79      2325
   macro avg       0.73      0.69      0.70      2325
weighted avg       0.77      0.79      0.78      2325



In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=50)

over_sampler = SMOTE().fit(X_train, y_train)
X_res, y_res = over_sampler.fit_resample(X, y)

# ## Scale data
scaler = StandardScaler().fit(X_res)
scaler.transform(X_res)

## Run model
lgr = LogisticRegression().fit(X_res, y_res)
prediction = lgr.predict(X_test)
classification_3 = classification_report(y_test, prediction)
print(classification_3)

              precision    recall  f1-score   support

          No       0.89      0.70      0.79      1709
         Yes       0.48      0.75      0.58       616

    accuracy                           0.72      2325
   macro avg       0.68      0.73      0.68      2325
weighted avg       0.78      0.72      0.73      2325



In [184]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=50)

under_sampler = TomekLinks(n_jobs=4).fit(X_train, y_train)
X_res, y_res = under_sampler.fit_resample(X, y)

# ## Scale data
scaler = StandardScaler().fit(X_res)
scaler.transform(X_res)

## Run model
lgr = LogisticRegression().fit(X_res, y_res)
prediction = lgr.predict(X_test)
classification_4 = classification_report(y_test, prediction)
print(classification_4)

              precision    recall  f1-score   support

          No       0.89      0.66      0.76      1709
         Yes       0.45      0.78      0.57       616

    accuracy                           0.69      2325
   macro avg       0.67      0.72      0.67      2325
weighted avg       0.78      0.69      0.71      2325



In [185]:
print(classification_1)
print(classification_2)
print(classification_3)
print(classification_4)

              precision    recall  f1-score   support

          No       0.90      0.69      0.78      1709
         Yes       0.47      0.78      0.59       616

    accuracy                           0.71      2325
   macro avg       0.68      0.73      0.68      2325
weighted avg       0.78      0.71      0.73      2325

              precision    recall  f1-score   support

          No       0.83      0.89      0.86      1709
         Yes       0.62      0.49      0.55       616

    accuracy                           0.79      2325
   macro avg       0.73      0.69      0.70      2325
weighted avg       0.77      0.79      0.78      2325

              precision    recall  f1-score   support

          No       0.89      0.70      0.79      1709
         Yes       0.48      0.75      0.58       616

    accuracy                           0.72      2325
   macro avg       0.68      0.73      0.68      2325
weighted avg       0.78      0.72      0.73      2325

              preci