# Churn Prediction

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

In [3]:
train_data = pd.read_csv('Dataset/train.csv')
test_data = pd.read_csv('Dataset/test.csv')
train_data.head()
train_data['Churn'].value_counts()

Churn
0    199605
1     44182
Name: count, dtype: int64

In [106]:
def getThresholdValue(df,var):
    q1 = df[var].quantile(0.25)
    q3 = df[var].quantile(0.75)
    inter_quantile_range = q3-q1
    lowerLimit = q1-1.5*inter_quantile_range
    upperLimit = q3+1.5*inter_quantile_range
    return lowerLimit,upperLimit
    
def outlierChecker(df,var):
    lower,upper=getThresholdValue(df,var)
    if df[(df[var]>upper) | (df[var]<lower)].shape[0]>0:
        return True
    else:
        return False
def fixOutlier(var):
    lower,upper=getThresholdValue(train_data,var)
    train_data.loc[train_data[var]<lower,var]=lower
    train_data.loc[train_data[var]>upper,var]=upper
        
# print(train_data["AccountAge"])
# print(outlierChecker(train_data,"AccountAge"))

num_var = [i for i in train_data.columns if ((train_data[i].dtypes in ["int64","float64"]) and i!="Churn")]

def printOuliers(df):
    for i in num_var:
        print(i," ",outlierChecker(df,i))
print(printOuliers(train_data))

for i in num_var:
    if outlierChecker(train_data,i):
        fixOutlier(i) 
        
print(printOuliers(train_data))
# if their is no outlier then no need of standard scaler

AccountAge   False
MonthlyCharges   False
TotalCharges   False
ViewingHoursPerWeek   False
AverageViewingDuration   False
ContentDownloadsPerMonth   False
UserRating   False
SupportTicketsPerMonth   False
WatchlistSize   False
None
AccountAge   False
MonthlyCharges   False
TotalCharges   False
ViewingHoursPerWeek   False
AverageViewingDuration   False
ContentDownloadsPerMonth   False
UserRating   False
SupportTicketsPerMonth   False
WatchlistSize   False
None


In [5]:
train_data.isnull().any()
# print(train_data.isnull().sum())
# print(train_data.shape[0])
# print(train_data['AccountAge'].value_counts(),end="   ")
# print(train_data['MonthlyCharges'].value_counts(),end="   ")
# print(train_data['TotalCharges'].value_counts(),end="   ")
print(train_data['SubscriptionType'].value_counts(),end="   ")
print(train_data['PaymentMethod'].value_counts(),end="   ")
print(train_data['PaperlessBilling'].value_counts(),end="   ")
print(train_data['ContentType'].value_counts(),end="   ")
print(train_data['MultiDeviceAccess'].value_counts(),end="   ")
print(train_data['DeviceRegistered'].value_counts())
train_data.dtypes


cat_var = [i for i in train_data.columns if train_data[i].dtypes in ["object", "category", "bool"]]
    
num_but_cat = [i for i in train_data.columns if train_data[i].dtypes in ["int64", "float64"]
               and train_data[i].nunique() <= 20]

cat_but_car = [i for i in train_data.columns if train_data[i].dtypes in ["category", "object"]
               and train_data[i].nunique() > 20]

cat_var = cat_var + num_but_cat
cat_var = [i for i in cat_var if i not in cat_but_car]
cat_var=[i for i in cat_var if i!="Churn"]
train_data_num=pd.get_dummies(data=train_data, columns=cat_var, drop_first=True,dtype="int32")
train_data_num
cat_var

SubscriptionType
Standard    81920
Basic       81050
Premium     80817
Name: count, dtype: int64   PaymentMethod
Electronic check    61313
Credit card         60924
Bank transfer       60797
Mailed check        60753
Name: count, dtype: int64   PaperlessBilling
No     121980
Yes    121807
Name: count, dtype: int64   ContentType
Both        81737
TV Shows    81145
Movies      80905
Name: count, dtype: int64   MultiDeviceAccess
No     122035
Yes    121752
Name: count, dtype: int64   DeviceRegistered
Computer    61147
Tablet      61143
Mobile      60914
TV          60583
Name: count, dtype: int64


['SubscriptionType',
 'PaymentMethod',
 'PaperlessBilling',
 'ContentType',
 'MultiDeviceAccess',
 'DeviceRegistered',
 'GenrePreference',
 'Gender',
 'ParentalControl',
 'SubtitlesEnabled',
 'SupportTicketsPerMonth']

In [35]:
y=train_data['Churn']
x=train_data_num.drop("Churn",axis=1)
x=x.drop("CustomerID",axis=1)
# xtrain,xtest,ytrain,ytest=train_test_split()
y.value_counts()
# train_data['Churn'].value_counts()
# train_data['Churn']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=20)
xtrain.columns


Index(['AccountAge', 'MonthlyCharges', 'TotalCharges', 'ViewingHoursPerWeek',
       'AverageViewingDuration', 'ContentDownloadsPerMonth', 'UserRating',
       'WatchlistSize', 'SubscriptionType_Premium',
       'SubscriptionType_Standard', 'PaymentMethod_Credit card',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'PaperlessBilling_Yes', 'ContentType_Movies', 'ContentType_TV Shows',
       'MultiDeviceAccess_Yes', 'DeviceRegistered_Mobile',
       'DeviceRegistered_TV', 'DeviceRegistered_Tablet',
       'GenrePreference_Comedy', 'GenrePreference_Drama',
       'GenrePreference_Fantasy', 'GenrePreference_Sci-Fi', 'Gender_Male',
       'ParentalControl_Yes', 'SubtitlesEnabled_Yes',
       'SupportTicketsPerMonth_1', 'SupportTicketsPerMonth_2',
       'SupportTicketsPerMonth_3', 'SupportTicketsPerMonth_4',
       'SupportTicketsPerMonth_5', 'SupportTicketsPerMonth_6',
       'SupportTicketsPerMonth_7', 'SupportTicketsPerMonth_8',
       'SupportTicketsPerMo

In [40]:
sc = []

In [42]:
rf = RandomForestClassifier(n_estimators=100,random_state=42)

rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)
sc.append({"paramets inc.",accuracy_score(ytest,ypred)})
sc

[{0.8222225868377443, 'Normal'}, {0.8215334635010747, 'paramets inc.'}]

In [61]:
# sc.append({"precision.",precision_score(ytest,ypred)})
print(sc)
print(classification_report(ytest,ypred))
rf.get_params()

[{0.8222225868377443, 'Normal'}, {0.8215334635010747, 'paramets inc.'}, {0.5556155507559395, 'precision.'}, {0.5556155507559395, 'precision.'}, {0.5556155507559395, 'precision.'}, {0.5556155507559395, 'precision.'}, {0.5556155507559395, 'precision.'}]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90     49927
           1       0.56      0.09      0.16     11020

    accuracy                           0.82     60947
   macro avg       0.69      0.54      0.53     60947
weighted avg       0.78      0.82      0.77     60947



{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [43]:
lsc=[]

In [63]:
lr = LogisticRegression()
lr.fit(xtrain,ytrain)
ypred = lr.predict(xtest)
# lsc.append({"Normal",accuracy_score(ytest,ypred)})

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:

# sc.append({"precision.",precision_score(ytest,ypred)})
print(lsc)
print(classification_report(ytest,ypred))
lr.get_params()

[{0.822567148506079, 'parameter inc.'}, {0.8241258798628316, 'parameter inc.'}, {0.822567148506079, 'Normal'}]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90     49927
           1       0.56      0.09      0.16     11020

    accuracy                           0.82     60947
   macro avg       0.69      0.54      0.53     60947
weighted avg       0.78      0.82      0.77     60947



{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [103]:
# test_data=test_data.drop("CustomerID",axis=1)
test_data
# test_data["AccountAge"].nunique() > 20

catVar = [i for i in test_data.columns if test_data[i].dtypes in ["object","bool","category"]]
numVar = [i for i in test_data.columns if test_data[i].dtypes in ["int64","float64"] and test_data[i].nunique() >20]
numVar_but_cat = [i for i in test_data.columns if test_data[i].dtypes in ["int64","float64"] and test_data[i].nunique() <=20]

# test_data["SupportTicketsPerMonth"].value_counts()
numVar
catVar
numVar_but_cat
catVar = catVar+numVar_but_cat
catVar

test_data_enc = pd.get_dummies(data=test_data,columns=catVar,drop_first=True,dtype="int64")
test_data_enc
leng = len(test_data_enc)
leng
xtrain


Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,UserRating,WatchlistSize,SubscriptionType_Premium,SubscriptionType_Standard,...,SubtitlesEnabled_Yes,SupportTicketsPerMonth_1,SupportTicketsPerMonth_2,SupportTicketsPerMonth_3,SupportTicketsPerMonth_4,SupportTicketsPerMonth_5,SupportTicketsPerMonth_6,SupportTicketsPerMonth_7,SupportTicketsPerMonth_8,SupportTicketsPerMonth_9
0,38,17.869374,679.036195,29.126308,122.274031,42,3.522724,23,1,0,...,0,0,1,0,0,0,0,0,0,0
1,77,9.912854,763.289768,36.873729,57.093319,43,2.021545,22,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5,15.019011,75.095057,7.601729,140.414001,14,4.806126,22,0,1,...,1,0,1,0,0,0,0,0,0,0
3,88,15.357406,1351.451692,35.586430,177.002419,14,4.943900,23,0,1,...,1,0,0,0,0,0,0,0,0,0
4,91,12.406033,1128.949004,23.503651,70.308376,6,2.846880,0,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104475,80,17.348236,1387.858873,19.189141,135.445204,35,1.411831,14,0,1,...,1,0,0,0,0,0,0,1,0,0
104476,20,8.275459,165.509180,30.986604,114.868640,17,2.783849,8,1,0,...,0,0,1,0,0,0,0,0,0,0
104477,106,18.134343,1922.240365,7.236303,109.583153,31,2.991527,12,0,0,...,1,1,0,0,0,0,0,0,0,0
104478,46,19.774010,909.604454,25.809285,115.153570,1,4.998019,12,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
num_var = [i for i in test_data_enc.columns if ((test_data_enc[i].dtypes in ["int64","float64"]) and i!="Churn")]
def checkOuliers(df):
    for i in num_var:
        print(i," ",outlierChecker(df,i))
checkOuliers(test_data_enc)

AccountAge   False
MonthlyCharges   False
TotalCharges   True
ViewingHoursPerWeek   False
AverageViewingDuration   False
ContentDownloadsPerMonth   False
UserRating   False


KeyError: 'SupportTicketsPerMonth'