<a href="https://colab.research.google.com/github/ramrajv/Ramraj_Prayag_Kaggle/blob/main/Test/TelecomChurn_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exploratory Data Analysis
### Load libraries

In [62]:
# Load libraries
import numpy as np
import pandas as pd

import datetime as dt

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline

from scipy.stats import norm
from scipy import stats

import sklearn
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

In [63]:
train = pd.read_csv('train.csv')

### Check for unique values

In [64]:
df = train.copy()

In [65]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
circle_id,109,109,109,109,109
loc_og_t2o_mou,0.0,0.0,0.0,0.0,0.0
std_og_t2o_mou,0.0,0.0,0.0,0.0,0.0
loc_ic_t2o_mou,0.0,0.0,0.0,0.0,0.0
last_date_of_month_6,6/30/2014,6/30/2014,6/30/2014,6/30/2014,6/30/2014
last_date_of_month_7,7/31/2014,7/31/2014,7/31/2014,7/31/2014,7/31/2014
last_date_of_month_8,8/31/2014,8/31/2014,8/31/2014,8/31/2014,8/31/2014
arpu_6,31.277,0.0,60.806,156.362,240.708
arpu_7,87.009,122.787,103.176,205.26,128.191


In [66]:
df.shape

(69999, 172)

In [67]:
dfx = df.drop(['id','churn_probability'], axis = 1)

In [68]:
dfx.columns[dfx.isna().sum()/dfx.shape[0]>0]

Index(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
       'last_date_of_month_7', 'last_date_of_month_8', 'onnet_mou_6',
       'onnet_mou_7', 'onnet_mou_8', 'offnet_mou_6', 'offnet_mou_7',
       ...
       'arpu_3g_8', 'arpu_2g_6', 'arpu_2g_7', 'arpu_2g_8', 'night_pck_user_6',
       'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7',
       'fb_user_8'],
      dtype='object', length=125)

### Drop null data

#### Drop rows with all null values

In [69]:
dfx.dropna(how = 'all', inplace = True)

In [70]:
dfx.shape

(69999, 170)

In [71]:
dfx.sample(5).T

Unnamed: 0,16450,55880,14759,22725,23264
circle_id,109,109,109,109,109
loc_og_t2o_mou,0.0,0.0,0.0,0.0,0.0
std_og_t2o_mou,0.0,0.0,0.0,0.0,0.0
loc_ic_t2o_mou,0.0,0.0,0.0,0.0,0.0
last_date_of_month_6,6/30/2014,6/30/2014,6/30/2014,6/30/2014,6/30/2014
last_date_of_month_7,7/31/2014,7/31/2014,7/31/2014,7/31/2014,7/31/2014
last_date_of_month_8,8/31/2014,8/31/2014,8/31/2014,8/31/2014,8/31/2014
arpu_6,187.597,202.762,0.0,20.0,113.841
arpu_7,428.909,106.108,97.563,117.834,262.639
arpu_8,229.89,63.358,235.862,0.0,159.433


#### Drop columns with null values in more than 70% of the rows

In [72]:
drop_cols = dfx.columns[dfx.isna().sum()/dfx.shape[0]>0.7]

In [73]:
drop_cols

Index(['date_of_last_rech_data_6', 'date_of_last_rech_data_7',
       'date_of_last_rech_data_8', 'total_rech_data_6', 'total_rech_data_7',
       'total_rech_data_8', 'max_rech_data_6', 'max_rech_data_7',
       'max_rech_data_8', 'count_rech_2g_6', 'count_rech_2g_7',
       'count_rech_2g_8', 'count_rech_3g_6', 'count_rech_3g_7',
       'count_rech_3g_8', 'av_rech_amt_data_6', 'av_rech_amt_data_7',
       'av_rech_amt_data_8', 'arpu_3g_6', 'arpu_3g_7', 'arpu_3g_8',
       'arpu_2g_6', 'arpu_2g_7', 'arpu_2g_8', 'night_pck_user_6',
       'night_pck_user_7', 'night_pck_user_8', 'fb_user_6', 'fb_user_7',
       'fb_user_8'],
      dtype='object')

In [74]:
dfx.drop(drop_cols, axis = 1, inplace = True)

In [75]:
dfx.dropna(how = 'all',inplace = True)

In [76]:
dfx.shape

(69999, 140)

In [77]:
 # Use only independent variables for check

x = dfx.copy()

In [78]:
# Find columns having only a single unique value.

x.columns[x.nunique()==1]

Index(['circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
       'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8',
       'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
       'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8'],
      dtype='object')

In [79]:
# Drop columns with a single unique value.

x.drop(x.columns[x.nunique()==1], axis = 1, inplace = True)

In [80]:
x.columns[x.nunique()==2]

Index([], dtype='object')

In [81]:
x.columns[x.nunique()<=20]

Index(['monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8', 'monthly_3g_6',
       'monthly_3g_7', 'monthly_3g_8'],
      dtype='object')

In [82]:
# Find categorical variables

cols = x.columns[x.nunique()<=20]
for col in cols:
  print(col,':',x[col].unique())

monthly_2g_6 : [0 1 2 3 4]
monthly_2g_7 : [0 1 2 4 3 5]
monthly_2g_8 : [0 1 2 4 3 5]
monthly_3g_6 : [0 1 2 4 3 5 6 8 7 9]
monthly_3g_7 : [ 0  1  2  3  5  4  8  6  7 16 14  9 10]
monthly_3g_8 : [ 0  1  2  3  4  8  5  6  9  7 16 10]


In [83]:
# Store categorical variables.

cat_vars = cols

In [84]:
x.shape

(69999, 127)

In [85]:
cols = x.columns
for col in cols:
  print(col)

arpu_6
arpu_7
arpu_8
onnet_mou_6
onnet_mou_7
onnet_mou_8
offnet_mou_6
offnet_mou_7
offnet_mou_8
roam_ic_mou_6
roam_ic_mou_7
roam_ic_mou_8
roam_og_mou_6
roam_og_mou_7
roam_og_mou_8
loc_og_t2t_mou_6
loc_og_t2t_mou_7
loc_og_t2t_mou_8
loc_og_t2m_mou_6
loc_og_t2m_mou_7
loc_og_t2m_mou_8
loc_og_t2f_mou_6
loc_og_t2f_mou_7
loc_og_t2f_mou_8
loc_og_t2c_mou_6
loc_og_t2c_mou_7
loc_og_t2c_mou_8
loc_og_mou_6
loc_og_mou_7
loc_og_mou_8
std_og_t2t_mou_6
std_og_t2t_mou_7
std_og_t2t_mou_8
std_og_t2m_mou_6
std_og_t2m_mou_7
std_og_t2m_mou_8
std_og_t2f_mou_6
std_og_t2f_mou_7
std_og_t2f_mou_8
std_og_mou_6
std_og_mou_7
std_og_mou_8
isd_og_mou_6
isd_og_mou_7
isd_og_mou_8
spl_og_mou_6
spl_og_mou_7
spl_og_mou_8
og_others_6
og_others_7
og_others_8
total_og_mou_6
total_og_mou_7
total_og_mou_8
loc_ic_t2t_mou_6
loc_ic_t2t_mou_7
loc_ic_t2t_mou_8
loc_ic_t2m_mou_6
loc_ic_t2m_mou_7
loc_ic_t2m_mou_8
loc_ic_t2f_mou_6
loc_ic_t2f_mou_7
loc_ic_t2f_mou_8
loc_ic_mou_6
loc_ic_mou_7
loc_ic_mou_8
std_ic_t2t_mou_6
std_ic_t2t_mou_7


### Get object columns and change data types to int/ float

In [86]:
obj = x.columns[x.dtypes=='O']
obj

Index(['date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8'], dtype='object')

In [87]:
for element in obj:
  x[element] = pd.to_datetime(x[element])

In [88]:
for element in obj:
  x[element] = (x[element] - x[element].min()).dt.days

In [None]:
# x[obj]

### Null value imputation

In [89]:
x_copy = x.copy()

In [90]:
noncat = x.drop(cat_vars, axis = 1).columns
noncat

Index(['arpu_6', 'arpu_7', 'arpu_8', 'onnet_mou_6', 'onnet_mou_7',
       'onnet_mou_8', 'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8',
       'roam_ic_mou_6',
       ...
       'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8', 'sachet_3g_6',
       'sachet_3g_7', 'sachet_3g_8', 'aon', 'aug_vbc_3g', 'jul_vbc_3g',
       'jun_vbc_3g'],
      dtype='object', length=121)

In [91]:
n_cat = x[noncat]
med_cols = n_cat[n_cat.columns[abs(n_cat.mean() - n_cat.median())/n_cat.median()>=0.5]].columns

In [92]:
mean_cols = n_cat.drop(med_cols, axis = 1).columns

In [93]:
mean_cols

Index(['arpu_6', 'arpu_7', 'arpu_8', 'total_rech_num_6', 'total_rech_num_7',
       'total_rech_num_8', 'total_rech_amt_6', 'total_rech_amt_7',
       'total_rech_amt_8', 'max_rech_amt_6', 'max_rech_amt_7',
       'max_rech_amt_8', 'date_of_last_rech_6', 'date_of_last_rech_7',
       'date_of_last_rech_8', 'aon'],
      dtype='object')

In [94]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean',verbose=0)
imp_med = SimpleImputer(missing_values = np.nan, strategy = 'median',verbose=0)
imp_mean = imp_mean.fit(x[mean_cols])
imp_med = imp_med.fit(x[med_cols])
x[mean_cols] = imp_mean.transform(x[mean_cols])
x[med_cols] = imp_med.transform(x[med_cols])

In [95]:
x.isna().sum()/x.shape[0]

arpu_6                 0.0
arpu_7                 0.0
arpu_8                 0.0
onnet_mou_6            0.0
onnet_mou_7            0.0
onnet_mou_8            0.0
offnet_mou_6           0.0
offnet_mou_7           0.0
offnet_mou_8           0.0
roam_ic_mou_6          0.0
roam_ic_mou_7          0.0
roam_ic_mou_8          0.0
roam_og_mou_6          0.0
roam_og_mou_7          0.0
roam_og_mou_8          0.0
loc_og_t2t_mou_6       0.0
loc_og_t2t_mou_7       0.0
loc_og_t2t_mou_8       0.0
loc_og_t2m_mou_6       0.0
loc_og_t2m_mou_7       0.0
loc_og_t2m_mou_8       0.0
loc_og_t2f_mou_6       0.0
loc_og_t2f_mou_7       0.0
loc_og_t2f_mou_8       0.0
loc_og_t2c_mou_6       0.0
loc_og_t2c_mou_7       0.0
loc_og_t2c_mou_8       0.0
loc_og_mou_6           0.0
loc_og_mou_7           0.0
loc_og_mou_8           0.0
std_og_t2t_mou_6       0.0
std_og_t2t_mou_7       0.0
std_og_t2t_mou_8       0.0
std_og_t2m_mou_6       0.0
std_og_t2m_mou_7       0.0
std_og_t2m_mou_8       0.0
std_og_t2f_mou_6       0.0
s

In [96]:
print("printing the count of infinity values")
  
count = np.isinf(x).values.sum()
print("It contains " + str(count) + " infinite values")

printing the count of infinity values
It contains 0 infinite values


In [97]:
# Check for null values in categorical variables

x[cat_vars].isna().sum()/x[cat_vars].shape[0]*100

monthly_2g_6    0.0
monthly_2g_7    0.0
monthly_2g_8    0.0
monthly_3g_6    0.0
monthly_3g_7    0.0
monthly_3g_8    0.0
dtype: float64

## Data Preparation

In [98]:
# Get dummies for categorical variables.

x = pd.get_dummies(data=x, columns=cat_vars, drop_first = True)

In [100]:
y = df['churn_probability']

In [101]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state = 100)

In [102]:
X_train.shape

(52499, 167)

In [103]:
y_train.shape

(52499,)

In [104]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [106]:
df_train = pd.DataFrame(X_train, columns = x.columns)

In [109]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
arpu_6,52499.0,-7.884637e-17,1.00001,-7.451738,-0.553811,-0.250198,0.258287,80.498
arpu_7,52499.0,-1.061901e-16,1.00001,-4.431231,-0.540161,-0.244243,0.246276,98.563382
arpu_8,52499.0,-6.256702e-17,1.00001,-1.314632,-0.537791,-0.238919,0.25401,91.722421
onnet_mou_6,52499.0,1.181934e-16,1.00001,-0.441148,-0.413439,-0.323737,-0.051553,24.950301
onnet_mou_7,52499.0,1.80458e-16,1.00001,-0.42771,-0.403602,-0.320814,-0.068244,26.103927
onnet_mou_8,52499.0,1.504804e-16,1.00001,-0.422842,-0.398036,-0.316104,-0.069078,32.753701
offnet_mou_6,52499.0,-5.3985360000000007e-17,1.00001,-0.631174,-0.511589,-0.317738,0.09069,18.55292
offnet_mou_7,52499.0,-6.682612999999999e-19,1.00001,-0.609558,-0.503675,-0.319656,0.078655,20.655811
offnet_mou_8,52499.0,-1.67484e-16,1.00001,-0.604481,-0.495658,-0.31399,0.081654,43.720245
roam_ic_mou_6,52499.0,-3.107007e-15,1.00001,-0.16317,-0.16317,-0.16317,-0.16317,48.695974


In [174]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.6, svd_solver = 'full', random_state = 100)
principalComponents = pca.fit_transform(X_train)
principalDf = pd.DataFrame(data = principalComponents)
                           # columns = ['pca1','pca2'])

In [221]:
pca = PCA(n_components = 2, random_state = 100)
principalComponents = pca.fit_transform(X_train)
principalDf = pd.DataFrame(data = principalComponents)

In [216]:
# principalDf.shape

In [217]:
# principalDf.columns

In [222]:
test_components = pca.transform(X_test)
X_test_pca = pd.DataFrame(data = test_components)

In [223]:
from sklearn.linear_model import LogisticRegression
X_train_pca = principalDf.copy()
logistic=LogisticRegression()
logistic.fit(X=X_train_pca,y=y_train)
logistic.predict(X_test_pca)
score_2=logistic.score(X_test_pca,y_test)
print(score_2)

0.8992571428571429


In [224]:
"{:2.2}".format(metrics.roc_auc_score(y_test, pred_proba[:,1]))

'0.87'

In [213]:
pred_proba = logistic.predict_proba(X_test_pca)

In [172]:
y_pred = np.round(pred_proba[:,1],0)

In [173]:
metrics.accuracy_score(y_test, y_pred)

0.8998285714285714

In [169]:
y_test

3425     1
33067    1
52420    0
5220     0
33958    1
9315     0
9796     0
7084     0
62263    0
47921    0
50222    0
68115    0
42917    0
43618    0
9225     0
6686     0
12763    0
10116    1
28285    0
61494    0
69274    0
36596    0
6972     0
67713    1
39196    0
27336    0
4201     0
62093    0
20338    0
69210    0
45596    0
14688    0
42643    0
49835    1
42132    0
27799    0
21065    0
28875    0
19790    0
34326    0
40412    0
9310     0
47766    1
37325    1
26688    0
61997    0
43702    0
50811    0
32206    1
5223     0
58083    0
12687    0
41168    0
42111    0
67068    0
13493    0
12435    0
53466    0
37447    0
66792    1
39373    0
38392    0
4040     0
39442    0
2028     0
50368    0
62229    1
21325    0
67922    0
41209    0
3244     0
7394     0
64355    1
14495    0
21678    0
53403    0
25607    0
15085    1
31406    0
33573    0
41601    0
20464    1
9910     0
19715    0
16658    1
5372     1
51832    0
66844    0
46170    0
33761    0
59363    0

## Support Vector Machine

In [48]:
import numpy as np
import cvxopt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

In [49]:
x['churn_probability'] = df['churn_probability']

In [51]:
y_train.shape

(48999,)

In [52]:
X_train.shape

(48999, 168)

In [53]:
from sklearn import svm

In [54]:
svm = svm.SVC()
svm.fit(X_train, y_train)

SVC()

In [55]:
y_pred = svm.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[18706,   195],
       [ 1256,   843]])

In [56]:
svc = LinearSVC()
svc.fit(X_train, y_train)

LinearSVC()

In [57]:
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='winter');
ax = plt.gca()
xlim = ax.get_xlim()
w = svc.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(xlim[0], xlim[1])
yy = a * xx - svc.intercept_[0] / w[1]
plt.plot(xx, yy)
yy = a * xx - (svc.intercept_[0] - 1) / w[1]
plt.plot(xx, yy, 'k--')
yy = a * xx - (svc.intercept_[0] + 1) / w[1]
plt.plot(xx, yy, 'k--')

TypeError: ignored

### Logistics Regression
#### Start with RFE

In [60]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(estimator = logreg, n_features_to_select = 20)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False False False False False False False
 False False False False False False False False False  True False False
  True  True False False False False False False False False False False
  True  True  True False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True False False False False False False False False
 False False  True  True False  True  True False  True False False False
  True False  True False False  True  True False  True False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

### Random Forest

In [61]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))