In [1]:
# import the modules
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


### Load the dataset

- Load the training data and, using everything you have learned, explore the different statistical properties of the dataset.

In [2]:
# Code starts here
# Read the training CSV and immediately discard the two identifier columns
# ('customerID' and 'Id') so only the remaining columns are kept in `train`.
train = pd.read_csv(
    r"/Users/rahulkosamkar/Documents/Data_Science/Projects/telecom_gradient_boosting/train.csv"
).drop(columns=['customerID', 'Id'])

# Preview the first rows of the remaining columns
print(train.head())

# Code ends here

   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0      No         No      27          Yes   
1    Male              0     Yes        Yes       1          Yes   
2  Female              0      No         No      17          Yes   
3  Female              1      No         No      42          Yes   
4    Male              0     Yes         No      23          Yes   

  MultipleLines InternetService       OnlineSecurity         OnlineBackup  \
0            No              No  No internet service  No internet service   
1            No              No  No internet service  No internet service   
2           Yes     Fiber optic                   No                   No   
3           Yes     Fiber optic                   No                   No   
4            No             DSL                  Yes                   No   

      DeviceProtection          TechSupport          StreamingTV  \
0  No internet service  No internet service  No internet ser

### Visualize the data

- Replace the missing values and modify some column values (as required).
- Check out the best plots for plotting between categorical target and continuous features and try making some inferences from these plots.
- Clean the data, apply some data preprocessing and engineering techniques.

In [3]:
# Code starts here

# Split the data into features (X) and target (y).
# y is selected as a 1-D Series rather than a one-column DataFrame, because
# scikit-learn classifiers expect a 1-D label vector and warn otherwise.
X = train.drop(columns=['Churn'])
y = train['Churn']

# Blank TotalCharges entries are stored as a single space: turn them into NaN
# and convert the column to float. The result is assigned back instead of
# calling replace(..., inplace=True) on the selected column, which operates on
# a temporary object under pandas Copy-on-Write and would leave X unchanged.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X['TotalCharges'] = X['TotalCharges'].replace(' ', np.nan).astype(float)

# Fill the missing values with the column mean (again assigned back, not inplace)
X['TotalCharges'] = X['TotalCharges'].fillna(X['TotalCharges'].mean())

# Check that no missing values remain in any column
print(X.isnull().sum())

# Remaining object-dtype columns are treated as categorical
cat_cols = X.select_dtypes(include='O').columns.tolist()

# Label-encode every categorical column, keeping each fitted encoder so the
# identical category-to-integer mapping can be reapplied to other data.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder

# Encode the target: 'No' -> 0, 'Yes' -> 1.
# NOTE: any label other than 'No'/'Yes' becomes NaN with map().
y = y.map({'No': 0, 'Yes': 1})


gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64


In [4]:
# Preview the first five rows of the preprocessed feature frame
X.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,0,0,27,1,0,2,1,1,1,1,1,1,2,1,0,20.25,538.2
1,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,0,2,19.75,19.75
2,0,0,0,0,17,1,2,1,0,0,2,0,0,0,0,0,0,80.05,1345.65
3,0,1,0,0,42,1,2,1,0,0,0,0,2,0,0,1,2,84.65,3541.35
4,1,0,1,0,23,1,0,0,2,0,0,0,0,2,0,1,0,59.95,1406.0


### Model building

- Try to predict the churning of customers using AdaBoost
- Try and implement XGBoost for our customer churn problem and see how it performs in comparison to AdaBoost. Use the different techniques you have learned to improve the performance of the model.
- Try improving upon the `accuracy_score` ([Accuracy Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))

In [5]:
# Code Starts here

# Split the data into train and test: 30% of the rows are held out for
# evaluation, and random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Initialising AdaBoostClassifier model
ada_model = AdaBoostClassifier(random_state=0)

# Fitting the model on train data.
# The labels are flattened to 1-D: scikit-learn expects a 1-D target and
# emits a DataConversionWarning when handed a one-column DataFrame.
ada_model.fit(X_train, np.ravel(y_train))

# Making prediction on the held-out data
y_pred = ada_model.predict(X_test)

# Finding the accuracy score
ada_score = accuracy_score(y_test, y_pred)
print("Accuracy: ", ada_score)

# Finding the confusion matrix
ada_cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix: \n', ada_cm)

# Finding the classification report
ada_cr = classification_report(y_test, y_pred)
print('Classification report: \n', ada_cr)

# Code ends here

Accuracy:  0.7906564163217031
Confusion matrix: 
 [[1086  158]
 [ 196  251]]
Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      1244
           1       0.61      0.56      0.59       447

    accuracy                           0.79      1691
   macro avg       0.73      0.72      0.72      1691
weighted avg       0.79      0.79      0.79      1691



In [6]:
# Train a baseline XGBoost classifier, then run a grid search over
# learning_rate and max_depth to see whether tuning improves its accuracy.

#Parameter list
# max_depth candidates are 1 and 2 (range excludes its upper bound).
parameters={'learning_rate':[0.1,0.15,0.2,0.25,0.3],
            'max_depth':range(1,3)}

# Code starts here

def _report_metrics(y_true, y_hat):
    """Print and return the evaluation metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report)`` in that order.
    """
    score = accuracy_score(y_true, y_hat)
    print("Accuracy: ", score)

    cm = confusion_matrix(y_true, y_hat)
    print('Confusion matrix: \n', cm)

    cr = classification_report(y_true, y_hat)
    print('Classification report: \n', cr)

    return score, cm, cr


#Initializing the model
xgb_model = XGBClassifier(random_state=0)

#Fitting the model on train data (labels flattened to the 1-D shape
#scikit-learn style estimators expect)
xgb_model.fit(X_train, np.ravel(y_train))

#Making prediction on test data
y_pred = xgb_model.predict(X_test)

#Accuracy, confusion matrix and classification report of the baseline model
xgb_score, xgb_cm, xgb_cr = _report_metrics(y_test, y_pred)


### GridSearch CV

#Initialsing Grid Search (GridSearchCV's default cross-validation and scoring
#are used since neither is specified)
clf = GridSearchCV(xgb_model, parameters)

#Fitting the model on train data
clf.fit(X_train, np.ravel(y_train))

#Making prediction on test data with the best estimator found
y_pred = clf.predict(X_test)

#Accuracy, confusion matrix and classification report of the tuned model
clf_score, clf_cm, clf_cr = _report_metrics(y_test, y_pred)


Accuracy:  0.7841513897102307
Confusion matrix: 
 [[1086  158]
 [ 207  240]]
Classification report: 
               precision    recall  f1-score   support

           0       0.84      0.87      0.86      1244
           1       0.60      0.54      0.57       447

    accuracy                           0.78      1691
   macro avg       0.72      0.70      0.71      1691
weighted avg       0.78      0.78      0.78      1691



Accuracy:  0.7942046126552336
Confusion matrix: 
 [[1091  153]
 [ 195  252]]
Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      1244
           1       0.62      0.56      0.59       447

    accuracy                           0.79      1691
   macro avg       0.74      0.72      0.73      1691
weighted avg       0.79      0.79      0.79      1691



### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [7]:
# Code Starts here
# Prediction on test data

# Read the test data
test = pd.read_csv(r'/Users/rahulkosamkar/Documents/Data_Science/Projects/telecom_gradient_boosting/test.csv')

# Storing the id from the test file (needed for the submission frame)
id_ = test['Id']

# Apply the same transformations that were applied to the training data
test = test.drop(columns=['customerID', 'Id'])

# Blank TotalCharges entries -> NaN -> float. The result is assigned back
# rather than using inplace=True on the selected column, which acts on a
# temporary under pandas Copy-on-Write; np.nan replaces the np.NaN alias that
# was removed in NumPy 2.0.
test['TotalCharges'] = test['TotalCharges'].replace(' ', np.nan).astype(float)

# Impute missing TotalCharges with the training-set mean, mirroring the
# imputation applied to the training features.
test['TotalCharges'] = test['TotalCharges'].fillna(X['TotalCharges'].mean())

# Label encoding test data.
# Each encoder is fitted on the *training* column (still stored as raw strings
# in `train`) so the category -> integer mapping is identical to the one the
# model was trained on. Fitting on the test data could yield a different
# mapping whenever a category is absent from it. transform() raises a
# ValueError if the test data contains a category never seen in training.
for col in cat_cols:
    encoder = LabelEncoder().fit(train[col])
    test[col] = encoder.transform(test[col])

# Present the features in the same column order as the training features
test = test[X.columns]

# Predict on the test data
y_pred_test = clf.predict(test)
y_pred_test = y_pred_test.flatten()

# Create a sample submission file
sample_submission = pd.DataFrame({'Id':id_,'Churn':y_pred_test})
print(sample_submission.head())

# Replacing the values of sample_submission.
# Only the 'Churn' column is mapped: a frame-wide replace({1: 'Yes', 0: 'No'})
# would also rewrite any Id equal to 0 or 1.
sample_submission['Churn'] = sample_submission['Churn'].map({1: 'Yes', 0: 'No'})

# Convert the sample submission file into a csv file
# sample_submission.to_csv('sample_submission_test.csv',index=False)

# Code ends here

     Id  Churn
0  4539      0
1  1802      0
2  1380      0
3  5305      0
4  1960      0
