In [1]:
# Importing all usefull libraries

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Loding dataset
telecom = pd.read_csv('Churn_Model_Building_Data.csv')

telecom.head()

Unnamed: 0,account_length,voice_messages,intl_plan,intl_mins,intl_calls,day_mins,eve_charge,night_mins,customer_calls,churn
0,128,25.0,0,10.0,3.0,265.1,16.78,244.7,1.0,0
1,107,26.0,0,13.7,3.0,161.6,16.62,254.4,1.0,0
2,137,0.0,0,12.2,5.0,243.4,10.3,162.6,0.0,0
3,84,0.0,1,6.6,7.0,299.4,5.5,196.9,2.0,0
4,75,0.0,1,10.1,3.0,166.7,12.61,186.9,3.0,0


In [3]:
# Defining independent (x) & dependent (y) variables
x = telecom.iloc[:,:-1]
y = telecom['churn']

In [4]:
# Target column value count
y.value_counts()

0    4293
1     707
Name: churn, dtype: int64

## Balncing the data : 

In [5]:
# Creating a RandomOverSampler object
ros = RandomOverSampler()

# Fitting the RandomOverSampler to the independent & dependent features & perform the oversampling
x, y = ros.fit_resample(x, y)

# Verify that the dataset is balanced
print(y.value_counts())

0    4293
1    4293
Name: churn, dtype: int64


In [6]:
# Splitting data into train & test sets
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=1)

In [7]:
# Creating transformers for pipeline
transformer = ColumnTransformer(transformers=[
    ('Impute', SimpleImputer(strategy='mean'), [1, 7]),
    ('Encode', OneHotEncoder(sparse=False, handle_unknown='ignore'), [3, 4]),
    ('Scale', MinMaxScaler(), slice(0, 9))
], remainder='passthrough')

In [8]:
# Creating pipeline
model = Pipeline(steps=[
    ('transformer', transformer),
    ('model', RandomForestClassifier())
])

In [9]:
# Fitting pipeline on training data
model.fit(xtrain,ytrain)

In [10]:
# Prediction by using pipeline
ypred = model.predict(xtest)

In [11]:
# Print classifiation report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1068
           1       0.99      1.00      0.99      1079

    accuracy                           0.99      2147
   macro avg       0.99      0.99      0.99      2147
weighted avg       0.99      0.99      0.99      2147



In [15]:
# Accuracy
accuracy_score(ytest,ypred)

0.9916162086632511

In [12]:
# Cross validation score
cross_val_score(model,xtrain,ytrain,cv=5,scoring='accuracy').mean()

0.9849352338482774

In [16]:
model.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('Impute', SimpleImputer(),
                                                  [1, 7]),
                                                 ('Encode',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [3, 4]),
                                                 ('Scale', MinMaxScaler(),
                                                  slice(0, 9, None))])),
                ('model', RandomForestClassifier())])>

# Save the final model :

In [13]:
import pickle

In [14]:
pickle.dump(model, open('Churn_Pipeline.pkl', 'wb'))