In [16]:
import os
import numpy as np
import pandas as pd

import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN 
from imblearn.over_sampling import RandomOverSampler

import pickle
import sqlalchemy as sa
import warnings
warnings.filterwarnings('ignore')

In [26]:
class Telecom_customer():
    
# Data Gathering & Data Analysis
    def get_data(self):
        
        Engine=sa.create_engine("mysql+pymysql://root:Pratiksha2501@localhost:3306/project")
        self.churn_df = pd.read_sql_table('telco_customer_churn',Engine)
        
# Feature Selection
    def feature_selection(self):

        # Total charges are in object dtype so convert into Numerical feature 
        self.churn_df['Total_Charges'] = pd.to_numeric(self.churn_df['Total_Charges'], errors='coerce')
        
        # replace NaN values with mean value
        self.churn_df.Total_Charges = self.churn_df.Total_Charges.fillna(self.churn_df.Total_Charges.median())
        
        features = ['Gender','Senior_Citizen','Partner','Dependents','Tenure_Months','Phone_Service','Multiple_Lines','Internet_Service',
                 'Online_Security','Online_Backup','Device_Protection','Tech_Support','Streaming_TV','Streaming_Movies','Contract',
                 'Paperless_Billing','Payment_Method','Monthly_Charges','Total_Charges']
        
        # Categorical feature
        self.categorical_feature = {feature for feature in  self.churn_df.columns if  self.churn_df[feature].dtypes == 'O'}
        
        
        # Label Encoding
        encoder = LabelEncoder()
        for feature in self.categorical_feature:
             self.churn_df[feature] = encoder.fit_transform( self.churn_df[feature])
            
        self.churn_df.drop(['CustomerID','Count', 'Country', 'State', 'City','Zip_Code','Lat_Long','Longitude',
                       'Latitude','Churthn_Label','Churn_Score','CLTV','Churn_Reason'],axis=1,inplace=True)

    def preprocess_data(self):
        """preprocess data and return model trainable data"""
        self.X = self.churn_df.drop(['Churn_Value'],axis=1)
        self.Y = self.churn_df['Churn_Value']
        
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X,self.Y,test_size=0.2, 
                                                                                random_state=2,stratify=self.Y)
    
    def train_model(self):
        """trains model on data and return model object"""
        #Model Training/Building
        random_os = RandomOverSampler(sampling_strategy=0.7)
        self.x_train_ros, self.y_train_ros = random_os.fit_resample(self.x_train,self.y_train)

    def get_model_results(self):
        """returns performance of model on train and test data"""
        # Using SMOTENN Technique
        st=SMOTEENN()
        self.x_train_st, self.y_train_st = st.fit_resample(self.x_train, self.y_train)
        print("The number of classes before fit {}".format(Counter(self.y_train)))
        print("The number of classes after fit {}".format(Counter(self.y_train_st)))
        
        #splitting the over sampling dataset 
        self.x_train_sap, self.x_test_sap, self.y_train_sap, self.y_test_sap = train_test_split(self.x_train_st, self.y_train_st, test_size=0.2)
        
        # Random forest classifier
        self.Rfc_sampling = RandomForestClassifier(n_estimators=150,criterion='gini', max_depth=15, min_samples_leaf=10, min_samples_split=6)
        self.Rfc_sampling.fit(self.x_train_sap, self.y_train_sap)
        


    def test_evaluation(self,string):
        # Model Evaluation Testing Data
        print(string.center(50,'*'))
    
        self.test_pred = self.Rfc_sampling.predict(self.x_test_sap)
        cnf_matrix = confusion_matrix(self.y_test_sap, self.test_pred)
        print('Confusion Matrix :\n', cnf_matrix)
    
        accuracy = accuracy_score(self.y_test_sap, self.test_pred)
        print('Accuracy :', accuracy)
    
        clf_report = classification_report(self.y_test_sap, self.test_pred )
        print('Classification Report :\n', clf_report)
    
    def train_evaluation(self,string):
        # Model Evaluatioon Training Data
        print(string.center(50,'*'))
    
        self.train_pred = self.Rfc_sampling.predict(self.x_train_sap)
        cnf_matrix = confusion_matrix(self.y_train_sap, self.train_pred)
        print('Confusion Matrix :\n', cnf_matrix)
    
        accuracy = accuracy_score(self.y_train_sap, self.train_pred)
        print('Accuracy :', accuracy)
    
        clf_report = classification_report(self.y_train_sap, self.train_pred)
        print('Classification Report :\n', clf_report)
        
        return self.churn_df

    def load_file(self):
        with open('RandomForest_model.pkl', 'wb') as f:
            pickle.dump(self.Rfc_sampling, f)
            
        # Load the Model back from file
        with open('RandomForest_model.pkl', 'rb') as file:  
            self.load_model = pickle.load(file)
            
            
    def new_data(self):
        Gender = 'Female'
        Senior_Citizen = 'No'
        Partner = 'Yes'
        Dependents = 'No'
        Tenure_Months = 72
        Phone_Service = 'Yes'
        Multiple_Lines = 'No'
        Internet_Service = 'No'
        Online_Security = 'No internet service'
        Online_Backup = 'No internet service'
        Device_Protection = 'No internet service'
        Tech_Support = 'No internet service'
        Streaming_TV = 'No internet service'
        Streaming_Movies = 'No internet service'
        Contract = 'Two year'
        Paperless_Billing = 'Yes'
        Payment_Method = 'Bank transfer (automatic)'
        Monthly_Charges = 21.15
        Total_Charges = 1419.4
        
        data = [[Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,Multiple_Lines,Internet_Service,
                 Online_Security,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,
                 Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges]]

        df = pd.DataFrame(data, columns=['Gender','Senior_Citizen','Partner','Dependents','Tenure_Months','Phone_Service',
                                         'Multiple_Lines','Internet_Service','Online_Security','Online_Backup',
                                         'Device_Protection','Tech_Support','Streaming_TV','Streaming_Movies','Contract',
                                         'Paperless_Billing','Payment_Method','Monthly_Charges','Total_Charges'])
        
#         print(self.load_model.score(self.x_test_sap, self.y_test_sap))

        for feature in df.columns:
            if df[feature].dtypes == 'object':
                categorical_feature = feature
#                 print(categorical_feature)
    
        encoder = LabelEncoder()
        for feature in df.columns:
            if df[feature].dtypes == 'object':
                df[feature] = encoder.fit_transform(df[feature])
        
        single = self.load_model.predict(df)
        probability = self.load_model.predict_proba(df)[:,1]
        
        print('Prediccted Class=',single)
        print('Probablity=',probability)
        
#         self.churn_df.to_csv('Telecom_customer_churn.csv')
#         return df,self.churn_df
            
        
obj=Telecom_customer()
obj.get_data()
obj.feature_selection()
obj.preprocess_data()
obj.train_model()
obj.get_model_results()
print()
obj.test_evaluation('Test Data Evaluation')
print('*#'*30)
print()
obj.train_evaluation('Train Data Evaluation')
obj.load_file()
obj.new_data()

The number of classes before fit Counter({0: 4130, 1: 1495})
The number of classes after fit Counter({1: 2497, 0: 2055})

***************Test Data Evaluation***************
Confusion Matrix :
 [[394  26]
 [ 29 462]]
Accuracy : 0.9396267837541163
Classification Report :
               precision    recall  f1-score   support

           0       0.93      0.94      0.93       420
           1       0.95      0.94      0.94       491

    accuracy                           0.94       911
   macro avg       0.94      0.94      0.94       911
weighted avg       0.94      0.94      0.94       911

*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#

**************Train Data Evaluation***************
Confusion Matrix :
 [[1534  101]
 [  66 1940]]
Accuracy : 0.9541334798132382
Classification Report :
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      1635
           1       0.95      0.97      0.96      2006

    accuracy           