In [1]:
# install pycaret
!pip install pycaret

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [1]:
# Load the Telco customer-churn CSV into a DataFrame and preview the first rows
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_path)
data.head()

# Exploratory Data Analysis

In [1]:
# Show the dtype pandas inferred for every column; the cells below treat
# 'TotalCharges' as text that still needs converting to a number.
data.dtypes

In [1]:
# Turn single-space placeholders in TotalCharges into real missing values,
# then count the missing entries per column.
blank_to_missing = {' ': np.nan}
data['TotalCharges'] = data['TotalCharges'].replace(blank_to_missing)
data.isna().sum()

In [1]:
# Cast TotalCharges to 64-bit floats and print the frame summary to confirm
total_charges = data['TotalCharges']
data['TotalCharges'] = total_charges.astype(np.float64)
data.info()

In [1]:
!pip install pandas-profiling

In [1]:
# Build the pandas-profiling report for the whole frame and render it inline
from pandas_profiling import ProfileReport

report_title = "EDA Report"
profile = ProfileReport(data, title=report_title)
profile

# Missing Values

In [1]:
# Number of missing entries in each column
missing_mask = data.isna()
missing_mask.sum()

In [1]:
# Share of each Churn class as a percentage of all rows, to two decimals
churn_counts = data['Churn'].value_counts()
(churn_counts * 100 / len(data)).round(2)

In [1]:
# Collect the names of all object-dtype columns, in column order
categorical = [column for column in data.columns if data[column].dtype == 'object']

# Report how many were found, then list them one per line
print("Categorical Attribute : {}\n ".format(len(categorical)))
for column in categorical:
    print(column)

In [1]:
# Count the distinct values in each categorical column
data.loc[:, categorical].nunique()

In [1]:
# Print each categorical column's name and distinct values, skipping the
# first entry of the list (presumably customerID - verify against the
# listing printed above).
for column in categorical[1:]:
    print(column, data[column].unique(), "\n", sep="\n")

# Data Preparation

In [1]:
# Initialise the PyCaret classification experiment.
# The star import is what brings setup/create_model/tune_model/... into the
# notebook namespace for the cells below, so it is kept as-is.
from pycaret.classification import *

telecom = setup(
    data,
    target='Churn',
    ignore_features=['customerID'],  # identifier column, not used as a feature
    ordinal_features={'Contract': ['Month-to-month', 'One year', 'Two year']},  # levels listed low -> high
    session_id=42,  # fixed seed so the split, resampling and CV folds repeat across runs
    fix_imbalance=True,  # resample the training data to balance the Churn classes
    transformation=True,  # reshape feature distributions so they are closer to normal
    normalize=True,  # rescale the values of numeric columns
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',  # unseen levels are replaced by the most frequent level
    remove_multicollinearity=True,  # drop one of two features that are highly correlated with each other
    ignore_low_variance=True,  # remove categorical features with statistically insignificant variance
    combine_rare_levels=True,  # merge categorical levels below rare_level_threshold into a single level
    numeric_imputation='median',
    categorical_imputation='mode',
)

In [1]:
# Train the model registered under PyCaret's 'ada' ID (AdaBoost classifier)
# using the experiment configured by setup() above.
ada= create_model('ada')

In [1]:
# Print the fitted estimator so its hyperparameters are visible
print(ada)

In [1]:
# Search for better hyperparameters for the AdaBoost model
tuned_ada = tune_model(ada,optimize = 'F1') # candidates are ranked by F1

In [1]:
# Open PyCaret's evaluation view for the tuned model
evaluate_model(tuned_ada) # graphical diagnostic plots

In [1]:
# With no data argument, predict_model scores the model on the hold-out split
# that setup() set aside.
predict_model(tuned_ada)# hold-out (test) evaluation

In [1]:
# Refit the tuned model with finalize_model (per the PyCaret docs this trains
# on the full dataset, hold-out rows included) and display the result.
final_ada = finalize_model(tuned_ada) # final model
final_ada

In [1]:
# NOTE(review): final_ada comes from finalize_model, which per the PyCaret
# docs refits on the hold-out rows as well; these scores are therefore
# computed on data the model has seen and are not an unbiased test estimate.
# Confirm whether this cell is intended as a sanity check only.
predict_model(final_ada)# predictions from the finalized model on the hold-out split