In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
%matplotlib inline

## EDA

In [None]:
## Load the telecom churn CSV from the Kaggle input directory
churn_data = pd.read_csv('../input/churn-in-telecoms-dataset/bigml_59c28831336c6604c800002a.csv')

## (rows, columns) of the frame
dataset_shape = churn_data.shape
print(f"{'shape of the dataset'.title()} :- {dataset_shape}")

## first five rows
sample_rows = churn_data.head()
print(f"{'sample dataset'.title()} :- \n {sample_rows}")

## per-column count of missing entries
null_counts = churn_data.isna().sum()
print(f"\n {'Number of null values in every column'.title()} \n {null_counts}")

## number of rows that exactly repeat an earlier row
duplicate_count = churn_data.duplicated().sum()
print(f"\n {'number of duplicate values'.title()} :- {duplicate_count}")

## how often each target value occurs
target_counts = churn_data['churn'].value_counts()
print(f"\n {'count of each value of target column'.title()} \n {target_counts}")

## column dtypes and non-null counts (info() writes straight to stdout)
print(f"{'dataset info'.title()} \n ")
churn_data.info()

## Data Visualization

In [None]:
## Bar chart of how many rows fall into each value of the target column.
## An explicit Axes is used so the title and labels are attached to this
## figure rather than to whatever pyplot considers "current".
fig, ax = plt.subplots()
sns.countplot(x='churn', data=churn_data, ax=ax)
ax.set(title='Count Of Each Churn Value', xlabel='churn', ylabel='count')
plt.show()

## Preprocessing

In [None]:
def col_unique_values(col_name, data=None):
  """Print the distinct values of one column and how many there are.

  Parameters
  ----------
  col_name : label of the column to inspect.
  data : optional DataFrame to read the column from. When omitted, the
    notebook-level ``churn_data`` frame is used, which is what existing
    single-argument callers rely on.

  Returns
  -------
  None. The report is written to stdout.
  """
  ## fall back to the notebook-wide frame only when no frame was handed in
  if data is None:
    data = churn_data
  column = data[col_name]
  print(f"****************** Col Name : {col_name} ****************")
  print(f"Unique Values :- \n {column.unique()}")
  print(f"Number of Unique values :- {column.nunique()}\n\n")


## all columns
total_col_names = churn_data.columns
## numeric columns (int, float and bool), selected through the public dtype API
num_cols = churn_data.select_dtypes(include=['number', 'bool']).columns
## remaining (category) columns, kept in dataframe order so every run
## processes and prints them in the same sequence
cat_col_names = [col for col in total_col_names if col not in num_cols]


for col_name in cat_col_names:
  ## check unique values of every category column
  col_unique_values(col_name)


## phone number is not an important feature, so we can remove it.
## errors='ignore' and the filtering comprehension keep this step safe to
## re-run after the column has already been dropped.
churn_data = churn_data.drop(columns=['phone number'], errors='ignore')
cat_col_names = [col for col in cat_col_names if col != 'phone number']

## Apply label encoding operation on category columns
def label_encoding(col_name):
  """Replace one column of the notebook-level ``churn_data`` frame with integer codes.

  A fresh ``LabelEncoder`` is fitted on the column's current values and the
  column is overwritten in place with the encoded result. Nothing is returned
  and the fitted encoder is not kept.
  """
  encoder = LabelEncoder()
  encoded_values = encoder.fit_transform(churn_data[col_name])
  churn_data[col_name] = encoded_values


## encode every remaining category column in turn
for categorical_col in cat_col_names:
  label_encoding(categorical_col)

## preview of the frame once the category columns hold integer codes
churn_data.head()

In [None]:
## separate dependent and independent variables
## (X stays unscaled; only the train / test frames below are scaled)
X = churn_data.drop(['churn'], axis=1)
y = churn_data['churn']

column_names = list(X.columns)

## label encoding on target variable
le = LabelEncoder()
y = le.fit_transform(y)

## split BEFORE scaling: fitting the scaler on the full dataset would let the
## mean / std of the test rows influence the training features (data leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## create pipeline to apply feature scaling
pipeline = Pipeline([
                     ('std_scaler', StandardScaler())
])

## learn the scaling statistics from the training rows only ...
X_train = pd.DataFrame(data=pipeline.fit_transform(X_train), columns=column_names, index=X_train.index)
## ... and reuse those same statistics to transform the held-out rows
X_test = pd.DataFrame(data=pipeline.transform(X_test), columns=column_names, index=X_test.index)

print(f"Size Of The Train Dataset :- {len(X_train)}")
print(f"Size Of The Test Dataset :- {len(X_test)}")

## Model Building & Evaluation

In [None]:
## building different models
def model_building(model_name):
  """Fit one estimator on the training split and print its test-set results.

  Despite its name, ``model_name`` is the estimator object itself: ``fit``,
  ``score`` and ``predict`` are called on it directly. The notebook-level
  ``X_train`` / ``y_train`` are used for fitting and ``X_test`` / ``y_test``
  for evaluation. The function prints the estimator's repr, its ``score`` on
  the test split and a classification report; it returns nothing.
  """
  estimator = model_name
  estimator.fit(X_train, y_train)

  ## gather every number first, then emit the report in one go
  test_score = estimator.score(X_test, y_test)
  test_predictions = estimator.predict(X_test)
  report = classification_report(y_test, test_predictions)

  print(f"******** Model :- {model_name} ********\n\n")
  print(f"******** Score :- {test_score} ***********")
  print(f"******** Classification Report ************************\n\n")
  print(report)


## dictionary with different models
## every estimator that draws random numbers is seeded, so a fresh
## "Restart & Run All" reproduces the same scores
model_dict = {'dt':DecisionTreeClassifier(criterion='entropy', random_state=42), 
              'knn':KNeighborsClassifier(n_neighbors=17), 
              'rf': RandomForestClassifier(random_state=42), 
              'xgb':xgb.XGBClassifier(random_state=42, learning_rate=0.4)}


## fit and evaluate each configured model, in dictionary order
for candidate_model in model_dict.values():
  model_building(candidate_model)