#### Let's import the required libraries for the case study    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score

import warnings
# Silence every warning so the notebook output stays compact.
# NOTE(review): a blanket "ignore" also hides pandas/scikit-learn warnings that
# may point at real problems — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
# Load the training set from train.csv (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv("train.csv")
data.head()

###  Data Understanding, Preparation, and Pre-Processing 

In [None]:
# We have around 172 columns. Print the full per-column summary (name and dtype)
# to find out whether any of them are categorical in nature.
data.info(verbose=True)

##### We see that a total of 9 columns are of object datatype. Let's look at the values in these columns and figure out whether they carry any business value.

In [None]:
# Names of all object-dtype columns, as a plain Python list.
object_cols = data.select_dtypes(include=["object"]).columns.tolist()
object_cols

In [None]:
# Display the object-dtype columns to inspect the values they hold.
data[object_cols]

###### We see that these columns are of date datatype and they simply describe the last date of the month. 
###### From a business point of view, the last date of the month will not have any impact on the customer's behaviour, as it is static data. Hence we can exclude these columns from further analysis.

In [None]:
# Sanity check: show the Python type of object_cols.
type(object_cols)

In [None]:
# Remove the object-dtype columns identified earlier.
data = data.drop(object_cols, axis=1)
# Verify the result: this selection should now contain no columns.
data.select_dtypes(include=["object"])

##### As we can see from above, no column is left with object data type. Once we have all the data in numeric data type let's move on to checking if there are any null values present in the dataset

In [None]:
# Print every column together with its non-null count to spot missing values.
data.info(verbose=True, show_counts=True )

In [None]:
# Percentage of missing values in each column: null count * 100 / row count.
missing_data_percent = data.isna().sum().mul(100).div(len(data))
missing_data_percent

In [None]:
# Some columns have around 74% missing values. Such columns would not help in
# model building or prediction, so keep only the columns whose missing
# percentage is at most 40.
new_vars = missing_data_percent.loc[missing_data_percent <= 40].index
new_vars

In [None]:
# Restrict the dataset to the columns that passed the missing-value threshold.
data = data.loc[:, new_vars]
data

In [None]:
# We have around 136 columns now. Summarise the per-column missing percentage
# to check whether any remaining column still has a high share of missing values.
data.isna().sum().mul(100).div(len(data)).describe()

In [None]:
# The remaining missing percentage is very low (about 5 percent), so impute
# zero for these missing records.
data = data.fillna(0)

In [None]:
# Re-run the missing-percentage summary to confirm nothing is left to impute
# (the mean and max are expected to be 0 after fillna).
(100*data.isnull().sum()/len(data)).describe()

### Exploratory Data Analysis

##### Let's check for outliers

In [None]:
def show_box_plot_in_batches(dataarray, batch_size=16):
    """Draw box plots for every column of ``dataarray``, one figure per batch.

    The columns are split into consecutive, non-overlapping groups of at most
    ``batch_size`` columns and each group is drawn in its own figure. The last
    group may be smaller, so no trailing column is left out and frames with
    fewer than ``batch_size`` columns still produce one figure.

    Args:
        dataarray: DataFrame whose columns are plotted.
        batch_size: Maximum number of columns per figure.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_columns = len(dataarray.columns)
    for start in range(0, total_columns, batch_size):
        plt.figure(figsize=(15, 8))
        plt.xticks(rotation=45)
        # Positional slicing clamps at the last column, so the final batch
        # simply contains whatever columns remain.
        sns.boxplot(data=dataarray.iloc[:, start:start + batch_size])

In [None]:
# Visualizing with the help of box plots
# Since there are around 136 columns, showing box plots for all of them on one X axis is not practical.
# Let's split the columns into batches and draw one box-plot figure per batch

show_box_plot_in_batches(data)

In [None]:
# Summary statistics including the upper percentiles (90/95/99), which help to
# judge how far the extreme values sit from the bulk of each column.
data.describe(percentiles=[.25,.5,.75,.90,.95,.99], include="all")

In [None]:
def cap_outliers(array, k=3):
    """Cap values lying more than ``k`` standard deviations from the mean.

    Args:
        array: Numeric pandas Series or NumPy array. It is left unmodified.
        k: Number of standard deviations on each side of the mean that
            defines the lower and upper limits.

    Returns:
        A copy of ``array`` in which values below ``mean - k*std`` are raised
        to that limit and values above ``mean + k*std`` are lowered to it.
    """
    center = array.mean()
    spread = k * array.std()
    lower_limit = center - spread
    upper_limit = center + spread
    # Work on a copy: assigning through the boolean masks directly on the
    # argument would silently alter the caller's data (for example the
    # original frame when this is used with DataFrame.apply).
    capped = array.copy()
    capped[capped < lower_limit] = lower_limit
    capped[capped > upper_limit] = upper_limit
    return capped

In [None]:
# Cap outliers column by column, but only on the predictor columns.
# The identifiers and the binary target must keep their original values:
# capping churn_probability at mean + 3*std can turn the label 1 into a
# fraction when churners are rare, which breaks the classification setup.
non_feature_cols = ["id", "circle_id", "churn_probability"]
feature_cols = [col for col in data.columns if col not in non_feature_cols]
data_new = data.copy()
data_new[feature_cols] = data[feature_cols].apply(cap_outliers, axis=0)

In [None]:
# Draw the box plots again on the capped data to compare against the raw data.
show_box_plot_in_batches(data_new)

In [None]:
# Separate predictors from the target: discard the identifier columns, pull
# the churn label out as y, and use the remaining columns as the feature frame X.
data_new = data_new.drop(["id", "circle_id"], axis=1)
y = data_new.pop("churn_probability")
X = data_new

In [None]:
# Hold out 30% of the rows for testing.
# random_state makes the split (and every metric computed below) reproducible
# between runs; stratify=y keeps the churn / non-churn ratio identical in the
# train and test parts, which matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

##### Let's print correlations for each feature using heatmap

In [None]:
# Plot one correlation heatmap per batch of consecutive features.
# Stepping from 0 in strides of batch_size covers every column; the last
# batch simply holds whatever columns remain, so no feature is skipped.
batch_size = 16
for start in range(0, len(X_train.columns), batch_size):
    plt.figure(figsize=(10,8))
    sns.heatmap(X_train.iloc[:, start:start + batch_size].corr())

### Feature Engineering and Variable Transformation 

##### Feature scaling

In [None]:
# Display the training features before scaling.
X_train

In [None]:
# Fit the scaler on the training features, then replace X_train with its
# standardised version. Only the training split is transformed in this cell.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Display the training features after scaling.
X_train

##### Applying PCA on data

In [None]:
# Fit a PCA without limiting the number of components, so the explained
# variance of every component can be inspected below.
pca = PCA(random_state=42)
pca.fit(X_train)

In [None]:
# Show the fitted principal axes (one row per component).
pca.components_

In [None]:
# Share of the total variance explained by each individual component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance ratios across the components.
var_cumu = pca.explained_variance_ratio_.cumsum()

##### Making a scree plot to know how many variables explain the maximum variance

In [None]:
# Scree plot: cumulative explained variance against the number of components.
fig = plt.figure(figsize=[12,8])
# var_cumu[0] belongs to ONE component, so plot against a 1-based axis;
# otherwise the marker at x=70 would actually sit on 71 components.
component_counts = np.arange(1, len(var_cumu) + 1)
plt.plot(component_counts, var_cumu)
plt.vlines(x=70, ymax=1, ymin=0, colors="r", linestyles="--")
# Let the 95% guide line span the whole axis instead of a fixed 100.
plt.hlines(y=0.95, xmax=len(var_cumu), xmin=0, colors="g", linestyles="--")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative variance explained")
plt.show()

##### As we can see clearly, around 95% of the variance is explained by the first 70 principal components. Hence we will keep 70 components when building the PCA model

In [None]:
# Refit PCA keeping 70 components and project the scaled training data onto them.
pc2 = PCA(n_components=70, random_state=42)
transformed_data = pc2.fit_transform(X_train)

In [None]:
# Check the dimensions of the projected training data.
transformed_data.shape

In [None]:
# Wrap the projected training data in a DataFrame (default integer column
# labels) and preview the first rows.
df_train_pca = pd.DataFrame(transformed_data)
df_train_pca.head()

In [None]:
# Correlation matrix between the principal components. np.corrcoef treats
# rows as variables, hence the transpose of the (samples x components) frame.
corrmat = np.corrcoef(df_train_pca.T)

In [None]:
# Let's plot a heatmap of the correlations between the components obtained from the PCA model.
# After the dimensionality reduction, the newly obtained components should not be correlated with each other.
plt.figure(figsize=[15,15])
sns.heatmap(corrmat, annot=True)

##### As we can see, the 70 newly obtained principal components are not correlated with each other

##### Let's build logistic regression model on top of transformed data received from PCA and predict the churn probability

In [None]:
# Create a logistic regression estimator with scikit-learn's default settings.
logisticRegression = LogisticRegression()

In [None]:
# Train the classifier on the PCA-projected training data.
lrmodel = logisticRegression.fit(df_train_pca, y_train)

In [None]:
# Apply the same preprocessing to the test set as to the training set.
# The PCA was fitted on standardised training data, so the test features must
# first go through the already-fitted scaler (transform only, no refit) and
# only then be projected onto the principal components.
df_test_pca = pc2.transform(scaler.transform(X_test))
df_test_pca.shape

In [None]:
# Class probabilities and predicted labels on the training data.
pred_probs_train = lrmodel.predict_proba(df_train_pca)
y_train_pred = lrmodel.predict(df_train_pca)

In [None]:
# Class probabilities and predicted labels on the held-out test data.
pred_probs_test = lrmodel.predict_proba(df_test_pca)
y_test_pred = lrmodel.predict(df_test_pca)

In [None]:
# ROC AUC on the training data, using the second probability column, shown with two significant digits.
f"{metrics.roc_auc_score(y_train, pred_probs_train[:, 1]):2.2}"

In [None]:
# ROC AUC on the test data, using the second probability column, shown with two significant digits.
f"{metrics.roc_auc_score(y_test, pred_probs_test[:, 1]):2.2}"

##### Let's make a confusion matrix to analyze how each class is being predicted by the model.

In [None]:
# Confusion matrix on the training data (rows: actual class, columns: predicted class).
confusion_matrix(y_train, y_train_pred)

In [None]:
# Confusion matrix on the test data (rows: actual class, columns: predicted class).
confusion_matrix(y_test, y_test_pred)

In [None]:
# Precision on the test data: share of predicted positives that are actual positives.
precision_score(y_test, y_test_pred)

In [None]:
# Recall on the test data: share of actual positives that the model identified.
recall_score(y_test, y_test_pred)