# JupyterHub Notebook

### This notebook server is hosted on the OpenShift platform, which provides a separate server for each individual user. The platform takes care of provisioning the server and allocating the related storage.


In [None]:
# Install packages that won't be present if this lab is done out of sequence.
!pip3 install seaborn
!pip3 install verta

In [None]:
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import watermark
# import s3fs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from minio import Minio
from verta import Client
from minio.error import ResponseError
import os
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline

# import tools as tools
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark

In [None]:
# Print an environment report for reproducibility.
# NOTE(review): flags presumably select date, Python version, machine info,
# git hash and imported-package versions — confirm against the watermark docs.
%watermark -n -v -m -g -iv


## Load Lab parameters

In [None]:
from config import download_csv_files, get_params

# Unpack the per-user lab settings returned by config.get_params().
(
    user_id,
    PROJECT_NAME,
    EXPERIMENT_NAME,
    experiment_name,
    s3BucketFullPath,
) = get_params()

In [None]:
def get_s3_server():
    """Create a MinIO client for the workshop's object store.

    Connection settings are read from the environment variables
    ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY``. When a
    variable is unset, the workshop default is used, so existing lab
    environments keep working unchanged.

    Returns:
        Minio: a client that connects over plain HTTP (``secure=False``).
    """
    # NOTE(review): the fallbacks are the workshop's default credentials; set
    # the environment variables instead of editing them for any other setup.
    endpoint = os.environ.get('MINIO_ENDPOINT', 'minio-ml-workshop:9000')
    access_key = os.environ.get('MINIO_ACCESS_KEY', 'minio')
    secret_key = os.environ.get('MINIO_SECRET_KEY', 'minio123')

    return Minio(endpoint,
                 access_key=access_key,
                 secret_key=secret_key,
                 secure=False)

### Read the Merged Data

In [None]:
# Connect to the object store and fetch the lab's CSV data.
minioClient = get_s3_server()
# NOTE(review): download_csv_files comes from config; presumably it writes
# /tmp/data.csv, which is read below — confirm.
download_csv_files(minioClient, s3BucketFullPath)
# Earlier single-object download, kept for reference:
# data_file = minioClient.fget_object("data", s3BucketFullPath, "/tmp/data.csv")
# data_file_version = data_file.version_id
data = pd.read_csv('/tmp/data.csv')
# Preview the first five rows.
data.head(5)


### Use pandas.DataFrame functions
- _shape_ to return the dimensionality
- _info_ to print a concise summary of the DataFrame
- _describe_ to generate descriptive statistics of the DataFrame's columns
- _isnull().sum()_ to sum the empty values
- finally, examine the Churn and Total Charges columns


In [None]:
# Dimensionality of the DataFrame as (rows, columns).
data.shape

In [None]:
# Concise summary of the DataFrame.
data.info()

In [None]:
# Descriptive statistics of the DataFrame's columns.
data.describe()

In [None]:
# Number of null values in each column.
data.isnull().sum()

In [None]:
# Count plot of the target: how many rows fall in each Churn class.
fig, ax = plt.subplots(figsize=(10, 6), dpi=80)
sns.countplot(x="Churn", data=data, ax=ax)
ax.set_title('Distribution of the Target Variable', fontsize=20)
ax.set_xlabel('Churn', fontsize=15)
ax.set_ylabel('Count', fontsize=15)

In [None]:
# Convert binary variable into numeric so plotting is easier. We need to later take mean
# Guard on the dtype so re-running this cell is a no-op: mapping an
# already-numeric column through the Yes/No dictionary would turn every
# value into NaN.
if not pd.api.types.is_numeric_dtype(data['Churn']):
    data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
def plot_churn_counts(frame, panels):
    """Draw one bar chart of summed Churn per category for each panel.

    Args:
        frame: DataFrame holding a numeric ``Churn`` column and the
            categorical columns named in ``panels``.
        panels: iterable of ``(axes, column, xlabel, title)`` tuples; each
            chart groups ``frame`` by ``column`` and is drawn on ``axes``.
    """
    for panel_ax, column, xlabel, title in panels:
        frame.groupby(column).Churn.sum().plot(kind='bar', ax=panel_ax)
        panel_ax.set_ylabel('Total count', fontsize=20)
        panel_ax.set_xlabel(xlabel, fontsize=20)
        panel_ax.tick_params(labelsize=18)
        panel_ax.set_title(title, fontsize=20)


fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(ncols=2, nrows=3, figsize=(25, 17), dpi=80)
# Extra vertical spacing between the rows of panels.
plt.subplots_adjust(hspace=1.5)
# Global default tick-label size; the panels below set their own size (18)
# explicitly through tick_params.
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

plot_churn_counts(data, [
    (ax1, 'gender', 'Gender', 'Churn count by Gender'),
    (ax2, 'InternetService', 'Internet Service Type', 'Churn count by Internet Service'),
    (ax3, 'DeviceProtection', 'Device Protection', 'Churn count by Device Protection'),
    (ax4, 'OnlineSecurity', 'Online Security', 'Churn count by Online Security'),
    (ax5, 'TechSupport', 'Technical Support', 'Churn count by Technical Support'),
    (ax6, 'Contract', 'Contract Type', 'Churn count by Contract'),
])


In [None]:
# Replace cells that hold a single space with NaN, in place, so pandas
# treats them as missing (presumably how the CSV encodes blanks — confirm).
data.replace(" ", np.nan, inplace=True)

In [None]:
# Number of NaN values in each column.
data.isna().sum()

In [None]:
# Convert TotalCharges to a numeric dtype.
# NOTE(review): presumably the column was read as strings because of blank
# entries — confirm against data.info().
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [None]:
# Impute the missing TotalCharges values with the column mean.
# Only TotalCharges is filled: a frame-wide fillna would also write this
# mean into missing cells of every other column, including categorical ones.
mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(mean)
# Confirm that TotalCharges no longer has missing values.
data.isna().sum()

In [None]:
# Box plot of TotalCharges for each contract type, split by Churn.
# catplot is a figure-level function: it creates its own figure (sized via
# height/aspect) and returns a FacetGrid, so no plt.figure() call is made
# first — it would only leave an empty extra figure in the cell output.
grid = sns.catplot(x="Contract", y="TotalCharges", hue="Churn", kind="box",
                   data=data, height=6, aspect=1.5, palette='RdBu')
# The pyplot calls below act on the current axes, i.e. the one catplot drew.
plt.title('Comparison of Total Charges for each Contract', fontsize=20)
plt.xlabel('Contract', fontsize=15)
plt.ylabel('Total Charges', fontsize=15)