In [None]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re


# to evaluate regression models
from sklearn.metrics import mean_squared_error
# to evaluate classification models
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

#######################################################
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

import warnings # supress warnings
warnings.filterwarnings('ignore')




## Missing Value

In [None]:
# Inspect missing values per variable: fraction missing (ascending) and raw counts.
# Only the last expression of a cell is rendered, so run these on their own to view them.
data.isnull().mean().sort_values(ascending=True)
data.isnull().sum()

# Columns with more than 70% missing values.
# `churn_data_missing` holds percentages (0-100) in column 0, so the cut-off is 70;
# comparing against 0.70 would flag almost every column that has any NaN at all.
MISSING_PCT_THRESHOLD = 70

churn_data_missing = pd.DataFrame(churn_data.isna().mean().round(4) * 100)
churn_data_high_missing = churn_data_missing[churn_data_missing[0] > MISSING_PCT_THRESHOLD]

print(churn_data_missing)
print(churn_data_missing.shape)
# The filtered view was previously computed and thrown away; show it explicitly.
print(churn_data_high_missing)



####################### Categorical missing values #######################
## Categorical features that have missing values
# Object-dtype features that contain at least one missing value.
# The test is `> 0`: with `> 1`, a column holding exactly one NaN was silently skipped.
features_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].dtypes == 'O' and dataset[feature].isnull().sum() > 0
]

# Report the share of missing values as a real percentage so it matches the "%" label.
for feature in features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 2)))
    
    
## Replace missing value with a new label
def replace_cat_feature(dataset, features_nan, label='Missing'):
    """Return a copy of ``dataset`` with NaNs in the given columns replaced by a label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is copied, never modified.
    features_nan : list
        Names of the columns whose missing values should be relabelled.
    label : str, default 'Missing'
        Placeholder written in place of every missing value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with the missing values of the selected columns
        replaced by ``label``; all other columns are left as they were.
    """
    data = dataset.copy()
    data[features_nan] = data[features_nan].fillna(label)
    return data

# Relabel the categorical NaNs, then check the per-column NaN counts (all should be 0).
dataset = replace_cat_feature(dataset, features_nan)

dataset[features_nan].isna().sum()



####################### Numeric missing values #######################
# Non-object features that contain at least one missing value.
# `> 0` rather than `> 1`, so a column with a single NaN is imputed as well.
numerical_with_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].dtypes != 'O' and dataset[feature].isnull().sum() > 0
]

## Print the numerical NaN variables and their percentage of missing values
for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature, np.around(dataset[feature].isnull().mean() * 100, 2)))


## Replace the numerical missing values
for feature in numerical_with_nan:
    ## The median is used because there are outliers
    median_value = dataset[feature].median()

    ## Indicator column (1 = value was missing); it has to be built before the
    ## NaNs are overwritten, otherwise the information is lost.
    dataset[feature + 'nan'] = np.where(dataset[feature].isnull(), 1, 0)

    ## Assign the filled column back. `dataset[feature].fillna(..., inplace=True)`
    ## operates on a selected column and does not update `dataset` when pandas
    ## copy-on-write is active.
    dataset[feature] = dataset[feature].fillna(median_value)

dataset[numerical_with_nan].isnull().sum()


####################### Impute NA with Median #######################
# Let's make a function that creates two new variables from a column such as Age:
# one filling NA with the median, and another one filling NA with zeroes

def impute_na(df, variable, median):
    """Add two imputed copies of ``df[variable]`` to ``df`` in place.

    ``<variable>_median`` has its missing values filled with ``median`` and
    ``<variable>_zero`` has them filled with 0. The source column itself is
    left untouched and nothing is returned.
    """
    source = df[variable]
    for suffix, fill_value in (('_median', median), ('_zero', 0)):
        df[variable + suffix] = source.fillna(fill_value)


# Median of Age in X_train, then add the Age_median / Age_zero columns to X_train.
median = X_train['Age'].median()
impute_na(X_train, 'Age', median)

X_train.head(15)

## Rename column / Drop Columns / Fill NA / Drop NA

In [None]:
## Rename columns (assign the result instead of mutating in place)
df = df.rename(columns={'aug_vbc_3g': 'vbc_3g_8', 'jul_vbc_3g': 'vbc_3g_7'})


## Drop columns
df = df.drop(columns=['circle_id', 'HV_thrashhold'])


## Object-type columns
df.select_dtypes(include=['object']).columns

## Fill NA and drop NA
# `churn_data_missing` stores percentages (0-100), so "more than 70%" is `> 70`, not `> 0.70`.
churn_data_missing.where(churn_data_missing[0] > 70).dropna()
churn_data = churn_data.fillna(0)

### Remove one column from df (the mask form does not fail if the column is absent)
df = df.loc[:, ~df.columns.isin(["mobile_number"])]

### Group by: bar chart of the category counts, most frequent first
X_train.groupby(['FireplaceQu'])['FireplaceQu'].count().sort_values(ascending=False).plot.bar()

## Outliers

In [None]:
# Summary statistics including the upper percentiles (75%, 90%, 95%, 99%, 99.9%) to spot outliers
churn_data_HV.select_dtypes(include=['int64', 'float64']).describe(percentiles=[.75, .90, .95, .99, 0.999])

################# Outlier detection = IQR-style fences ##################################

# Work on the int64/float64 columns only.
churn_data_IQR = churn_data_HV.select_dtypes(include=['int64', 'float64'])

# NOTE(review): the bounds are the 10th/90th percentiles rather than the usual
# 25th/75th quartiles, so "IQR" is a wider spread here -- confirm this is intended.
Q1 = churn_data_IQR.quantile(0.10)
Q3 = churn_data_IQR.quantile(0.90)
IQR = Q3 - Q1

print(IQR)

thrashhold = 1.5
lower_fence = Q1 - thrashhold * IQR
upper_fence = Q3 + thrashhold * IQR

# A row counts as an outlier when any of its values lies outside the fences.
below_fence = churn_data_IQR < lower_fence
above_fence = churn_data_IQR > upper_fence
is_outlier_row = (below_fence | above_fence).any(axis=1)
churn_data_IQR_outlier = churn_data_IQR[~is_outlier_row]

# The filtered frame holds only the selected numeric columns, so its column
# count can differ from churn_data_HV.
print("original count " ,churn_data_HV.shape)
print("After removing outlier"  , churn_data_IQR_outlier.shape)


################# outlier detection=  Z-score ##################################
from scipy import stats
import numpy as np


# Numeric columns only, without "mobile_number" (presumably an identifier, for
# which a z-score carries no meaning -- confirm).
churn_data_HV_o = churn_data_HV.select_dtypes(include=['int64', 'float64'])
churn_data_HV_o = churn_data_HV_o.loc[:, ~churn_data_HV_o.columns.isin(["mobile_number"])]

# Standardise every column (axis=0): each value is compared with the mean/std of
# its own feature. axis=1 would standardise across the columns of one row instead.
# NOTE(review): assumes no NaNs are left in these columns; zscore propagates NaN by default.
z = np.abs(stats.zscore(churn_data_HV_o, axis=0))

# |z| beyond this many standard deviations marks a value as an outlier.
threshold = 3
print(np.where(z > threshold))

## remove outlier: keep the rows whose values are all within the threshold
churn_data_zscore_outlier = churn_data_HV[(z < threshold).all(axis=1)]

print("original count " ,churn_data_HV.shape)
print("After removing outlier " , churn_data_zscore_outlier.shape)

## Scaling - MinMax / Standard

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# StandardScaler
# Use the class imported at the top of this cell. The `preprocessing` module is only
# imported by the later label-encoding cell, so `preprocessing.StandardScaler()`
# raises a NameError when the notebook is run top to bottom on a fresh kernel.
scaler = StandardScaler()

# MinMaxScaler (alternative)
# scaler = MinMaxScaler()

# Fit the scaler on every column of df_train and write the scaled values back.
cols = df_train.columns
df_train[cols] = scaler.fit_transform(df_train[cols])


df_train.head()

## Label Encoding

In [None]:
## Label Encoder on the Embarked variable
from sklearn import preprocessing
# Replace the Embarked categories with integer codes; the fitted encoder stays in `le`.
le = preprocessing.LabelEncoder()
df["Embarked"] = le.fit_transform(df["Embarked"])
df.head()

## apply label encoding on text categorical variable.

from sklearn import preprocessing

# Text-valued categorical columns of `cars` to convert to integer codes.
categorical_text_var = [
    "carbody", "enginelocation", "aspiration", "fueltype", "CarCompany",
    "doornumber", "drivewheel", "fuelsystem", "enginetype", "cylindernumber",
]

# The same encoder object is re-fitted for each column in turn, so once this has
# run `lab` only remembers the classes of the last column it saw.
lab = preprocessing.LabelEncoder()
cars[categorical_text_var] = cars[categorical_text_var].apply(lab.fit_transform)
cars[categorical_text_var].head()

## Plot

In [None]:
################## BOX plot #########################
import seaborn as sns

# One box plot per int64/float64 column of churn_data_HV, each shown as its own figure.
numeric_columns = churn_data_HV.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    sns.set(style="whitegrid")
    column_values = churn_data_HV[col]
    sns.boxplot(x=column_values)
    plt.show()
    

################## Scatter plot- Co-Relation #########################      
# Pairwise scatter plots of the columns of df.
sns.pairplot(df)
plt.show()

################## Heatmap - Correlation #########################
# Correlation matrix of df_train, annotated with the coefficient in each cell.
corr_matrix = df_train.corr()
plt.figure(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")
plt.show()


################## Scatter plot #########################    

# Residuals (actual - predicted) plotted against every selected feature, one panel each.
# The grid grows with the number of features: a fixed 2x3 grid raises a ValueError
# as soon as more than six features are selected.
n_cols = 3
# Ceiling division; never fewer than 2 rows, so up to six features keep the 2x3 layout.
n_rows = max(2, -(-len(selected_features) // n_cols))
residuals = y_train - y_train_predicted

plt.figure(figsize=(20, 6 * n_rows))
for cnt, x in enumerate(selected_features, start=1):
    plt.subplot(n_rows, n_cols, cnt)
    plt.scatter(X_train[x], residuals)
    plt.xlabel(x, fontsize=18)
    plt.ylabel('Error', fontsize=16)
plt.show()


################## Bell curve #########################    

# Distribution ("bell curve") of every int64/float64 column: density histogram plus KDE.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat="density")` is the
# documented replacement and draws the same kind of figure.
sns.set(style="whitegrid")  # loop-invariant, so set once
for col in churn_data_HV.select_dtypes(include=['int64', 'float64']).columns:
    sns.histplot(churn_data_HV[col], kde=True, stat="density", kde_kws=dict(cut=3))
    plt.show()

## Split Data

In [None]:
# Split X / y into 70% train and 30% test; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=40
)

## K-Fold

In [None]:
# k-fold CV (using all the 13 variables)
lm = LinearRegression()

# KFold object with 5 shuffled splits; random_state keeps the folds reproducible
folds = KFold(n_splits=5, shuffle=True, random_state=100)
scores = cross_val_score(lm, X_train, y_train, scoring='r2', cv=folds)
scores

# Other metrics can be used as well, such as MSE. scikit-learn scorers follow a
# "higher is better" convention, so the metric is exposed as 'neg_mean_squared_error'
# (values are <= 0); the plain name 'mean_squared_error' is rejected as an invalid scorer.
scores = cross_val_score(lm, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
scores

##############################cross-validation scheme##############################

# step-1: create a cross-validation scheme
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# step-2: specify range of hyperparameters to tune
# Try every subset size from 1 up to the number of columns in X_train
# (identical to range(1, 14) when X_train has 13 features).
n_features_total = X_train.shape[1]
hyper_params = [{'n_features_to_select': list(range(1, n_features_total + 1))}]


# step-3: perform grid search
# 3.1 specify model
# RFE fits its own clone of the estimator, so `lm` does not need to be fitted first.
lm = LinearRegression()
rfe = RFE(lm)

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring='r2',
                        cv=folds,
                        verbose=1,
                        return_train_score=True)

# 3.3 fit the model
model_cv.fit(X_train, y_train)


# 3.4 cv results
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results


# 3.5 plotting cv results: mean test and train r-squared per number of features
plt.figure(figsize=(16, 6))

plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')
plt.show()


#3.6 final model
# NOTE(review): 10 is presumably read off the CV plot -- confirm against the grid-search results.
n_features_optimal = 10

# `lm` fitted on all features is kept as a baseline; RFE below works on its own clone.
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, n_features_to_select=n_features_optimal)
rfe = rfe.fit(X_train, y_train)

# predict prices of X_test
# Predict through `rfe` so that only the selected features are used; `lm.predict`
# would score the all-features model and ignore the feature selection entirely.
y_pred = rfe.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

## Pipeline

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; import joblib directly.
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

# Load and split the data
# Note: this rebinds X_train / X_test / y_train / y_test to the iris split.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct some pipelines: scale -> 2-component PCA -> classifier
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(random_state=42))])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', svm.SVC(random_state=42))])

pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', tree.DecisionTreeClassifier(random_state=42))])

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Dictionary of pipelines and classifier types for ease of reference
# (keys are the positions in `pipelines`)
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Compare accuracies; each pipeline is scored once and the result reused below
test_scores = [pipe.score(X_test, y_test) for pipe in pipelines]
for idx, acc in enumerate(test_scores):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], acc))

# Identify the most accurate model on test data (first one wins on ties).
# NOTE(review): choosing the model on the test set makes the reported accuracy
# optimistic; a validation split or cross-validation would be the safer choice.
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]  # always a fitted pipeline, never a placeholder value
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

# Save pipeline to file
joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
print('Saved %s pipeline to file' % pipe_dict[best_clf])

## Confusion matrix

In [None]:
# Python script for confusion matrix creation. 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

# Ground-truth labels and predictions for a small binary example.
actual = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0]
predicted = [1, 0, 0, 1, 0, 0, 1, 1, 1, 0]

# Rows correspond to the actual classes, columns to the predicted classes.
results = confusion_matrix(actual, predicted)

# print() calls throughout: Python 2 print statements are a SyntaxError on Python 3.
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(actual, predicted))
print('Report : ')
print(classification_report(actual, predicted))