In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<b>Steps for Regression Modelling:</b>

1. <b>Business Problem Definition</b> - How to predict Red Wine Quality based on Attributes with defined factors
2. <b>Convert the business problem</b> into a statistical problem, e.g. sales = F(attributes, product features, marketing info, etc.); here: quality = F(wine attributes)
3. <b>Finding the right technique</b> - Since we are predicting a value (a regression problem), we can use OLS as one of the techniques. We can also use other Machine Learning techniques like Decision Trees, Ensemble learning, KNN, SVM, ANN etc.
4. <b>Data collection (Y, X)</b> - Identify the sources of information and collect the data
5. <b>Consolidate the data</b> - aggregate and consolidate the data at Model level/customer level/store level depends on business problem
6. <b>Data preparation for modeling</b> (create data audit report to identify the steps to perform as part of data preparation)
    a. missing value treatment
    b. outlier treatment
    c. dummy variable creation
7. Variable creation by using transformation and derived variable creation.
8. <b>Basic assumptions</b> (Normality, linearity, no outliers, homoscedasticity, no pattern in residuals, no autocorrelation etc.)
9. Variable reduction techniques (removing multicollinearity with the help of FA/PCA, correlation matrix, VIF)
10. Create dev and validation data sets (50:50 if you have more data else 70:30 or 80:20)
11. Modeling on dev data set (identify significant variables, model interpretation, check the signs and coefficients, multicollinearity check, measures of goodness of fit, final mathematical equation etc.)
12. validating on validation data set (check the stability of model, scoring, decile analysis, cross validation etc.)
13. Output interpretation and derive insights (understand the limitations of the model and define strategy to implementation)
14. convert statistical solution into business solutions (implementation, model monitoring etc)

In [None]:
#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')

#Packages related to data importing, manipulation, exploratory data analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pandas_profiling
import scipy.stats as stats

#Packages related to data visualizaiton
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Setting plot sizes and type of plot
# Global matplotlib defaults: larger font and grid lines on every axes.
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
# NOTE(review): plt.figure() creates one (empty) 6x3 figure here; it does not change
# the default size of later figures - plt.rcParams['figure.figsize'] would. Confirm intent.
plt.figure(figsize=(6,3))
# Switch the default colormap to grayscale.
plt.gray()

from matplotlib.backends.backend_pdf import PdfPages

#Modules related to split the data & gridsearch
from sklearn.model_selection import train_test_split, GridSearchCV

#Module related to calculation of metrics
from sklearn import metrics

#Module related to VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Modules related to preprocessing (Imputation of missings, standardiszation, new features creation, converting categorical to numerical)
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder

#Moudles related to feature selection
from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, SelectPercentile, f_classif, mutual_info_classif, f_regression, VarianceThreshold, SelectFromModel, mutual_info_classif, mutual_info_regression, SelectFpr, SelectFdr, SelectFwe


#Modules related to pipe line creation for faster processing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
#from sklearn.features.transformers import DataFrameSelector

#Dumping model into current directory: joblib.dump(model_xg,"my_model.pkl") 
#Loading model: my_model_loaded=joblib.load("my_model.pkl")

#Modules related key techniques of supervised learning 
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa

from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
#from xgboost import XGBClassifier, XGBRegressor
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [None]:
# Import data: read the red-wine quality CSV from the Kaggle input directory
# into the Wine_Data DataFrame that the later cells work on.

Wine_Data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
Wine_Data.head(5)

In [None]:
# Count missing values per column, then report the grand total across all columns.

null_counts_by_column = Wine_Data.isnull().sum()
print(null_counts_by_column)
print('Sum of Total Null Values is  {}'.format(null_counts_by_column.sum()))

In [None]:
##Identifying Shape

Wine_Data.shape

In [None]:
Wine_Data.info()

In [None]:
### Data analysis using histograms - one figure per column

for column_name in Wine_Data.columns:
    # Explicit figure/axes per column instead of relying on the implicit current axes.
    fig, ax = plt.subplots()
    Wine_Data[column_name].hist(ax=ax)
    ax.set_xlabel(str(column_name))
    plt.show()

### Observation - Using Bar Plot

Looking at the target variable, most wines have a quality score of 5 or 6 (average quality); scores of 7 and above (good quality) form the next-largest group.

In [None]:
### Pandas Profiling

pandas_profiling.ProfileReport(Wine_Data)

In [None]:
##Target Variable: binarise quality (0 = score below 7, 1 = score of 7 or more)

# Guard on the raw score scale so that re-running this cell is a no-op. Without it,
# a second run would see only 0/1 values (all < 7) and map every wine to class 0.
# Assumes at least one raw score is greater than 1, which holds for 0-10 quality scores.
if Wine_Data['quality'].max() > 1:
    Wine_Data['quality'] = Wine_Data['quality'].apply(lambda x: 0 if x<7 else 1)

In [None]:
Wine_Data.quality.value_counts()

In [None]:
Wine_Data.quality.value_counts()/Wine_Data.quality.count()

In [None]:
##Target Variable - Value Counts

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
# value_counts() orders by frequency, so the slice order used to depend on which class
# happened to be larger. Reindexing by class (0 = bad, 1 = good) ties each count to
# its label explicitly.
y_axis = Wine_Data.quality.value_counts().reindex([0, 1], fill_value=0).tolist()
ax.pie(y_axis,labels=['Bad Quality','Good Quality'],autopct='%1.2f%%')

ax.set_title("Distribution of Wine Quality")

In [None]:
sns.countplot(x="quality", data=Wine_Data)
plt.show()

In [None]:
Wine_Data.groupby(by=['quality']).mean().reset_index()

In [None]:
# Creating Data audit Report

def var_summary(x):
    """Return a data-audit summary of one numeric column.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise.

    Returns
    -------
    pandas.Series
        Count, missing count, sum, mean, median, standard deviation, variance,
        minimum, the 1/5/10/25/50/75/90/95/99th percentiles (computed on the
        non-null values) and maximum, labelled N, NMISS, SUM, ... MAX.
    """
    labels = ['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX']
    percentile_levels = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

    # One quantile call on the non-null values covers all percentile levels.
    percentile_values = x.dropna().quantile(percentile_levels).tolist()

    head = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(), x.std(), x.var(), x.min()]
    tail = [x.max()]
    return pd.Series(head + percentile_values + tail, index=labels)

# One summary column per variable, transposed so each variable becomes a row.
num_data = Wine_Data.apply(var_summary).T

In [None]:
num_data

In [None]:
### visualize correlation matrix in Seaborn using a heatmap

sns.heatmap(Wine_Data.corr())

In [None]:
### Pairwise correlation matrix of all columns, saved to CSV
# NOTE(review): the earlier heading mentioned "p>0.05", but DataFrame.corr() returns
# correlation coefficients only - no p-values are computed in this cell.

Corr_Data = Wine_Data.corr()
Corr_Data.to_csv('Corr_Data.csv')

In [None]:
##Changing Column Name: replace spaces with underscores so the names can be used in formulas

Wine_Data.columns = Wine_Data.columns.str.replace(' ', '_', regex=False)

### variable reduction (feature selection/reduction)

- Univariate Regression
- WOE - Binomial classification
- RFE
- SelectKBest
- VIF
- PCA

#### Univariate Regression

In [None]:
# Fit a one-variable logistic regression per predictor and score it with
# Somers' D (= 2 * AUC - 1) on the in-sample predicted probabilities.
somersd_rows = []
for num_varaible in Wine_Data.columns.difference(['quality']):
    result = smf.logit(formula= str('quality~')+str(num_varaible),data=Wine_Data).fit()
    somers_d = 2 * metrics.roc_auc_score(Wine_Data.quality,result.predict())-1
    somersd_rows.append({'VariableName': num_varaible, 'SomersD': somers_d})

# Build the frame once from the collected rows: this avoids re-copying the frame on
# every iteration and gives SomersD a numeric dtype instead of object.
somersd_df = pd.DataFrame(somersd_rows, columns=['VariableName', 'SomersD'])

In [None]:
imp_vars_SD = somersd_df.sort_values('SomersD', ascending=False).head(11)

In [None]:
imp_vars_SD

In [None]:
imp_vars_somerceD = imp_vars_SD.VariableName

In [None]:
imp_vars_somerceD = list(imp_vars_somerceD)

#### RFE

In [None]:
# Recursive feature elimination with a random forest as the ranking estimator.
X = Wine_Data[Wine_Data.columns.difference(['quality'])]

# Seeded so the feature ranking is the same on every run.
classifier = RandomForestClassifier(random_state=42)
# n_features_to_select must be passed by keyword in current scikit-learn releases.
rfe = RFE(classifier, n_features_to_select=11)
# Pass the target as a 1-D Series; a one-column DataFrame triggers a DataConversionWarning.
rfe = rfe.fit(X, Wine_Data['quality'])

In [None]:
imp_vars_RFE = list(X.columns[rfe.support_])

In [None]:
imp_vars_RFE

#### Select K-Best

In [None]:
# Univariate selection: keep the k features with the highest ANOVA F-score against the target.
X = Wine_Data[Wine_Data.columns.difference(['quality'])]
# The target is passed as a 1-D Series rather than a one-column DataFrame,
# which scikit-learn would otherwise have to ravel (with a DataConversionWarning).
SKB = SelectKBest(f_classif, k=11).fit(X, Wine_Data['quality'])

In [None]:
# Boolean mask of the columns SelectKBest kept, turned into a list of column names.
selected_mask = SKB.get_support()
imp_vars_SKB = X.columns[selected_mask].tolist()

In [None]:
imp_vars_SKB

In [None]:
# Union of the variables picked by the three selection methods. sorted() gives a
# deterministic column order; iterating a set of strings can change order between
# kernel restarts, which would reorder the feature matrix built from this list.
Final_list = sorted(set(imp_vars_SKB + imp_vars_somerceD + imp_vars_RFE))

In [None]:
X = Wine_Data[Final_list]

#### Using WOE (Weight of Evidence)

- Identify important variables using WOE or log(odds) comparing with Y
- Variable Transformation: (i) Bucketing if the variables are not having linear relationship with log(odds)

In [None]:
# For every predictor: cut it into 10 equal-width bins, compute the log-odds of a good
# wine per bin and save the bar chart as one page of 'WOE Plots.pdf'.
# The context manager closes the PDF even if a plot raises part-way through the loop.
with PdfPages('WOE Plots.pdf') as bp:
    for num_variable in Wine_Data.columns.difference(['quality']):
        binned = pd.cut(Wine_Data[num_variable], bins=10, labels=list(range(1,11)))

        # Group once and reuse the result for both the event and non-event counts.
        quality_by_bin = Wine_Data.groupby(binned)['quality']
        events = quality_by_bin.sum()
        non_events = quality_by_bin.count() - events
        odds = events / non_events
        # Bins without events (or without non-events) give -inf / inf / NaN log-odds.
        log_odds = np.log(odds)

        fig,axes = plt.subplots(figsize=(10,4))
        sns.barplot(x=log_odds.index,y=log_odds, ax=axes)
        axes.set_ylabel('Log Odds Ratio')
        axes.set_title(str('Logit Plot for identifying if the bucketing is required or not for variable ') + str(num_variable))
        bp.savefig(fig)

In [None]:
def calculate_woe_iv(dataset, feature, target):
    """Compute Weight of Evidence per distinct feature value and the total Information Value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing both ``feature`` and ``target`` columns.
    feature : str
        Column whose distinct non-null values are treated as the bins.
    target : str
        Binary column; rows with value 0 are counted under 'Good' and rows with
        value 1 under 'Bad'.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with columns Value, All, Good, Bad, Distr_Good, Distr_Bad, WoE, IV
        sorted by WoE, and the sum of the IV column.
    """
    feature_values = dataset[feature]

    # unique() keeps NaN while nunique() ignores it, so indexing unique() with
    # range(nunique()) silently dropped the last real value whenever NaN was present.
    # Working from the non-null distinct values fixes that.
    distinct_values = feature_values.dropna().unique()

    def _counts_per_value(values):
        # Occurrences of each distinct value (0 when absent), in first-seen order.
        return values.value_counts().reindex(distinct_values, fill_value=0).to_numpy()

    # One value_counts pass per column instead of three full scans per distinct value.
    dset = pd.DataFrame({
        'Value': distinct_values,
        'All': _counts_per_value(feature_values),
        'Good': _counts_per_value(feature_values[dataset[target] == 0]),
        'Bad': _counts_per_value(feature_values[dataset[target] == 1]),
    })

    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    # A value with no 'Good' or no 'Bad' rows yields an infinite WoE; it is reset to 0
    # below, so it contributes nothing to the Information Value.
    with np.errstate(divide='ignore', invalid='ignore'):
        dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()

    dset = dset.sort_values(by='WoE')

    return dset, iv

In [None]:
# Print the Information Value of every predictor against the binary quality target.
for col in Wine_Data.columns:
    # The target itself is not a predictor.
    if col == 'quality':
        continue
    print('WoE and IV for column: {}'.format(col))
    df, iv = calculate_woe_iv(Wine_Data, col, 'quality')
    print('IV score: {:.2f}'.format(iv))
    print('\n')

#### VIF - Variance Inflation Factor

In [None]:
vif = pd.DataFrame()

In [None]:
# Variance inflation factor of each selected feature, computed against all the others.
feature_matrix = X.values
n_features = X.shape[1]
vif['VIF Factor'] = [
    variance_inflation_factor(feature_matrix, column_position)
    for column_position in range(n_features)
]
vif['features'] = X.columns

In [None]:
print(vif)

#### Splitting the Data into Train & Test

In [None]:
target = Wine_Data[['quality']]
features = X

In [None]:
#Splitting the data for sklearn methods
# train_test_split returns a (train, test) pair for each array in the order the arrays
# are passed. The target is passed first here, so the y splits are unpacked before the X splits.

train_y, test_y, train_X, test_X = train_test_split(target,features, test_size=0.3, random_state=123)

In [None]:
#for logistic regression using statsmodels

train, test = train_test_split(Wine_Data, test_size=0.5, random_state=0)

#### Model Building

#### Logistic Regression Using Stats Models (Traditional Approach)

In [None]:
# Logistic regression of the binary quality flag on all eleven predictors,
# fitted with the statsmodels formula API on the statsmodels training split.
logit_predictors = [
    'chlorides', 'free_sulfur_dioxide', 'fixed_acidity', 'total_sulfur_dioxide',
    'pH', 'residual_sugar', 'citric_acid', 'volatile_acidity',
    'sulphates', 'density', 'alcohol',
]
logit_formula = 'quality ~ ' + ' + '.join(logit_predictors)
logreg = smf.logit(formula=logit_formula, data=train)
result = logreg.fit()

In [None]:
print(result.summary2())

#### Accuracy Details

In [None]:
# AUC on each split, and the Gini index derived from it (Gini = 2 * AUC - 1).
train_auc = metrics.roc_auc_score(train['quality'], result.predict(train))
test_auc = metrics.roc_auc_score(test['quality'], result.predict(test))

train_gini = 2 * train_auc - 1
test_gini = 2 * test_auc - 1

print("The Gini Index for the model built on the Train Data is : ", train_gini)
print("The Gini Index for the model built on the Test Data is : ", test_gini)

print("The AUC for the model built on the Train Data is : ", train_auc)
print("The AUC for the model built on the Test Data is : ", test_auc)

In [None]:
## Intuition behind ROC curve - predicted probability as a tool for separating the '1's and '0's - Train

# Actual labels and predicted probabilities side by side, aligned on the row index.
train_actual = train['quality']
train_predicted_prob = pd.DataFrame({'prob': result.predict(train)})

train_predict = pd.concat([train_actual.rename('actual'), train_predicted_prob], axis=1)
train_predict.head()

In [None]:
## Intuition behind ROC curve - predicted probability as a tool for separating the '1's and '0's - Test

# Actual labels and predicted probabilities side by side, aligned on the row index.
test_actual = test['quality']
test_predicted_prob = pd.DataFrame({'prob': result.predict(test)})

test_predict = pd.concat([test_actual.rename('actual'), test_predicted_prob], axis=1)
test_predict.head()

In [None]:
## Intuition behind ROC curve - confusion matrix for each different cut-off shows trade off in sensitivity and specificity

# The label masks are computed once and each cut-off is evaluated with boolean array
# arithmetic. This replaces four row-wise DataFrame.apply passes per cut-off and the
# concat-inside-the-loop pattern, which re-copied the result frame on every iteration.
actual_is_pos = (train_predict['actual'] == 1.0).to_numpy()
actual_is_neg = (train_predict['actual'] == 0.0).to_numpy()
train_prob = train_predict['prob'].to_numpy()

roc_like_rows = []
for cut_off in np.linspace(0,1,50):
    # A row is predicted positive unless its probability is strictly below the cut-off.
    predicted_pos = ~(train_prob < cut_off)
    predicted_neg = ~predicted_pos

    tp = np.float64(np.sum(actual_is_pos & predicted_pos))
    fp = np.float64(np.sum(actual_is_neg & predicted_pos))
    tn = np.float64(np.sum(actual_is_neg & predicted_neg))
    fn = np.float64(np.sum(actual_is_pos & predicted_neg))

    roc_like_rows.append({
        'cutoff': cut_off,
        'sensitivity': tp / (tp + fn),
        'specificity': tn / (tn + fp),
        'accuracy': (tp + tn) / (tp + fn + tn + fp),
    })

roc_like_df = pd.DataFrame(roc_like_rows, columns=['cutoff', 'sensitivity', 'specificity', 'accuracy'])


In [None]:
roc_like_df.head()

In [None]:
## Finding ideal cut-off for checking if this remains same in OOS validation
roc_like_df['total'] = roc_like_df['sensitivity'] + roc_like_df['specificity']

In [None]:
roc_like_df.head()

In [None]:
#Cut-off based on highest sum(sensitivity+specicity)   - common way of identifying cut-off

roc_like_df[roc_like_df['total']==roc_like_df['total'].max()]

In [None]:
#Cut-off based on highest accuracy   - some teams use this as methodology to decide the cut-off

roc_like_df[roc_like_df['accuracy']==roc_like_df['accuracy'].max()]

In [None]:
#Cut-off based on highest sensitivity

roc_like_df[roc_like_df['sensitivity']==roc_like_df['sensitivity'].max()]

In [None]:
# Apply the chosen probability cut-off (0.183673): probabilities above it are labelled 1.
# NOTE(review): the earlier comment stated the best cut-off was 0.53, but the value
# applied is 0.183673 - confirm it against the highest (sensitivity + specificity) row.

test_predict['predicted'] = test_predict['prob'].apply(lambda x: 1 if x > 0.183673 else 0)
train_predict['predicted'] = train_predict['prob'].apply(lambda x: 1 if x > 0.183673 else 0)

In [None]:
train_predict.head()

In [None]:
print("The overall accuracy score for the Train Data is : ", metrics.accuracy_score(train_predict.actual, train_predict.predicted))
print("The overall accuracy score for the Test Data  is : ", metrics.accuracy_score(test_predict.actual, test_predict.predicted))

In [None]:
print(metrics.classification_report(train_predict.actual, train_predict.predicted))

In [None]:
print(metrics.classification_report(test_predict.actual, test_predict.predicted))

### Decile Analysis

#Decile analysis
#Top-two deciles - High risk (Low Quality Wine) - will reject 
#3rd,4th, 5th deciles - medium risk (Medium Quality Wine) - will accept wine with proper quality
#6th decile onwards - low risk Wine - accept the Wine


#Decile analysis for validation of models - Business validation

In [None]:
train_predict['Deciles'] = pd.qcut(train_predict['prob'],10,labels=False)

In [None]:
train_predict.head()

In [None]:
test_predict['Deciles'] = pd.qcut(test_predict['prob'],10,labels=False)
test_predict.head()

In [None]:
# Decile Analysis for train data
# Per-decile totals and probability range, ordered from the highest decile down.

train_by_decile = train_predict.groupby('Deciles')
no_1s = train_by_decile['actual'].sum().sort_index(ascending=False)
no_total = train_by_decile['actual'].count().sort_index(ascending=False)
max_prob = train_by_decile['prob'].max().sort_index(ascending=False)
min_prob = train_by_decile['prob'].min().sort_index(ascending=False)

In [None]:
Decile_analysis_train = pd.concat([max_prob, min_prob, no_1s, no_total-no_1s, no_total], axis=1)

Decile_analysis_train.reset_index()

In [None]:
# Decile Analysis for test data
# Per-decile totals and probability range, ordered from the highest decile down.

test_by_decile = test_predict.groupby('Deciles')
no_1s = test_by_decile['actual'].sum().sort_index(ascending=False)
no_total = test_by_decile['actual'].count().sort_index(ascending=False)
max_prob = test_by_decile['prob'].max().sort_index(ascending=False)
min_prob = test_by_decile['prob'].min().sort_index(ascending=False)

# Columns: max prob, min prob, number of 1s, number of 0s, total rows.
decile_columns = [max_prob, min_prob, no_1s, no_total-no_1s, no_total]
Decile_analysis_test = pd.concat(decile_columns, axis=1)

Decile_analysis_test.reset_index()

#### 1. Logistic Regression using SkLearn

In [None]:
model = LogisticRegression()
model_Reg = model.fit(train_X,train_y)

In [None]:
# Probability of class 1 from the sklearn logistic regression, stored next to the features.
# Only the non-'pred_prob' columns are scored, so this cell can be re-run: once
# 'pred_prob' has been added, scoring the whole frame would hand predict_proba one
# column more than the model was fitted on.
lr_feature_columns = [column for column in train_X.columns if column != 'pred_prob']
train_X['pred_prob'] = model_Reg.predict_proba(train_X[lr_feature_columns])[:, 1]
test_X['pred_prob'] = model_Reg.predict_proba(test_X[lr_feature_columns])[:, 1]

In [None]:
train = pd.concat([train_X, train_y], axis=1)
test  = pd.concat([test_X,test_y],axis=1)

In [None]:
# Label a row 1 when its predicted probability exceeds the 0.183673 cut-off, else 0.
train['pred'] = (train.pred_prob > 0.183673).astype(int)
test['pred'] = (test.pred_prob > 0.183673).astype(int)

In [None]:
train.head(4)

In [None]:
test.head(4)

In [None]:
# Confusion matrix of the sklearn logistic regression on the training data.
# The function is called through the already-imported `metrics` module so that the
# result no longer overwrites the function it came from; the cell can be re-run.
confusion_matrix = metrics.confusion_matrix(train.quality, train.pred)
print(confusion_matrix)

In [None]:
print(metrics.classification_report(train.quality, train.pred))

In [None]:
print(metrics.classification_report(test.quality, test.pred))

In [None]:
#### K-nearest Neighbours

In [None]:
sc = StandardScaler()

In [None]:
# Standardise the predictors for KNN: fit the scaler on train, apply it to test.
# 'pred_prob' (the logistic-regression output added to train_X/test_X earlier) is
# left out so KNN is trained on the original features only - the same exclusion the
# notebook applies before the decision tree.
knn_feature_columns = [column for column in train_X.columns if column != 'pred_prob']
std_data_train = pd.DataFrame(sc.fit_transform(train_X[knn_feature_columns]), columns=knn_feature_columns, index = train_X.index )
std_data_test = pd.DataFrame(sc.transform(test_X[knn_feature_columns]), columns=knn_feature_columns, index = test_X.index )

In [None]:
#Using GridSearchCV with standardized data
# Search over the number of neighbours and the vote weighting with 5-fold
# cross-validation, scored by weighted F1.
param_grid = {'n_neighbors':[3,4,5,6,7],
              'weights': ['uniform', 'distance']}

model = GridSearchCV(KNeighborsClassifier(), param_grid = param_grid, cv=5, scoring = 'f1_weighted')
model_KNN = model.fit(std_data_train, train_y)

In [None]:
# Only the last expression of a cell is displayed, so the best score is printed
# explicitly; the best parameters remain the cell's displayed value.
print(model_KNN.best_score_)
model_KNN.best_params_

In [None]:
train_pred = model_KNN.predict(std_data_train)
test_pred  = model_KNN.predict(std_data_test)

In [None]:
print(metrics.classification_report(train_y,train_pred))

In [None]:
print(metrics.classification_report(test_y,test_pred))

#### Decision Tree Classifier

In [None]:
train_X = train_X[train_X.columns.difference(['pred_prob'])]
test_X = test_X[test_X.columns.difference(['pred_prob'])]

In [None]:
param_grid = {'max_depth':np.arange(2,5),
              'max_features':np.arange(2,5)}

In [None]:
# Grid search over tree depth and features-per-split with 5-fold cross-validation.
# The tree is seeded: with max_features below the number of columns each split samples
# candidate features at random, so an unseeded search gives different results per run.
tree = GridSearchCV(DecisionTreeClassifier(random_state=42),param_grid,cv=5,n_jobs=-1)
tree.fit(train_X,train_y)

In [None]:
tree.best_estimator_

In [None]:
tree.best_score_

In [None]:
tree.best_params_

#### Decision Tree- Classification Report

In [None]:
train_pred = tree.predict(train_X)
test_pred  = tree.predict(test_X)

In [None]:
print(metrics.classification_report(train_y,train_pred))

In [None]:
print(metrics.classification_report(test_y,test_pred))

#### Final Decision Tree Model

In [None]:
# Final tree with fixed hyper-parameters, fitted on the predictors only.
train_X = train_X[train_X.columns.difference(['pred_prob'])]
# Seeded because max_features=3 makes every split sample candidate features at random;
# without a seed the fitted tree and its feature importances change from run to run.
clf_tree = DecisionTreeClassifier( max_depth = 3, max_features=3, max_leaf_nodes=5, random_state=42 )
clf_tree.fit( train_X, train_y )

In [None]:
## Classification reports for the final decision tree (train first, then test)
# No ROC curve is drawn in this cell; it prints precision/recall/F1 per class.

print(metrics.classification_report(train_y, clf_tree.predict(train_X)))
print(metrics.classification_report(test_y, clf_tree.predict(test_X)))

In [None]:
clf_tree.feature_importances_

In [None]:
# Pair every training column with the importance the final tree assigned to it.
import itertools
feature_map = list(itertools.zip_longest(train_X.columns, clf_tree.feature_importances_))

feature_map

In [None]:
# Feature importances as a table, most important feature first.
Feature_importance = (
    pd.DataFrame(feature_map, columns=['Feature', 'importance'])
    .sort_values('importance', ascending=False)
)
Feature_importance.head(30)

#### XG Boost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier with a small learning rate and many rounds.
# Two keyword names are corrected: 'reg_alpa' was a typo for reg_alpha, so the L1
# penalty was never applied, and 'verbose' is not an XGBClassifier parameter - the
# logging level is set through 'verbosity'.
xgb_estimator = XGBClassifier( learning_rate=0.01,
                               n_estimators=1000,
                               max_depth=5,
                               min_child_weight=1,
                               gamma=1,
                               subsample=0.8,
                               colsample_bytree=0.8,
                               n_jobs=-1,
                               reg_alpha=1,
                               scale_pos_weight=1,
                               random_state=42,
                               verbosity=1)

In [None]:
xgb_estimator.fit(train_X,train_y)

In [None]:
print(metrics.classification_report(train_y,xgb_estimator.predict(train_X)))

In [None]:
print(metrics.classification_report(test_y,xgb_estimator.predict(test_X)))

In [None]:
# AUC of the XGBoost model on train and test, using the class-1 probability column.
train_positive_proba = xgb_estimator.predict_proba(train_X)[:, 1]
print(metrics.roc_auc_score(train_y, train_positive_proba))

test_positive_proba = xgb_estimator.predict_proba(test_X)[:, 1]
print(metrics.roc_auc_score(test_y, test_positive_proba))

#### Naive Bayes Classifier

In [None]:
Nb_Clf = GaussianNB()
Nb_Clf.fit(train_X,train_y)

In [None]:
metrics.accuracy_score(train_y,Nb_Clf.predict(train_X))

In [None]:
print(metrics.classification_report(train_y,Nb_Clf.predict(train_X)))
print(metrics.classification_report(test_y,Nb_Clf.predict(test_X)))