In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts, RandomizedSearchCV,StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, power_transform
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, r2_score, precision_recall_curve, roc_auc_score, roc_curve, auc, f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, RFE, RFECV, chi2   #chi2 aka. chi square is used when working with 2 categorical columns.
from sklearn.decomposition import PCA
from scipy import stats
import statsmodels.api as sm
import pprint
from statsmodels.stats.outliers_influence import variance_inflation_factor

**Importing Data**

In [None]:
# Extra strings that pandas should read as missing values.
missing_values = [
    "n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",
    'inf', '-inf',
]
data = pd.read_csv(
    '../input/mushroom-classification/mushrooms.csv',
    na_values=missing_values,
)
data.head()

**Creating Train and Test sets using Stratified Shuffle Split**


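**A stratified shuffle split avoids sampling bias with respect to the target: it draws samples from every target class in proportion to that class's share of the data, so the train and test sets keep the same class balance.**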

In [None]:
# n_splits=1 because a single train/test partition is wanted; 20% of the rows
# go to the test set and the split is stratified on the 'class' column.
split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)
train_index, test_index = next(split.split(data, data['class']))

# split() yields positional indices, so rows are selected with .iloc; the
# label-based .loc is only equivalent while `data` has a default RangeIndex.
# The explicit copies keep later in-place edits independent of `data`.
stratified_train_data = data.iloc[train_index].copy()
stratified_test_data = data.iloc[test_index].copy()

print(stratified_train_data.shape , stratified_test_data.shape)

In [None]:
# Remove the target from the test features. `columns=` replaces the positional
# axis argument of drop(labels, 1), which pandas 2.0 no longer accepts.
stratified_test_data = stratified_test_data.drop(columns=['class'])

**Using Stratified Train Data**

**Label Encoding the data as ML model does not accept non-numerical values**

In [None]:
le = LabelEncoder()

# The 22 feature columns that are label-encoded.
categorical_columns = [
    "cap-shape", 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]

# DataFrame.apply calls le.fit_transform once per column, so the single
# encoder object is re-fitted for every column in turn.
encoded_features = stratified_train_data[categorical_columns].apply(le.fit_transform)
stratified_train_data[categorical_columns] = encoded_features

In [None]:
# Encode the target numerically in one pass: 'p' -> 0 and 'e' -> 1.
class_codes = {'p': 0, 'e': 1}
stratified_train_data['class'] = stratified_train_data['class'].replace(class_codes)

**Creating a Pearson Correlation function for deleting column with above threshold value. In this case I have chosen 0.6 or 60%**

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise correlations (``dataset.corr()``) are inspected.
    threshold : float
        A column is reported when the absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. Of every correlated pair only the later
        column is reported, so dropping the returned columns keeps one
        representative of each pair.
    """
    # The absolute value matters: a correlation of -0.9 signals as much
    # redundancy as +0.9 and must be flagged as well.
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns

    col_corr = set()  # Set of all the names of correlated columns
    for i in range(len(columns)):
        # Only the lower triangle (columns before position i) is scanned.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(columns[i])
    return col_corr

In [None]:
# Inspect the features only. Selecting them by name (everything except the
# target) does not depend on where 'class' sits in the frame, unlike the
# positional slice iloc[:, :22].
corr_features = correlation(stratified_train_data.drop(columns='class'), 0.6)
print("Number of correlated features are :",len(corr_features))
print("Independent correlated features are to be deleted are :",corr_features)

**Deleting columns whose feature-to-target correlation lies between -0.1 and 0.1, as they are only weakly correlated with the target**

In [None]:
stratified_train_data.head()

In [None]:
# Reorder the frame so that the target column 'class' comes last.
feature_columns = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]
stratified_train_data = stratified_train_data[feature_columns + ['class']]

In [None]:
stratified_train_data.columns

In [None]:
print('FEATURES CORRELATION TO TARGET VALUES :')
# Correlate every column with the target. Slicing with columns[1:] left the
# first feature out of the analysis once 'class' had been moved to the end.
train_data_corr = stratified_train_data.corr()['class']
print(train_data_corr)
print("=================================================")
print('DELETING FETAURES THAT ARE LESS CORRELATED TO TARGET VARIABLES BETWEEN -0.1 & 0.1')
# A constant column has an undefined (NaN) correlation; it carries no signal
# either, so it is listed together with the weakly correlated features.
weakly_correlated = ((train_data_corr > -0.1) & (train_data_corr < 0.1)) | train_data_corr.isna()
train_data_corr = train_data_corr[~weakly_correlated]
print(train_data_corr)
print("=================================================")
print("PRINTING THE DELETED COLUMN NAMES")
# Despite its name, new_train_data holds the names of the columns to delete.
new_train_data = stratified_train_data.columns[~stratified_train_data.columns.isin(train_data_corr.index)]
print(new_train_data)

In [None]:
#DELETING ALL THE UNWANTED COLUMNS AND ALSO DELETING THE 'veil-type' COLUMN AS IT IS USELESS FOR US
# `columns=` replaces the positional axis argument of drop(labels, 1), which
# pandas 2.0 no longer accepts; rebinding avoids mutating a column slice in place.
stratified_train_data = stratified_train_data.drop(
    columns=['cap-shape','cap-color','veil-type','veil-color', 'spore-print-color', 'ring-type']
)

In [None]:
# Feature matrix and target vector. drop('class', 1) relied on a positional
# axis argument that pandas 2.0 no longer accepts.
X = stratified_train_data.drop(columns='class')
y = stratified_train_data['class']

# Feature Selection

### Using SelectKBest library of sklearn to select features that are most important using chi-square hypothesis testing.


> Chi-square hypothesis testing works on two columns at a time. Here, one column is a feature and the other is the target.


> This is done for every feature in turn, and the k highest-scoring features (the k value of SelectKBest) are kept.





In [None]:
# Score every feature against the target with the chi-square test and keep
# the 9 highest-scoring ones.
select_K_Best = SelectKBest(k=9, score_func=chi2)
selected_features = select_K_Best.fit(X, y)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [X.columns[i] for i in indices_selected]

# .copy() makes X an independent frame, so adding the 'stalk-surface' column
# further down does not trigger pandas' SettingWithCopyWarning.
X = X[colnames_selected].copy()

In [None]:
print("The 9 most important features chosen by SelectKBest are : \n {} ".format(X.columns))

## Calculating Multicollinearity in data

* Using Variance Inflation Factor to check the multicollinearity or dependency in independent columns

In [None]:
def calc_vif(X):
    """Return a frame listing the variance inflation factor of every column of ``X``.

    The result has one row per column of ``X`` with the column name in
    "variables" and its variance inflation factor in "VIF".
    """
    vif_values = []
    for position in range(X.shape[1]):
        vif_values.append(variance_inflation_factor(X.values, position))

    return pd.DataFrame({"variables": X.columns, "VIF": vif_values})

In [None]:
calc_vif(X)

In [None]:
# Merge the two stalk-surface features into a single column by adding their
# encoded values.
above_ring = X['stalk-surface-above-ring']
below_ring = X['stalk-surface-below-ring']
X['stalk-surface'] = above_ring.add(below_ring)
X['stalk-surface']

In [None]:
X.head()

In [None]:
# The two source columns are now represented by 'stalk-surface'. `columns=`
# replaces the positional axis argument that pandas 2.0 no longer accepts.
X = X.drop(columns=['stalk-surface-above-ring','stalk-surface-below-ring'])

In [None]:
calc_vif(X)

**Models like linear regression and logistic regression are commonly assumed to work best on data that follows a Gaussian distribution, so before using them we transform skewed columns towards a normal distribution.
SVMs, neural networks, tree-based models and boosting do not require this transformation.**

In [None]:
# Per-column skewness of the features, shown as a one-column frame.
skewness_of_df = X.skew().to_frame()
skewness_of_df

In [None]:
def _plot_boxcox_effect(name, before, after):
    """Draw histogram and normal Q-Q plot of one column before and after Box-Cox."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    before.hist(ax=axes[0, 0])
    axes[0, 0].set_title("{} - before".format(name))
    stats.probplot(before, dist="norm", plot=axes[0, 1])
    after.hist(ax=axes[1, 0])
    axes[1, 0].set_title("{} - after Box-Cox".format(name))
    stats.probplot(after, dist="norm", plot=axes[1, 1])


def boxcox_transformation(df, column=None, plot=True):
    """Box-Cox transform every strongly skewed column of ``df`` in place.

    A column is transformed when the absolute value of its skewness is
    greater than 1.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected and overwritten in place.
    column : str, iterable of str or None
        Name(s) of the columns to inspect. ``None`` inspects every column.
    plot : bool, default True
        Draw the before/after histogram and Q-Q plot for each transformed
        column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Earlier revisions ignored ``column``, stopped after the first skewed
    column, and returned the function object itself.
    """
    if column is None:
        names = list(df.columns)
    elif isinstance(column, str):
        names = [column]
    else:
        names = list(column)

    for name in names:
        skew_before = df[name].skew()
        # Written as "not >" so that an undefined (NaN) skewness is skipped too.
        if not abs(skew_before) > 1.0:
            continue

        original = df[name]
        # Box-Cox needs strictly positive input; the +1 shift makes zero
        # values (such as an encoded category 0) valid.
        transformed, _ = stats.boxcox(original.to_numpy(dtype=float) + 1)
        df[name] = transformed
        print(name, "skew before:", skew_before, "skew after:", df[name].skew())

        if plot:
            _plot_boxcox_effect(name, original, df[name])

    return df


In [None]:
column = ['bruises','gill-spacing','gill-size','gill-color','stalk-root','population','habitat','stalk-surface']
boxcox_transformation(X,column)

## Principal Component Analysis

**Applying Principal Component Analysis (PCA), this helps to handle multicollinearity in data as column stalk-surface has high multicollinearity.**

In [None]:
pca = PCA(n_components = 7,random_state=42)
transformed_data = pca.fit_transform(X)

# Component names follow the number of fitted components, and the original
# row labels are kept so that X stays label-aligned with y (a fresh
# RangeIndex would only match y by position).
pca_columns = ['PCA{}'.format(i + 1) for i in range(transformed_data.shape[1])]
X = pd.DataFrame(data = transformed_data, columns = pca_columns, index = X.index)
X

**`explained_variance_ratio_` gives the fraction of the total variance explained by each principal component. In our case there are 7 components, as passed in n_components = 7.**

**`noise_variance_` gives the estimated noise variance of the data, i.e. the average variance of the components that PCA did not keep.**

In [None]:
print(pca.explained_variance_ratio_)
print(pca.noise_variance_)

In [None]:
X_train, X_test, y_train, y_test = tts(X, y, random_state=42,test_size=0.3)

In [None]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
def evaluate(model, X_test, y_test, X_train, y_train):
    """Print classification metrics of a fitted model and return its accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : features and true labels the model is scored on.
    X_train, y_train : unused; kept so existing calls keep working.

    Returns
    -------
    float
        Accuracy on ``X_test`` (1 - misclassification rate). Earlier revisions
        returned the function object itself, so ``base_accuracy = evaluate(...)``
        never held a number.
    """
    y_pred = model.predict(X_test)
    # Share of wrongly predicted samples; for 0/1 labels this equals the mean
    # absolute difference that was reported before.
    error_rate = float(np.mean(np.asarray(y_pred) != np.asarray(y_test)))
    print('Average Error: {:0.4f}'.format(error_rate))
    print(classification_report(y_test,y_pred))
    print(confusion_matrix(y_test,y_pred))
    print('Recall Score = ',recall_score(y_test, y_pred))
    print('Precision Score = ',precision_score(y_test, y_pred))

    return 1.0 - error_rate

In [None]:
def train_auc_roc_curve(model, X_test, y_test, X_train, y_train):
    """Plot the ROC curve of ``model`` on the training data and return its AUC.

    The ROC points for ``X_test`` are printed as well. Labels are expected to
    be 0/1 with 1 as the positive class.

    Returns
    -------
    float
        Area under the training ROC curve (earlier revisions returned the
        function object itself).
    """
    def _scores(features):
        # A ROC curve needs a continuous score; hard 0/1 predictions collapse
        # it to a single operating point. Fall back to predict() only for
        # models without probability estimates.
        if hasattr(model, "predict_proba"):
            return model.predict_proba(features)[:, 1]
        return model.predict(features)

    print("roc curve :", roc_curve(y_test, _scores(X_test)))
    base_fpr, base_tpr, base_threshold = roc_curve(y_train, _scores(X_train))
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.plot(base_fpr, base_tpr)
    train_auc = auc(base_fpr, base_tpr)
    print("auc score :", train_auc)

    return train_auc

In [None]:
def test_auc_ruc_curve(model, X_test, y_test):
    """Plot the ROC curve of ``model`` on ``X_test`` and return its AUC.

    Labels are expected to be 0/1 with 1 as the positive class.

    Returns
    -------
    float
        Area under the ROC curve (earlier revisions returned the function
        object itself).
    """
    # A ROC curve needs a continuous score; hard 0/1 predictions collapse it
    # to a single operating point.
    if hasattr(model, "predict_proba"):
        test_scores = model.predict_proba(X_test)[:, 1]
    else:
        test_scores = model.predict(X_test)

    test_fpr, test_tpr, test_threshold = roc_curve(y_test, test_scores)
    test_auc = auc(test_fpr, test_tpr)
    print(test_auc)
    plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
    plt.plot(test_fpr, test_tpr)

    return test_auc

## Applying Classification models

## Logistic Regression

## Default Logistic Regression

In [None]:
default_logistic_model = LogisticRegression(random_state = 1)
default_logistic_model.fit(X_train, y_train)
base_accuracy = evaluate(default_logistic_model, X_test, y_test, X_train, y_train)

In [None]:
train_auc_roc_curve(default_logistic_model, X_test, y_test, X_train, y_train)

## Tuned Logistic Regression 

In [None]:
logistic = LogisticRegression(random_state=42)

In [None]:
random_parameters = {'C' : [0.001, 0.01, 0.1, 1.0],
                     'penalty' : ['l2'],
                     'solver' : ['lbfgs', 'newton-cg', 'saga'],
                     'max_iter' : [300,400,500,600,700,900,1000]}

# random_state makes the sampled candidates reproducible between runs.
random_search_logistic = RandomizedSearchCV(logistic, param_distributions= random_parameters,
                                            n_iter=60, cv=5, random_state=42)
# Search on the scaled training split only: the full X is unscaled and still
# contains the rows of X_test, so tuning on it leaks the validation data and
# does not match the scale used when the model is evaluated.
random_search_logistic.fit(X_train, y_train)

In [None]:
print(random_search_logistic.best_estimator_)
print(random_search_logistic.best_params_)

In [None]:
# Hyper-parameters taken from the random search above. Arguments that only
# restated scikit-learn defaults are omitted; multi_class in particular is
# deprecated in recent releases.
logistic = LogisticRegression(C=1.0, max_iter=600, penalty='l2',
                   random_state=42, solver='newton-cg')
# Train on the scaled training split only. Fitting on the full, unscaled X
# would include the rows of X_test and would not match the scale of the data
# the model is evaluated on.
logistic.fit(X_train, y_train)

In [None]:
evaluate(logistic, X_test, y_test, X_train, y_train)

In [None]:
train_auc_roc_curve(logistic, X_test, y_test, X_train, y_train)

## Decision Tree Classifier 

### Default Decision Tree Classifier

In [None]:
default_decision_tree_model = DecisionTreeClassifier(random_state=42)
default_decision_tree_model.fit(X_train, y_train)
base_accuracy = evaluate(default_decision_tree_model, X_test, y_test, X_train, y_train)

In [None]:
train_auc_roc_curve(default_decision_tree_model, X_test, y_test, X_train, y_train)

### Tuned Decision Tree Classifier

In [None]:
dtree_classifier = DecisionTreeClassifier(random_state=42)

**cost_complexity_pruning_path helps to find different ccp values which will be later used in random search for choosing the best one**


In [None]:
path = dtree_classifier.cost_complexity_pruning_path(X_train, y_train)
alphas = path['ccp_alphas']
alphas

In [None]:
# 'auto' is no longer a valid max_features value in current scikit-learn
# (for classifiers it meant the same as 'sqrt'), so it is left out.
random_dtree_parameters = {'ccp_alpha' : alphas,
                           'criterion' : ['gini','entropy'],
                           'splitter' : ['best','random'],
                           'max_depth' : [8,10,12,15,20,24,32],
                           'min_samples_leaf' : [2,3,5],
                           'max_features' : ['sqrt', 'log2']}

# random_state makes the sampled candidates reproducible between runs.
random_search_dtree = RandomizedSearchCV(dtree_classifier, param_distributions= random_dtree_parameters,
                                         n_iter=60, cv=5, random_state=42)
# Search on the scaled training split only: the full X is unscaled and still
# contains the rows of X_test.
random_search_dtree.fit(X_train, y_train)

In [None]:
print(random_search_dtree.best_estimator_)
print(random_search_dtree.best_params_)

In [None]:
# Hyper-parameters taken from the random search above. presort and
# min_impurity_split no longer exist in current scikit-learn and
# max_features='auto' is spelled 'sqrt'; arguments that only restated
# defaults are omitted.
dtree_classifier = DecisionTreeClassifier(ccp_alpha=0.0, criterion='entropy',
                       max_depth=20, max_features='sqrt',
                       min_samples_leaf=3,
                       random_state=42, splitter='best')
# Train on the scaled training split only, so the rows of X_test stay unseen
# and the feature scale matches the one used for evaluation.
dtree_classifier.fit(X_train, y_train)

In [None]:
evaluate(dtree_classifier, X_test, y_test, X_train, y_train)

In [None]:
train_auc_roc_curve(dtree_classifier, X_test, y_test, X_train, y_train)

# Using Stratified Test data

In [None]:
stratified_test_data.head()

In [None]:
# Encode the test features with the category -> code mapping of the training
# rows. Fitting fresh encoders on the test set can assign a different code to
# the same category whenever a category is missing from one of the two sets.
test_categorical_columns = [
    "cap-shape", 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]
for feature in test_categorical_columns:
    le = LabelEncoder()
    # `data` still holds the raw categories; the training rows are looked up
    # through the row labels of stratified_train_data.
    le.fit(data.loc[stratified_train_data.index, feature])
    # transform() raises ValueError for a category never seen in training
    # instead of silently inventing a code for it.
    stratified_test_data[feature] = le.transform(stratified_test_data[feature])

**DELETING ALL THE UNWANTED COLUMNS AND ALSO DELETING THE 'veil-type' COLUMN AS IT IS USELESS FOR US**

In [None]:
# `columns=` replaces the positional axis argument of drop(labels, 1), which
# pandas 2.0 no longer accepts.
stratified_test_data = stratified_test_data.drop(
    columns=['cap-shape','cap-color','veil-type','veil-color', 'spore-print-color', 'ring-type']
)

In [None]:
stratified_test_data.head()

In [None]:
print(stratified_test_data.columns)

In [None]:
# Same combined feature as on the training data: add the two encoded
# stalk-surface columns.
test_above_ring = stratified_test_data['stalk-surface-above-ring']
test_below_ring = stratified_test_data['stalk-surface-below-ring']
stratified_test_data['stalk-surface'] = test_above_ring.add(test_below_ring)
stratified_test_data['stalk-surface']

In [None]:
# Drop the remaining columns that are not used as model features. `columns=`
# replaces the positional axis argument that pandas 2.0 no longer accepts.
stratified_test_data = stratified_test_data.drop(
    columns=['stalk-color-above-ring', 'stalk-color-below-ring','stalk-surface-above-ring','stalk-surface-below-ring','cap-surface','odor', 'gill-attachment', 'stalk-shape', 'ring-number']
)

In [None]:
print("Remaining Columns are \n: {}". format(stratified_test_data.columns))

In [None]:
column = ['bruises','gill-spacing','gill-size','gill-color','stalk-root','population','habitat','stalk-surface']
boxcox_transformation(stratified_test_data,column)

In [None]:
# Project the test features with the PCA that was fitted on the training
# features. A PCA fitted on the test set would define different axes, and the
# models were trained on the training components.
test_pca = pca
# Feed the columns in the order used at fit time when the fitted PCA recorded
# it (feature_names_in_); otherwise keep the current column order.
train_feature_order = list(getattr(test_pca, 'feature_names_in_', stratified_test_data.columns))
transformed_test_data = test_pca.transform(stratified_test_data[train_feature_order])
test_x = pd.DataFrame(data = transformed_test_data,
                      columns = ['PCA{}'.format(i + 1) for i in range(transformed_test_data.shape[1])],
                      index = stratified_test_data.index)
test_x

In [None]:
# Reuse the scaler fitted on X_train: the test features must be scaled with
# the training mean and standard deviation. Re-fitting on the test set would
# also overwrite `sc`.
test_x = sc.transform(test_x)
print(test_x)

### Testing Default Logistic Regression

In [None]:
# Score on the held-out stratified test set (test_x) rather than on the
# validation split X_test. The labels were dropped from stratified_test_data,
# so they are looked up again in `data` and encoded like the training target
# ('p' -> 0, 'e' -> 1).
test_y = data.loc[stratified_test_data.index, 'class'].replace({'p': 0, 'e': 1})
test_auc_ruc_curve(default_logistic_model, test_x, test_y)

### Testing Tuned Logistic Regression

In [None]:
# Score on the held-out stratified test set (test_x) rather than on the
# validation split X_test. The labels were dropped from stratified_test_data,
# so they are looked up again in `data` and encoded like the training target
# ('p' -> 0, 'e' -> 1).
test_y = data.loc[stratified_test_data.index, 'class'].replace({'p': 0, 'e': 1})
test_auc_ruc_curve(logistic, test_x, test_y)

### Default Decision tree classifier

In [None]:
# Score on the held-out stratified test set (test_x) rather than on the
# validation split X_test. The labels were dropped from stratified_test_data,
# so they are looked up again in `data` and encoded like the training target
# ('p' -> 0, 'e' -> 1).
test_y = data.loc[stratified_test_data.index, 'class'].replace({'p': 0, 'e': 1})
test_auc_ruc_curve(default_decision_tree_model, test_x, test_y)

### Testing Tuned Decision Tree 

In [None]:
# Score on the held-out stratified test set (test_x) rather than on the
# validation split X_test. The labels were dropped from stratified_test_data,
# so they are looked up again in `data` and encoded like the training target
# ('p' -> 0, 'e' -> 1).
test_y = data.loc[stratified_test_data.index, 'class'].replace({'p': 0, 'e': 1})
test_auc_ruc_curve(dtree_classifier, test_x, test_y)

**My Conclusion**

**Models applied:**

1. Logistic Regression (Default and Tuned)
2. Decision Tree Classifier (Default and Tuned)

* Default models run better than tuned models