In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the mushroom dataset from the CSV file (path is relative to the
# notebook's working directory) into a pandas DataFrame.
mu_data = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [None]:
# Preview the first five rows to confirm the file was parsed as expected.
mu_data.head(n=5)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = mu_data.shape
print((n_rows, n_cols))

In [None]:
# List the column labels available in the dataset.
column_labels = mu_data.columns
print(column_labels)

In [None]:
# Summarise dtypes, non-null counts and memory usage.
# DataFrame.info() writes the summary itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
mu_data.info()

In [None]:
# Count the missing values in every column (isna is the alias of isnull).
missing_per_column = mu_data.isna().sum()
print(missing_per_column)

In [None]:
# Draw the boolean "is null" mask as a heat map so that missing cells,
# if there are any, stand out visually.
null_mask = mu_data.isnull()
sns.heatmap(data=null_mask)
plt.show()

In [None]:
# Show the leading five rows again for reference.
mu_data.iloc[:5]

In [None]:
# Number of distinct values observed in each column.
unique_per_column = mu_data.nunique()
print(unique_per_column)

In [None]:
# Keep the target column on its own as a one-column DataFrame and preview it.
mu_data1 = mu_data[['class']]
mu_data1.head(n=5)

In [None]:
# Tally how many rows carry each class label (e = edible, p = poisonous).
class_labels = mu_data['class']
c_count = class_labels.value_counts()
print(c_count)

In [None]:
# Bar chart of the target (dependent) variable: one bar per class label.
sns.set(rc={'figure.figsize': (7, 5)})
class_c = mu_data['class']
mu_data_count = sns.countplot(x=class_c, data=mu_data)
# Title the axes returned by countplot (the same axes plt.title would target).
mu_data_count.set_title("Class Distribution")

In [None]:
# Visualise how each independent variable relates to the dependent variable
def plot_feature(class_target, feature_set):
    """Draw one count plot per feature column, split by the target class.

    Parameters
    ----------
    class_target : pandas.Series
        Class label of every row; used as the ``hue`` of each plot.
    feature_set : pandas.DataFrame
        The feature columns to plot. Only these columns are drawn, so the
        target is never plotted against itself.

    Returns
    -------
    None
        The figures are left open for the notebook backend to render.
    """
    # Configure the theme once, before any figure exists, so that the
    # 12x9 size also applies to the very first figure.
    sns.set(style="whitegrid", rc={'figure.figsize': (12, 9)})
    for col in feature_set.columns:
        fig, ax = plt.subplots()
        sns.countplot(x=feature_set[col], hue=class_target, ax=ax)
        ax.set_title("Distribution of {} by target class".format(col))

In [None]:
# Render one count plot per feature, coloured by class.
class_c = mu_data['class']
feature_to_plot = mu_data.drop(columns='class')
plot_feature(class_target=class_c, feature_set=feature_to_plot)

In [None]:
# Separate the data into the target column and the remaining feature columns,
# then print the shape of each part (features first, target second).
target_class = mu_data['class']
feature_data = mu_data.drop(columns=['class'])
for part in (feature_data, target_class):
    print(part.shape)

In [None]:
# Import the encoders used for the target and feature variables
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Encode the target labels as integers with a LabelEncoder.
# The 'class' column of mu_data is overwritten in place with the encoded
# values; the later target-extraction cell reads that encoded column.
le = LabelEncoder()
le.fit(mu_data['class'])
mu_data['class'] = le.transform(mu_data['class'])
mu_data

In [None]:
# One-hot encode the categorical feature columns into indicator columns.
feature_data = pd.get_dummies(data=feature_data)
feature_data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every indicator column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling - confirm this is acceptable.
sc = StandardScaler()
sc.fit(feature_data)
feature_data_std = sc.transform(feature_data)

In [None]:
# Extract the encoded target as a flat 1-D array.
# scikit-learn estimators expect y with shape (n_samples,); a column vector of
# shape (n_samples, 1) triggers a DataConversionWarning on fit (which the
# global warnings filter at the top of the notebook would otherwise hide).
target_class = mu_data['class'].values
print(target_class.shape)

In [None]:
# Search for the train/test split random_state at which the logistic regression model scores highest on the held-out data
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics 
# Try every split seed in [42, 200) and remember the one whose 70/30 split
# yields the highest R^2 between predicted and true test labels.
# Start from -inf / None so that a "best" seed is always recorded, even when no
# split scores above zero (previously final_r_state stayed undefined in that
# case and the final print raised NameError).
# NOTE(review): selecting the seed by test-set score gives an optimistic
# estimate, and R^2 is a regression metric applied here to class labels.
max_r_score = float("-inf")
final_r_state = None
for r_state in range(42, 200):
    x_train, x_test, y_train, y_test = train_test_split(
        feature_data_std, target_class, random_state=r_state, test_size=0.30)
    log_reg = LogisticRegression()
    # fit() expects a 1-D label array; ravel is a no-op if y is already flat.
    log_reg.fit(x_train, np.ravel(y_train))
    y_pred = log_reg.predict(x_test)
    r2_scr = r2_score(y_test, y_pred)
    if r2_scr > max_r_score:
        max_r_score = r2_scr
        final_r_state = r_state
print("max r2 score corresponding to ",final_r_state," is ",max_r_score)

In [None]:
# Evaluate a 5-fold cross-validation score to check for over- or under-fitting.
from sklearn.model_selection import cross_val_score

# Store the fold scores under their own name: assigning them to
# `cross_val_score` would shadow the imported function for the rest of the session.
cv_scores = cross_val_score(
    LogisticRegression(), feature_data_std, np.ravel(target_class), cv=5, scoring="r2")
# The value is a mean R^2, not a percentage, so no "%" suffix is printed.
print("Cross Validation score for 5 iterations: {}".format(cv_scores.mean()))

# Compare this cross-validated mean with the single-split score obtained above:
# a large gap between the two would point to an over- or under-fitting problem,
# while similar values suggest the model generalises consistently across folds.

In [None]:
# Final model: a fixed 70/30 split (seed 42), a logistic regression fitted on
# the training part, and predictions for the held-out part.
x_train, x_test, y_train, y_test = train_test_split(
    feature_data_std,
    target_class,
    test_size=0.30,
    random_state=42,
)
log_reg = LogisticRegression().fit(x_train, y_train)
y_pred = log_reg.predict(x_test)

In [None]:
# Report the root-mean-squared error and the R^2 of the test predictions.
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
print("RMSE is: ", np.sqrt(mse))
print("r2_score is: ", r2_score(y_test, y_pred))

In [None]:
# Accuracy on the held-out split, as a percentage rounded to two decimals.
test_accuracy_pct = round(log_reg.score(x_test, y_test) * 100, 2)
print("*Test Accuracy: {}%".format(test_accuracy_pct))

In [None]:
# Confusion matrix of true vs. predicted test labels. The off-diagonal cells
# are the false positives / false negatives; when both are 0, every test
# mushroom was classified correctly as poisonous or not.
conf_mat = confusion_matrix(y_test, y_pred)
print("*Confusion Matrix: \n {}".format(conf_mat))

In [None]:
# Text report of the per-class precision, recall, F1 and support.
report_text = classification_report(y_test, y_pred)
print("Classification Report: \n {}".format(report_text))

In [None]:
# Yellowbrick's ClassificationReport visualizer shows the precision, recall,
# F1 and support of the model as a coloured grid.
from yellowbrick.classifier import ClassificationReport

report_estimator = LogisticRegression()
mu_viz = ClassificationReport(report_estimator, cmap='GnBu')
mu_viz.fit(x_train, y_train)      # train the wrapped estimator
mu_viz.score(x_test, y_test)      # evaluate on the held-out split
mu_viz.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# ROC analysis needs a continuous score per sample: hard 0/1 predictions
# collapse the curve to a single operating point. Use the predicted probability
# of the positive class instead.
# Column 1 of predict_proba is taken as the positive class - this assumes the
# binary 0/1 labels produced by the LabelEncoder cell.
y_true = np.ravel(y_test)
y_score = log_reg.predict_proba(x_test)[:, 1]

# Area under the ROC curve for LogisticRegression
Logistic_Reg_ROC_Curve = roc_auc_score(y_true, y_score)
print("ROC Score: ", Logistic_Reg_ROC_Curve)

# Define function to plot the ROC curve
def plot_roc_curve(roc_auc, fpr=None, tpr=None):
    """Plot a ROC curve together with the chance diagonal.

    Parameters
    ----------
    roc_auc : float
        Area under the curve; shown in the legend label.
    fpr, tpr : array-like, optional
        False / true positive rates to plot. When omitted, the notebook-level
        ``fp_rate`` / ``tp_rate`` arrays are used, so existing one-argument
        calls keep working.
    """
    fpr = fp_rate if fpr is None else fpr
    tpr = tp_rate if tpr is None else tpr
    plt.plot(fpr, tpr, color='blue', label='ROC = %0.2f' % roc_auc)
    # Diagonal reference line: a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], color='red', linestyle='-.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend()
    plt.show()

# Plot AUC_ROC curve
fp_rate, tp_rate, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fp_rate, tp_rate)
plot_roc_curve(roc_auc, fp_rate, tp_rate)

In [None]:
# Lastly, save the fitted model to disk for further use.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone `joblib` package is the supported replacement. The old
# import is kept only as a fallback for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(log_reg, 'Mushrooms_poisonous_nonpoisonous.pkl')