In [None]:
import warnings
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from yellowbrick.classifier import ROCAUC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve, accuracy_score,r2_score, confusion_matrix
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')

In [None]:
# Path of the mushroom CSV, relative to the notebook's working directory.
file_name = '../input/mushrooms.csv'

# Read the whole file into a DataFrame and preview its first five rows.
mushroom_df = pd.read_csv(filepath_or_buffer=file_name)
mushroom_df.head(n=5)

Taking a look at the distribution of the target variable `class`

In [None]:
# Count the rows per target label once, then look the counts up by label
# ('e' = edible, 'p' = poisonous) instead of by position: positional [0]/[1]
# on a string-indexed Series is deprecated in recent pandas and would silently
# swap the two numbers if poisonous ever became the more frequent class.
class_counts = mushroom_df['class'].value_counts()
print("The total Number of Observations: {}".format(mushroom_df.shape[0]))
print("Total Number of Edible Mushroom in the data set: {}".format(class_counts.get('e', 0)))
print("Total Number of Poisinous Mushroom in the data set: {}".format(class_counts.get('p', 0)))

In [None]:
# Bar chart of the number of rows per target label, drawn through the Axes
# object that pandas returns rather than the pyplot state machine.
target_ax = mushroom_df['class'].value_counts().plot.bar()
target_ax.set_title('Distribution of Target Class')
target_ax.set_ylabel("Number of Observation")
target_ax.set_xlabel("Frequency of Edible and Poisinious Mushroom")

Taking a look at the distribution of each variable except for the target class

In [None]:
# One bar chart per predictor column showing how often each category occurs.
# The target column 'class' is skipped here: this loop covers the predictors
# only, and the target distribution is plotted in its own cell.
feature_columns = [name for name in mushroom_df.columns if name != 'class']
for col in feature_columns:
    print(col)
    # A dedicated figure per column keeps each chart independent of pyplot's
    # "current axes" state.
    fig, ax = plt.subplots()
    mushroom_df[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title('Distribution {}'.format(col))
    ax.set_ylabel("Number of Observation")
    ax.set_xlabel("Frequency of {}".format(col))
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Print, for every column, its name followed by the frequency of each value.
for col, column_values in mushroom_df.items():
    print(col)
    print(column_values.value_counts())

Converting the categorical variables into dummy (one-hot) variables. One dummy column per variable has to be dropped in order to avoid the dummy variable trap.

In [None]:
# Separate the target column from the predictor columns.
y_data = mushroom_df['class']
X_data = mushroom_df.drop(columns=['class'])

In [None]:
# One-hot encode every predictor, dropping the first level of each variable.
# The predictors are taken straight from `mushroom_df` (instead of `X_data`)
# so this cell can be re-run after `X_data` has been rebound to a NumPy array
# further down. get_dummies prefixes the new columns with the source column
# names by default, so no explicit `prefix` is required.
X_data_dummy = pd.get_dummies(mushroom_df.drop(columns=['class']), drop_first=True)

In [None]:
# Preview the first five rows of the encoded predictors.
X_data_dummy.head(n=5)

In [None]:
# Encode the target as integers: poisonous ('p') -> 0, edible ('e') -> 1.
# The labels are read from `mushroom_df` so the cell is idempotent; the
# previous form re-mapped `y_data` in place and failed on a second run
# because `y_data` was by then a NumPy array without `.map`.
encoded_target = mushroom_df['class'].map({'p': 0, 'e': 1})
# `.map` turns any label missing from the dict into NaN; fail loudly instead.
assert encoded_target.notna().all(), "unexpected label in 'class' column"
y_data = encoded_target.values

In [None]:
# Rebind X_data to the underlying array of the one-hot encoded frame; from
# here on X_data no longer carries column labels.
X_data = X_data_dummy.values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic regression created with no arguments, i.e. the library defaults.
classifier = LogisticRegression()

In [None]:
# Train the logistic regression on the training split.
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
y_pred = classifier.predict(X=X_test)

In [None]:
# Instantiate the visualizer with a fresh logistic regression (independent of
# the `classifier` fitted above). `classes` provides the legend labels in the
# order of the sorted target values, and the target is encoded as
# 0 = poisonous, 1 = edible -- the previous [1, 0] labelled the curves the
# wrong way round.
visualizer = ROCAUC(LogisticRegression(), classes=['Poisonous', 'Edible'])

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
# Newer Yellowbrick releases expose `show()` in place of the older `poof()`;
# use `show()` when it exists and fall back to `poof()` otherwise.
render = getattr(visualizer, 'show', None) or visualizer.poof
g = render()                      # Draw/show the figure

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Transform to df for easier plotting. Rows are the true labels and columns
# the predicted labels, both in sorted order of the encoded target
# (0 = poisonous, 1 = edible).
cm_df = pd.DataFrame(cm,
                     index = ['Poisinous','Edible'], 
                     columns = ['Poisinous','Edible'])

plt.figure(figsize=(5.5,4))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' would
# abbreviate three-digit counts to scientific notation (e.g. 8.4e+02).
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Logistic Regression \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Support vector classifier with a linear kernel, trained on the training split.
clf = SVC(kernel='linear')
clf = clf.fit(X_train, y_train)

In [None]:
# Replace y_pred with the SVC's predictions for the held-out rows.
y_pred = clf.predict(X=X_test)

In [None]:
# Test-set accuracy of the predictions currently stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix for the predictions currently stored in y_pred.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Transform to df for easier plotting. Rows are the true labels and columns
# the predicted labels, both in sorted order of the encoded target
# (0 = poisonous, 1 = edible).
cm_df = pd.DataFrame(cm,
                     index = ['Poisinous','Edible'], 
                     columns = ['Poisinous','Edible'])

plt.figure(figsize=(5.5,4))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' would
# abbreviate three-digit counts to scientific notation (e.g. 8.4e+02).
sns.heatmap(cm_df, annot=True, fmt='d')
# This heatmap shows the linear-kernel SVC results, so the title names that
# model (it previously still read "Logistic Regression").
plt.title('SVC (linear kernel) \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# 5-fold cross-validation of a linear-kernel SVC over the full encoded data set.
clf = SVC(kernel='linear', C=1)
scores = cross_val_score(estimator=clf, X=X_data, y=y_data, cv=5)
scores

In [None]:
# Summarise the fold scores as the mean plus/minus two standard deviations.
mean_score = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_std))

In [None]:
# 10-fold cross-validation of the logistic regression over the full encoded
# data set, reported as the mean plus/minus two standard deviations.
scores = cross_val_score(estimator=classifier, X=X_data, y=y_data, cv=10)
mean_score = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_std))