# Assignment 1B: Basic Classification / Regression

Take the ODI dataset and load it. Alternatively, you can download a dataset of your own
choice from the web, and load that. If you opt for a downloaded dataset, write down
why that interests you, and why it is suited for classification/regression.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, plot_roc_curve
from sklearn.naive_bayes import CategoricalNB

In [2]:
# Load the mushroom dataset from the local data directory.
# The 'class' column is the target: p = poisonous, e = edible.
df = pd.read_csv(filepath_or_buffer='./data/mushrooms.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Label-encode every column: each distinct category string in a column is
# replaced by an integer code, using a fresh LabelEncoder per column.
encoded_columns = {
    name: LabelEncoder().fit_transform(values)
    for name, values in df.items()
}
# Rebuild the frame with the original row index and column order
df = pd.DataFrame(encoded_columns, index=df.index)

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [4]:
# Separate the attributes (X) from the label (y).
# Columns are selected by name rather than by hard-coded positions, so the
# split stays correct if the number or order of attribute columns changes.
X = df.drop(columns='class')
y = df['class']
X.head(), y.head()

(   cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
 0          5            2          4        1     6                1   
 1          5            2          9        1     0                1   
 2          0            2          8        1     3                1   
 3          5            3          8        1     6                1   
 4          5            2          3        0     5                1   
 
    gill-spacing  gill-size  gill-color  stalk-shape  ...  \
 0             0          1           4            0  ...   
 1             0          0           4            0  ...   
 2             0          0           5            0  ...   
 3             0          1           5            0  ...   
 4             1          0           4            1  ...   
 
    stalk-surface-below-ring  stalk-color-above-ring  stalk-color-below-ring  \
 0                         2                       7                       7   
 1                         2    

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Fit a decision tree that picks splits by information gain (entropy).
# random_state is pinned because scikit-learn permutes the features at every
# split, so equally good splits are otherwise chosen at random and the tree
# (and every plot/metric derived from it) can differ between runs.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)
clf = clf.fit(X_train, y_train)

TypeError: fit() got an unexpected keyword argument 'criterion'

In [None]:
# Plot the fitted tree with readable node labels.
# class_names must be given in ascending order of the encoded label:
# LabelEncoder sorts the raw values, so 0 = 'e' (edible), 1 = 'p' (poisonous).
plt.figure(figsize=(20,20))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=['Edible', 'Poisonous'],
    filled=True,
    fontsize=10,
)
# Render the figure explicitly so the cell does not echo the list of
# annotation objects returned by plot_tree.
plt.show()

In [None]:
# Export the fitted tree to Graphviz DOT and render it inline.
# class_names must follow the ascending order of the encoded label values.
# LabelEncoder sorts the raw labels, so 0 = 'e' (edible) and 1 = 'p'
# (poisonous); listing 'Poisonous' first would mislabel every node.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X.columns),
    class_names=['Edible', 'Poisonous'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Classify the held-out test set with the fitted decision tree and report
# how well it did (this is a classification task, not a regression).
y_predict_dt = clf.predict(X_test)
y_prob_pred_dt = clf.predict_proba(X_test)  # per-class probabilities for each test row

# Summary numbers: how many test rows were predicted wrongly, and the accuracy
count_misclassified = (y_test != y_predict_dt).sum()
accuracy = accuracy_score(y_test, y_predict_dt)

print("DecisionTree", "=" * 30, sep="\n")
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

# Per-class precision / recall / F1
print(classification_report(y_test, y_predict_dt))
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; RocCurveDisplay.from_estimator is the replacement when upgrading.
plot_roc_curve(clf, X_test, y_test)


In [None]:
# Fit a categorical Naive Bayes classifier on the same training split that
# was used for the decision tree.
cnb = CategoricalNB()
cnb.fit(X=X_train, y=y_train)