In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both frames are read from the same CSV file; each read_csv call returns its own
# independent DataFrame.
csv_path = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(csv_path)
df_test = pd.read_csv(csv_path)

# Data Set Information:

**This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family, drawn from The Audubon Society Field Guide to North American Mushrooms (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended; this last class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.**

# Attribute Information:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d



In [None]:
# (rows, columns) of the frame as loaded from mushrooms.csv.
df.shape

In [None]:
# Data type pandas assigned to each column when reading the CSV.
df.dtypes

In [None]:
# Print the frequency of every distinct value, one column at a time.
for column_name, column_values in df.items():
    print('Column Name : ' + str(column_name))
    print(column_values.value_counts())
    print('***************')
    

In [None]:
# Report every column whose observed values include the '?' placeholder.
for name in df.columns:
    observed = df[name].value_counts().index.astype('str')
    if (observed == '?').any():
        print('Missing Values in : ' + name)
    

In [None]:
# Encode the target as integers: 'p' -> 1, 'e' -> 0 (presumably poisonous / edible;
# confirm against the dataset description).
# The explicit astype(int) guarantees an integer dtype instead of relying on
# replace() silently downcasting the object column (deprecated since pandas 2.2),
# and it fails loudly if a label other than 'p'/'e' (or a missing value) is present.
# Re-running the cell is safe: already-encoded 0/1 values pass through unchanged.
df['class'] = df['class'].replace({'p': 1, 'e': 0}).astype(int)

In [None]:
# '?' is the placeholder this dataset uses for a missing stalk root; turn it into a
# real missing value so pandas' NaN-aware tools can detect it.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df['stalk-root'] = df['stalk-root'].replace({'?': np.nan})

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
def OneHotEncodeing(columns):
    """One-hot encode the given columns of the module-level ``df``.

    Each listed column is expanded with ``pd.get_dummies`` (new columns are
    prefixed with the source column name) and removed from ``df``. The global
    ``df`` is then rebound to a frame holding the dummy columns first, in the
    order the source columns were listed, followed by the untouched columns.

    The operation is all-or-nothing: if any column cannot be encoded (for
    example because it does not exist), a message naming the offending column
    is printed and ``df`` is left exactly as it was. Duplicate names in
    ``columns`` are encoded once.

    Parameters
    ----------
    columns : iterable
        Names of the columns to encode; each name is converted with ``str``.

    Returns
    -------
    None
        The result is stored in the global ``df``.
    """
    global df
    col = None  # defined up front so the error message works even if the loop never starts
    try:
        # De-duplicate while preserving order so a repeated name is not encoded twice.
        wanted = list(dict.fromkeys(str(name) for name in columns))
        encoded_parts = []
        for col in wanted:
            encoded_parts.append(pd.get_dummies(df[col], prefix=col))
        # Build the result on the side and rebind ``df`` only once everything
        # succeeded, so a failure cannot leave ``df`` half-converted.
        remaining = df.drop(columns=wanted)
        df = pd.concat(encoded_parts + [remaining], axis=1)
    except Exception:
        # Best-effort by design: report and keep the notebook running.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print('Error at : ', col)

In [None]:
# Every feature column is categorical, so all of them are one-hot encoded;
# only the 'class' target is left as is.
categorical_columns = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]
OneHotEncodeing(categorical_columns)

In [None]:
# Pairwise correlation of all columns; only the column describing the
# correlation with the 'class' target is kept.
correlation_matrix = df.corr()
df_corr_mat = correlation_matrix['class']

In [None]:
# Keep every column (other than the target itself) whose correlation with
# 'class' has an absolute value of at least the threshold.
corrleation_val = 0.3
feature_lst = [
    name
    for name, value in df_corr_mat.items()
    if name != 'class' and abs(value) >= corrleation_val
]

In [None]:
# Bar chart of the target correlation for each selected feature.
figure, ax = plt.subplots(figsize=(10, 8))
df_corr_mat[feature_lst].plot(kind='bar', ax=ax)

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every score computed
# from it) is reproducible across "Restart & Run All".
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
validate_end = int(.8 * len(shuffled))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [None]:
# For each split: X holds the selected feature columns, Y the 'class' target.
X_train, Y_train = train[feature_lst], train['class']
X_val, Y_val = validate[feature_lst], validate['class']
X_test, Y_test = test[feature_lst], test['class']

F1 score varying with C (regularization parameter in SVM)

In [None]:
# Candidate C values are produced by repeated halving, starting at 50
# (50, 25, 12.5, ...); the loop ends once the running value drops below 1e-4.
c = 100
# Deliberately not named `list`: that would shadow the builtin for the rest of
# the kernel session and break any later list(...) call.
c_values = []
while c >= 0.0001:
    c_values.append(c / 2)
    c = c - c / 2

# For every candidate C, fit on the training split and record the F1 score on
# both the validation and the training split.
score_list_val = []
score_list_train = []
C_list = []
for param in c_values:
    clf = LinearSVC(loss="hinge", C=param, max_iter=10000)
    clf.fit(X_train, Y_train)
    Y_predict_val = clf.predict(X_val)
    score_list_val.append(f1_score(Y_val, Y_predict_val))

    Y_predict_train = clf.predict(X_train)
    score_list_train.append(f1_score(Y_train, Y_predict_train))
    # C is stored as a string so it serves as a categorical label on the plot axis.
    C_list.append(str(param))

# One single-column frame per split, indexed by the C value.
df_train_score = pd.DataFrame(data=score_list_train, index=C_list, columns=['Train_F1_Score'])
df_val_score = pd.DataFrame(data=score_list_val, index=C_list, columns=['Validation_F1_Score'])

In [None]:
# Move the C values out of the index into a regular 'index' column so they can
# be referenced by name.
df_train_score = df_train_score.reset_index()
df_val_score = df_val_score.reset_index()

In [None]:
# Validation and training F1 scores drawn on one shared axis, one point per C value.
figure, ax = plt.subplots(figsize=(18, 5))
for scores, score_column in (
    (df_val_score, 'Validation_F1_Score'),
    (df_train_score, 'Train_F1_Score'),
):
    scores.plot(ax=ax, x='index', y=score_column, marker='o')
ax.set_xlabel("C parameter Values")
ax.set_title("F1_score VS parameter C")

In [None]:
# Validation F1 score for each candidate C (the C values sit in the 'index' column).
df_val_score

In [None]:
# Fit the classifier with the fixed C value on the training split, then predict
# the held-out test split.
C = 0.09765625

clf = LinearSVC(loss="hinge", C=C, max_iter=10000).fit(X_train, Y_train)
Y_predict = clf.predict(X_test)

In [None]:
# Test-split metrics: recall, precision and F1, followed by the confusion
# matrix rendered as an annotated heatmap.
recall_value = recall_score(Y_test, Y_predict)
precision_value = precision_score(Y_test, Y_predict)
f1_value = f1_score(Y_test, Y_predict)

print('recall_score is : ' + str(recall_value))
print('precision_score is : ' + str(precision_value))
print('f1_score is : ' + str(f1_value))

cnf_matrix_val = confusion_matrix(Y_test, Y_predict)
cnf_frame = pd.DataFrame(cnf_matrix_val)
sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu", fmt='g')

# Thank You