In [None]:
# pandas
import pandas as pd 

# matplotlib
import matplotlib.pyplot as plt 
plt.style.use( 'ggplot')

# seaborn
import seaborn as sns
# numpy
import numpy as np 

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')


In [None]:
import logging
import os

# Root logger: every logging.debug(...) call in this notebook goes through it.
logger = logging.getLogger()

# Re-running this cell must not attach a second FileHandler for the same file,
# otherwise each record is written to test.log once per execution of the cell.
_log_path = os.path.abspath('test.log')
_already_attached = any(
    isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
    for handler in logger.handlers
)
if not _already_attached:
    fhandler = logging.FileHandler(filename='test.log', mode='a')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fhandler.setFormatter(formatter)
    logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

# Read Data

In [None]:
# Load the mushroom dataset from the relative input directory.
data = pd.read_csv('../input/mushroom-classification/mushrooms.csv')
# The message announces the shape, so log (rows, columns) rather than the first rows.
logging.debug("Print Shape of Dataset {} ".format(data.shape))

In [None]:
# List the distinct values found in every column of the raw frame.
for column_name, column_values in data.items():
    print(column_name, column_values.unique())

<!-- ### cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d  -->

# Exploratory Data Analysis (or EDA)

In [None]:
# Log the column index first, then leave it as the cell's last expression so
# the notebook actually displays it (a bare expression mid-cell shows nothing).
logging.debug("Columns In our Dataset {}:".format(data.columns))
data.columns

In [None]:
import io

# DataFrame.info() prints its report and returns None, so formatting its return
# value logs the text "None". Capture the report once in a buffer, then both
# show it and log it.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
_info_text = _info_buffer.getvalue()
print(_info_text)
logging.debug("Check data types of Columns {}".format(_info_text))

## Check for NULL Values

In [None]:
# Count missing values per column once, log the result, and leave it as the
# last expression so the notebook displays it.
null_counts = data.isnull().sum()
logging.debug("Check for NULL Values {}".format(null_counts))
null_counts

# Data Visualization

## Bar chart of mushroom cap shapes

In [None]:
# Readable names for the one-letter cap-shape codes used in the dataset.
cap_shape_labels = {'x': 'convex', 'f': 'flat', 'k': 'knobbed',
                    'b': 'bell', 's': 'sunken', 'c': 'conical'}

# value_counts() sorts by frequency; derive the labels from its index so each
# bar is always labelled with its own category instead of a hand-typed order.
cap_shape_counts = data['cap-shape'].value_counts()
cap_shapes = cap_shape_counts.tolist()
# An unexpected code falls back to the raw code instead of raising.
shape_names = [cap_shape_labels.get(code, code) for code in cap_shape_counts.index]
colors = ['brown','lightcoral','darkorange','lime','navy','red']

plt.figure(figsize=(8,6))
plt.bar(shape_names,cap_shapes,color = colors);
plt.xlabel('Mushrooms Cap Shape');
plt.ylabel("Count");
plt.title("Count of Mushrooms Cap shape");
logging.debug("Barchart for Mushrooms Cap Shape")

## Mushrooms Cap Color

In [None]:
# Readable names and bar fills keyed by the dataset's one-letter cap-color codes,
# so the label and colour of each bar follow the data rather than a fixed order.
cap_color_labels = {'n': 'brown', 'g': 'gray', 'e': 'red', 'y': 'yellow', 'w': 'white',
                    'b': 'buff', 'p': 'pink', 'c': 'cinnamon', 'u': 'purple', 'r': 'green'}
cap_color_fills = {'n': 'brown', 'g': 'gray', 'e': 'red', 'y': 'yellow', 'w': 'mistyrose',
                   'b': 'tomato', 'p': 'pink', 'c': 'violet', 'u': 'purple', 'r': 'green'}

cap_color_counts = data['cap-color'].value_counts()
cap_color = cap_color_counts.tolist()
# Unknown codes keep their raw code as label and get a neutral fill.
color_name = [cap_color_labels.get(code, code) for code in cap_color_counts.index]
colors = [cap_color_fills.get(code, 'gray') for code in cap_color_counts.index]

plt.figure(figsize=(9,6))
plt.bar(color_name,cap_color,color = colors);
plt.ylabel('Counts');
plt.xlabel("Mushroom Cap Color");
plt.title("Mushrooms Cap Color");
logging.debug("Barchart for Mushrooms Cap Color")

## Number of edible and poisonous mushrooms by cap color

In [None]:
# Cap-color codes in the same order as the display names in color_name.
cap_color_codes = ['n', 'g', 'e', 'y', 'w', 'b', 'p', 'c']
color_name = ['brown', 'gray','red','yellow','white','buff','pink','cinnamon']

# value_counts() sorts each class by its own frequencies, so the two raw lists
# are not aligned with each other or with color_name. Reindexing both on one
# fixed code order puts entry k of both lists on the colour color_name[k];
# a colour absent from a class counts as 0.
# NOTE(review): purple ('u') and green ('r') are left out to keep the eight
# entries the original produced by slicing off the last two edible counts.
edible_mushrooms = (
    data.loc[data['class'] == 'e', 'cap-color']
    .value_counts()
    .reindex(cap_color_codes, fill_value=0)
    .tolist()
)
non_edible_mushrooms = (
    data.loc[data['class'] == 'p', 'cap-color']
    .value_counts()
    .reindex(cap_color_codes, fill_value=0)
    .tolist()
)

In [None]:
# Grouped bar chart: one edible/poisonous pair of bars per cap colour.
fig, ax = plt.subplots(figsize=(12,7))
# One group per colour name instead of a hard-coded count.
index = np.arange(len(color_name))
bar_width = 0.35
edible_bars = ax.bar(index, edible_mushrooms, bar_width,color='b',label='edible')
poison_bars = ax.bar(index+bar_width,non_edible_mushrooms ,bar_width, color='r',label='poisonous')
# The bars are centred on index and index + bar_width, so the middle of each
# pair is index + bar_width / 2; put the tick label there.
plt.xticks(index + bar_width / 2, tuple(color_name));
plt.title("Edible and Poisonous Mushrooms based on Cap Color",{'fontsize':25})
plt.xlabel("Cap Color");
plt.ylabel("Count")
plt.legend();
logging.debug("Barchart for edible and poisonous mushrooms based on Cap Color")

In [None]:
# Readable names for the one-letter odor codes used in the dataset.
odor_labels = {'n': 'no smell', 'f': 'foul', 's': 'spicy', 'y': 'fishy', 'l': 'anise',
               'a': 'almond', 'p': 'pungent', 'c': 'creosote', 'm': 'musty'}

# Derive the labels from the value_counts() index: with a hand-typed list, any
# two categories with equal counts could swap places and be mislabelled.
odor_counts = data['odor'].value_counts()
odor_val = odor_counts.tolist()
odors = [odor_labels.get(code, code) for code in odor_counts.index]
clrs = ['magenta','deeppink','cyan','gold','crimson','chocolate','springgreen','blue','red']

plt.figure(figsize=(9,6))
plt.bar(odors,odor_val,color = clrs);
plt.xlabel('Mushrooms Odor');
plt.ylabel("Count");
plt.title("Distribution of mushrooms based on odor",{'fontsize':20});
# This chart is about odor, so say so in the log.
logging.debug("Distribution of mushrooms based on odor")

In [None]:
# Readable names for the one-letter population codes used in the dataset.
population_labels = {'v': 'several', 'y': 'solitary', 's': 'scattered',
                     'n': 'numerous', 'a': 'abundant', 'c': 'clustered'}

# Take the labels from the value_counts() index so each slice carries its own name.
population_counts = data.population.value_counts()
populations = population_counts.tolist()
labels = [population_labels.get(code, code) for code in population_counts.index]
# Pull out the 'solitary' slice wherever it lands in the ordering.
explode = tuple(0.1 if name == 'solitary' else 0 for name in labels)

plt.figure(figsize=(8,8))
plt.pie(populations,labels=labels,startangle=150,explode=explode,autopct='%1.1f%%');
plt.title('Mushrooms Population',{'fontsize':25})
plt.legend(bbox_to_anchor=(1.2,1),loc="upper right",fontsize=14);
logging.debug("Piechart for Mushrooms Population")

In [None]:
# Population-category frequencies among the poisonous ('p') rows only.
data.loc[data['class'] == 'p', 'population'].value_counts()

## Distribution of poisonous mushrooms based on population

In [None]:
# Readable names for the one-letter population codes used in the dataset.
population_labels = {'v': 'several', 'y': 'solitary', 's': 'scattered',
                     'n': 'numerous', 'a': 'abundant', 'c': 'clustered'}

# Only the categories that actually occur among poisonous mushrooms appear in
# value_counts(); label each slice from its own code rather than a fixed list,
# which would silently mislabel a slice if the categories present differ.
poisonous_population = data.loc[data['class'] == 'p', 'population'].value_counts()
poisonous_labels = [population_labels.get(code, code) for code in poisonous_population.index]
# Pull out the last (smallest) slice, whatever the number of categories.
n_slices = len(poisonous_population)
poisonous_explode = [0.55 if position == n_slices - 1 else 0 for position in range(n_slices)]

plt.figure(figsize=(8,7))
plt.pie(poisonous_population.tolist(),
        autopct='%1.1f%%',
        labels = poisonous_labels,
        colors=['gold','cyan','royalblue','hotpink'],
        explode = poisonous_explode
        );
plt.title('Distribution of poisounous mushrooms based on population',{'fontsize':25});
plt.legend(bbox_to_anchor=(1.2,1),loc="upper right",fontsize=15);

## Edible mushrooms based on population

In [None]:
# Readable names for the one-letter population codes used in the dataset.
population_labels = {'v': 'several', 'y': 'solitary', 's': 'scattered',
                     'n': 'numerous', 'a': 'abundant', 'c': 'clustered'}

# Label every slice from the value_counts() index so the names cannot drift
# away from the counts they describe.
edible_population = data.loc[data['class'] == 'e', 'population'].value_counts()
edible_labels = [population_labels.get(code, code) for code in edible_population.index]

plt.figure(figsize=(8,7))
plt.pie(edible_population.tolist(),
        autopct='%1.1f%%',
        labels = edible_labels,
        colors=['lightcoral','orange','lime','deepskyblue','yellow','pink']
        );
plt.title('Distribution of edible mushrooms based on population',{'fontsize':25});
plt.legend(bbox_to_anchor=(1.2,1),loc="upper right",fontsize=13);

## Habitat

In [None]:
# Readable names for the one-letter habitat codes used in the dataset.
habitat_labels = {'d': 'woods', 'g': 'grasses', 'p': 'paths', 'l': 'leaves',
                  'u': 'urban', 'm': 'meadows', 'w': 'waste'}

# Labels follow the value_counts() index so every bar is named after its own category.
habitat_counts = data['habitat'].value_counts()
habitats = habitat_counts.tolist()
habitats_names = [habitat_labels.get(code, code) for code in habitat_counts.index]
colors = ['orange','lime','deepskyblue','magenta','crimson','blue','red']

plt.figure(figsize=(8,6))
plt.title("Mushrooms based on Habitat",{'fontsize':25});
# Axis labels so the figure can be read on its own.
plt.xlabel("Habitat");
plt.ylabel("Count");
plt.bar(habitats_names,habitats,color = colors);

In [None]:
# Preview the first five rows of the raw frame before encoding.
data.head(n=5)

# Data Preprocessing

In [None]:
# Single LabelEncoder instance; the feature-encoding loop further down calls
# encoder.fit_transform on each categorical column in turn.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
logging.debug("Import LabelEncoder and create object.")

## Dependent and Independent Features

In [None]:
# Target: the 'class' column. Features: every other column of the raw frame.
y = data['class']
X = data.drop(columns='class')
logging.debug("Split data int dependent(y) and Independent features(X)")

In [None]:
def func(df):
    """Encode one class label as an integer.

    Args:
        df: A single value from the 'class' column (despite the name, a scalar
            label rather than a DataFrame).

    Returns:
        1 when the label equals 'p', otherwise 0.
    """
    return 1 if df == 'p' else 0
# Encode from the untouched raw column rather than from y itself: applying
# func to an already-encoded y maps every value to 0, so the original form
# silently destroyed the target if this cell was run twice.
y = data['class'].apply(func)
logging.debug("Function for dependent feature to convert categorical values to numeric values(If class is p set to 1 and for e set 0)")

In [None]:
# Names of the feature columns stored with dtype object (the categorical ones).
objList = X.select_dtypes(include=["object"]).columns

In [None]:
# Integer-encode every categorical feature.
# The encoder is refit for each column, which overwrites its classes_, so keep
# each column's code -> category list in label_classes to make the encoding
# reversible later. Reading the values from the raw frame (X was built from
# data by dropping 'class') keeps the cell idempotent when it is re-run.
label_classes = {}
for feat in objList:
    X[feat] = encoder.fit_transform(data[feat])
    label_classes[feat] = list(encoder.classes_)
logging.debug("Convert categorical features into numeric.")

In [None]:
# Distinct raw (letter-coded) values per column, for comparison with the encoded frame.
for column_name, column_values in data.items():
    print(column_name, ":", column_values.unique())

In [None]:
# Distinct values per feature column after encoding.
for column_name, column_values in X.items():
    print(column_name, ":", column_values.unique())

## To estimate how well a model will predict, split the data into a training set and a testing set: the model is fit on the training set and then evaluated on the held-out testing set, which it never sees during training. You can repeat this train-and-evaluate cycle while tuning the model until the results are good.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
_split_parts = train_test_split(X, y, test_size=0.2, random_state=43)
X_train, X_test, y_train, y_test = _split_parts
logging.debug("Split data into train data and test data(validation set).")

# Models

In [None]:
# Mapping of estimator -> display name. The estimator is the key, so callers
# iterate with `for model, name in models.items()`.
models = {
    SVC(kernel='rbf',C= 1.0,gamma=0.8):'Support Vector Machine',
    # Seed the forest so the reported accuracy is reproducible across runs.
    RandomForestClassifier(max_depth=8,n_estimators=120,random_state=43):'Random Forest',
    XGBClassifier(eval_metric  = 'error',booster='gbtree'):'XGBClassifier'
}
for m in models.keys():
    m.fit(X_train,y_train)
# Only the training split is used for fitting; the test split is kept for scoring.
logging.debug("Create models and fit model based on train data.")

# Interpret

## To determine how well a model is performing, we often validate its performance on new unseen instances that were not available to the model during training

In [None]:
# Announce the section once instead of once per model.
logging.debug("Accuracy of Each model")
for model,name in models.items():
    # Score once and reuse the value: score() re-predicts the whole test set,
    # so calling it for both print and log doubles the work.
    accuracy_pct = model.score(X_test,y_test)*100
    print(f"Accuracy Score for {name} is : ",accuracy_pct,"%")
    logging.debug("Accuracy Score for {} is {}%".format(name,accuracy_pct))

# Feature Importance

## XGBOOST Classifier

In [None]:
# Fit a fresh XGBoost classifier on the full feature set and plot how much
# each feature contributes according to the model's feature_importances_.
xgboost = XGBClassifier(eval_metric  = 'error',booster='gbtree')
xgboost.fit(X_train,y_train)
feature_imp1 = xgboost.feature_importances_

plt.figure(figsize=(9,7))
sns.barplot(x=feature_imp1, y=X.columns)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features For XGBClassifier",{'fontsize':25})
plt.show();

# Feature name -> importance. These come from the XGBoost model above, so the
# log message names XGBClassifier rather than Random Forest.
feature_dict = dict(zip(X.columns,feature_imp1))
logging.debug("Feature Importance for XGBClassifier : {}".format(feature_dict))

## We can see here that many features are not very important for making predictions, so we remove them

In [None]:
# Work on an independent (deep) copy so the encoded feature frame X stays intact.
mush_df = X.copy(deep=True)

In [None]:
# Features removed before building the reduced model.
_dropped_features = [
    'cap-shape',
    'gill-attachment',
    'gill-spacing',
    'veil-type',
    'veil-color',
    'stalk-surface-above-ring',
    'ring-number',
    'ring-type',
]
new_X = mush_df.drop(columns=_dropped_features)

In [None]:
# Show which feature columns remain after the drop.
new_X.columns

In [None]:
# Same 80/20 split and seed as before, applied to the reduced feature set.
_reduced_split = train_test_split(new_X, y, test_size=0.2, random_state=43)
new_Xtrain, new_Xtest, new_ytrain, new_ytest = _reduced_split

## Check accuracy after removing some features.

In [None]:
# Estimator -> display name, refit on the reduced feature set.
models = {
    SVC(kernel='rbf',C= 1.0,gamma=0.8):'Support Vector Machine',
    # Seed the forest so the before/after comparison does not vary between runs.
    RandomForestClassifier(random_state=43):'Random Forest',
    XGBClassifier(eval_metric  = 'error',booster='gbtree'):'XGBClassifier'
}
for m in models.keys():
    m.fit(new_Xtrain,new_ytrain)
for model,name in models.items():
     print(f"Accuracy Score for {name} is : ",model.score(new_Xtest,new_ytest)*100,"%")
## Plot Heatmaps for all models

In [None]:
# Draw one confusion-matrix heat map per fitted model.
class_names = [0,1]

for model,name in models.items():
    y_pred = model.predict(new_Xtest)
    # Fix the label set so the matrix stays 2x2 even if one class is never
    # predicted or never present in the test split.
    cnf_matrix = confusion_matrix(new_ytest,y_pred,labels=class_names)

    # A fresh figure/axes per model: with a single fig/ax created before the
    # loop, plt.show() ends the first figure, later heat maps are drawn on
    # implicit new figures, and the axis tweaks keep hitting the stale axes.
    fig,ax = plt.subplots()
    sns.heatmap(pd.DataFrame(cnf_matrix,index=class_names,columns=class_names),
                annot = True, cmap = 'YlGn', fmt = 'g', ax = ax)
    ax.xaxis.set_label_position('top')
    ax.set_title(f'Heat Map for {name}', fontdict={'fontsize':20})
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()
    plt.show()

# Save Model

In [None]:
# Final model, trained on the reduced feature set. Seeding the forest makes
# its predictions and any metric computed from them reproducible.
forest = RandomForestClassifier(random_state=43)
forest.fit(new_Xtrain,new_ytrain)

In [None]:
# Predict on the held-out rows of the reduced feature set; this replaces the
# y_pred left over from the heat-map loop.
y_pred = forest.predict(new_Xtest)

# F1 Score
## The F1 score is the harmonic mean of precision and recall: $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$

In [None]:
import pickle
from pathlib import Path

from sklearn.metrics import f1_score

# y_pred was produced from new_Xtest, so score it against the labels of that
# same split (new_ytest) rather than the labels of the earlier split.
print(f1_score(new_ytest,y_pred))

# Persist the trained forest next to the notebook. The previous code only
# tried to load from an absolute Windows path (with an invalid '\D' escape),
# never wrote the file, and used pickle without importing it.
# NOTE(review): the relative location is a choice made here - adjust
# MODEL_PATH if the model must live elsewhere.
MODEL_PATH = Path('mushroom.sav')
with open(MODEL_PATH,'wb') as f:
    pickle.dump(forest,f)

# Read the model back to confirm the file round-trips.
# Security: pickle.load can execute arbitrary code - only load files you wrote.
with open(MODEL_PATH,'rb') as f:
    mp = pickle.load(f)