<a href="https://colab.research.google.com/github/sharmaprateek/scripts/blob/master/ML104/CodingtheShroomServiceClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up our environment

In the next two cells, we do things we're plenty familiar with by now: importing the needed libraries and mounting Google Drive.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Mounted at /content/drive


Now we're ready to load in the data and take a look.

In [3]:
file_string = "/content/drive/My Drive/quantic/mushroom_data.csv"
data = pd.read_csv(file_string)

# Data Exploration

In [4]:
data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,gill-color,stalk-shape,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,12,2,9,9,4,3,5,9,6,7
top,edible,convex,scaly,brown,no,buff,tapering,white,white,white,one,pendant,white,several,woods
freq,4208,3656,3244,2284,4748,1728,4608,4464,4384,7924,7488,3968,2388,4040,3148


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   class                   8124 non-null   object
 1   cap-shape               8124 non-null   object
 2   cap-surface             8124 non-null   object
 3   cap-color               8124 non-null   object
 4   bruises                 8124 non-null   object
 5   gill-color              8124 non-null   object
 6   stalk-shape             8124 non-null   object
 7   stalk-color-above-ring  8124 non-null   object
 8   stalk-color-below-ring  8124 non-null   object
 9   veil-color              8124 non-null   object
 10  ring-number             8124 non-null   object
 11  ring-type               8124 non-null   object
 12  spore-print-color       8124 non-null   object
 13  population              8124 non-null   object
 14  habitat                 8124 non-null   object
dtypes: o

In [6]:
# Null values check
data.isna().sum()

Unnamed: 0,0
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
gill-color,0
stalk-shape,0
stalk-color-above-ring,0
stalk-color-below-ring,0
veil-color,0


In [7]:
label = data.loc[:, 'class']
label.unique()

array(['poisonous', 'edible'], dtype=object)

# Preprocessing

In [8]:
data_to_encode = data.drop(columns=['class'])
data_to_encode

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,gill-color,stalk-shape,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,convex,smooth,brown,bruises,black,enlarging,white,white,white,one,pendant,black,scattered,urban
1,convex,smooth,yellow,bruises,black,enlarging,white,white,white,one,pendant,brown,numerous,grasses
2,bell,smooth,white,bruises,brown,enlarging,white,white,white,one,pendant,brown,numerous,meadows
3,convex,scaly,white,bruises,brown,enlarging,white,white,white,one,pendant,black,scattered,urban
4,convex,smooth,gray,no,black,tapering,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,knobbed,smooth,brown,no,yellow,enlarging,orange,orange,orange,one,pendant,buff,clustered,leaves
8120,convex,smooth,brown,no,yellow,enlarging,orange,orange,brown,one,pendant,buff,several,leaves
8121,flat,smooth,brown,no,brown,enlarging,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,knobbed,scaly,brown,no,buff,tapering,white,white,white,one,evanescent,white,several,leaves


In [9]:
X = pd.get_dummies(data_to_encode)
X.head()

Unnamed: 0,cap-shape_ knobbed,cap-shape_bell,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_sunken,cap-surface_fibrous,cap-surface_grooves,cap-surface_scaly,cap-surface_smooth,...,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,False,False,False,True,False,False,False,False,False,True,...,True,False,False,False,False,False,False,True,False,False
1,False,False,False,True,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,True,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,False,False,False,True,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(label)
y

array([1, 0, 0, ..., 0, 1, 0])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=13)

In [12]:
# Training set contains 70% of original data
print(X_train.shape)
print(y_train.shape)

# Test set contains 30% of original data
print(X_test.shape)
print(y_test.shape)

(5686, 88)
(5686,)
(2438, 88)
(2438,)


# Training

In [13]:
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression(random_state=13)

X_train = X_train.values
log_reg_model.fit(X_train, y_train)

In [14]:
print(log_reg_model.classes_)
print(log_reg_model.coef_)
print(log_reg_model.intercept_)
print(log_reg_model.n_features_in_)

[0 1]
[[ 3.86178409e-01 -1.13462227e+00  6.27915102e-01  5.60298023e-01
   7.34413988e-01 -1.41807911e+00 -1.13220227e+00  5.15852595e-01
  -2.60205595e-01  6.32659413e-01  1.82201870e-01  1.80155664e+00
  -1.24421512e+00  3.30109006e-01 -1.25276487e+00  2.00692884e+00
  -1.31136480e+00  2.22811493e-01  5.34130022e-01 -1.51328893e+00
   1.14756788e-01 -3.58652647e-01  5.68969967e-01 -8.39349614e-01
  -9.95208174e-01  7.39380931e+00 -4.55846574e-01 -1.23488745e+00
  -6.48487316e-01 -6.90674717e-01 -8.67200548e-01 -1.01548982e+00
  -1.09920288e+00 -3.60328048e-01  2.53867043e+00 -2.78256629e+00
   8.45318148e-01  9.39931808e-01  9.23507470e-01 -1.50971886e+00
  -1.70395731e+00 -4.67726531e-04 -7.06000426e-01 -1.76041725e-01
   1.14353276e+00 -9.70004332e-01  5.46194142e-01  9.23507470e-01
  -1.05780297e+00 -1.70395731e+00  3.56901706e-01 -4.45526303e-01
   5.70501176e-01  1.53629056e+00 -8.91290742e-01 -8.12666568e-01
   3.16528692e-01  1.14353276e+00  9.23507470e-01  1.87901532e+00
  -3

In [15]:
example = X_test.iloc[0,:]
w = log_reg_model.coef_
b = log_reg_model.intercept_
manual_prediction = 1 / (1 + np.exp(-(np.dot(w, example) + b)))
manual_prediction

array([0.57098354])

In [16]:
log_reg_model.predict_proba([example])

array([[0.42901646, 0.57098354]])

In [17]:
log_reg_model.predict([example])

array([1])

# Testing

In [19]:
y_pred = log_reg_model.predict(X_test.values)
y_proba = log_reg_model.predict_proba(X_test.values)
print(y_pred)
print(y_proba)

[1 1 0 ... 1 0 0]
[[4.29016462e-01 5.70983538e-01]
 [1.76112956e-04 9.99823887e-01]
 [9.99483925e-01 5.16075361e-04]
 ...
 [2.15961212e-02 9.78403879e-01]
 [9.92132540e-01 7.86746036e-03]
 [9.99825873e-01 1.74126665e-04]]


In [21]:
from sklearn.metrics import accuracy_score
total_num_test_records = len(y_test)
num_correct = accuracy_score(y_test, y_pred, normalize=False)
print(num_correct/total_num_test_records)

0.9901558654634947


In [None]:
log_reg_model # what method goes here?

# Deployment
Now that we know the model performs well, let's simulate how we might 'deploy' the model in a production-like environment. In this case, we'll make a small, simple function that Clayton's robot can use to classify foraged mushrooms in the field.  

If all goes well, Shroom Service will have a killer app on their hands!


In [None]:
def Fungolyzer9000(observations):
  cols = X.columns.tolist()
  df_obs = pd.DataFrame([observations])
  encoded_observation = pd.get_dummies(df_obs).reindex(columns=cols).fillna(0).values
  # remember poisonous is 1 and edible is 0
  result = log_reg_model.predict_proba(encoded_observation)[0][1] # first row, second column of result array is the poisonous probabilty
  return f"Probability this mushroom is poisonous is {round(result * 100, 3)}%."

In [None]:
mystery_mushroom_1 = {"cap-shape": "convex", "cap-surface": "smooth", "cap-color": "brown", "bruises": "bruises", "gill-color": "black", "stalk-shape": "enlarging", "stalk-color-above-ring": "white", "stalk-color-below-ring": "white", "veil-color": "white", "ring-number": "one", "ring-type": "pendant", "spore-print-color": "black", "population": "scattered", "habitat": "urban"}

answer = Fungolyzer9000(mystery_mushroom_1)
answer

In [None]:
mystery_mushroom_2 = {"cap-shape": "bell", "cap-surface": "smooth", "cap-color": "white", "bruises": "bruises", "gill-color": "brown", "stalk-shape": "enlarging", "stalk-color-above-ring": "white", "stalk-color-below-ring": "white", "veil-color": "white", "ring-number": "one", "ring-type": "pendant", "spore-print-color": "brown", "population": "numerous", "habitat": "meadows"}

answer = Fungolyzer9000(mystery_mushroom_2)
answer

In [None]:
mystery_mushroom_3 = {"cap-shape": "convex", "cap-surface": "scaly", "cap-color": "brown", "bruises": "bruises", "gill-color": "black", "stalk-shape": "enlarging", "stalk-color-above-ring": "white", "stalk-color-below-ring": "white", "veil-color": "white", "ring-number": "one", "ring-type": "pendant", "spore-print-color": "black", "population": "scattered", "habitat": "urban"}

answer = Fungolyzer9000(mystery_mushroom_3)
answer