# Project 4_ Sci-kit based prediction <br>

## Saul Alas-Garcia and Ranju Thomas

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import sklearn.model_selection
import sklearn.linear_model
from sklearn import metrics

Creating the dataframe from the .data file in the url. THe filed names are obtained from .names in the same url

In [18]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 
                  sep = ',', 
                  header = None, 
                  usecols = [0,5,16,17,21, 22], 
                  names = ["Type", "Odor", "VielType", "VielColor", "Population","Habitat"])
df.head(5)

Unnamed: 0,Type,Odor,VielType,VielColor,Population,Habitat
0,p,p,p,w,s,u
1,e,a,p,w,n,g
2,e,l,p,w,n,m
3,p,p,p,w,s,u
4,e,n,p,w,a,g


**Converting Alphabets designation into integers from analysis**

In [19]:
# Converting alphabet values to numeric. The alphabet denotations as copied from the website's .names file
#Type (As mentiones in classes e means its edible and p is for poisonious)
df.replace(to_replace={"Type":{'e':0, 'p':1}}, inplace=True)
# Odor (almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s)
df.replace(to_replace={"Odor":{'a':0, 'l':1, 'c':2, 'y':3, 'f':4, 'm':5, 'n':6, 'p':7, 's':8}}, inplace=True)
# Viel Type (partial=p,universal=u)
df.replace(to_replace={"VielType":{'p':0, 'u':1}},inplace=True)
#Viel Color (brown=n,orange=o,white=w,yellow=y)
df.replace(to_replace={"VielColor":{'n':0, 'o':1, 'w':2, 'y':3}},inplace=True)
#Population (abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y)
df.replace(to_replace={"Population":{'a':0, 'c':1, 'n':2, 's':3, 'v':4, 'y':5}},inplace=True) 
#Habitat (grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d)
df.replace(to_replace={"Habitat":{'g':0, 'l':1, 'm':2, 'p':3, 'u':4, 'w':5, 'd':6}},inplace=True) 
df.head(5)

Unnamed: 0,Type,Odor,VielType,VielColor,Population,Habitat
0,1,7,0,2,3,4
1,0,0,0,2,2,0
2,0,1,0,2,2,2
3,1,7,0,2,3,4
4,0,6,0,2,0,0


Converting two columns into dummy varriables 

In [20]:
d_color = pd.get_dummies(df['VielColor'])
d_odor = pd.get_dummies(df['Odor'])
d_col = pd.concat([df, d_color, d_odor], axis = 1)
d_col.head()

Unnamed: 0,Type,Odor,VielType,VielColor,Population,Habitat,0,1,2,3,0.1,1.1,2.1,3.1,4,5,6,7,8
0,1,7,0,2,3,4,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,2,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,0,1,0,2,2,2,0,0,1,0,0,1,0,0,0,0,0,0,0
3,1,7,0,2,3,4,0,0,1,0,0,0,0,0,0,0,0,1,0
4,0,6,0,2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


Combining the dummy columns that were created previously 

In [21]:
mushroom_col = pd.concat([d_odor,d_color,df['Type']], axis=1)
mushroom_col.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,0.1,1.1,2.1,3.1,Type
0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,0,1,0,1
4,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [22]:
# counting data shapes
mushroom_col.shape

(8124, 14)

Testing the locations that will be grabbed 

In [23]:
m_col = pd.concat([d_odor,d_color,df['Type']], axis=1)
cols = list(df.iloc[:, :-1])
cols


['Type', 'Odor', 'VielType', 'VielColor', 'Population']

In [28]:
m_col = pd.concat([d_odor,d_color,df['Type']], axis=1)
cols2 = list(df.iloc[:, 1])
cols2[:5]   #showing only first 5 inorder to avoid long listing of all values

[7, 0, 1, 7, 6]

Creating the training model by defining x and y from the previous locations tested 

In [29]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, 1].values

X_learn, X_test, Y_learn, Y_test = sklearn.model_selection.train_test_split(X, Y, random_state=1)

We are trying to predict output using linear regression 

In [30]:
linreg = sklearn.linear_model.LinearRegression()
linreg.fit(X_learn, Y_learn)
Y_pred = linreg.predict(X_test)
a = [1, 0]
b = [1, 0]

print(sklearn.metrics.mean_absolute_error(a, b))
print(sklearn.metrics.mean_squared_error(a, b))
print(np.sqrt(sklearn.metrics.mean_squared_error(a, b)))

0.0
0.0
0.0


root mean squred error to find the margin of error

In [31]:
print(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))


9.115313594604583e-14


***Removing cap color to determine using sci-kit whether or not it can predict edibility:***

In [32]:
X = mushroom_col.iloc[:, 0:9].values
Y = mushroom_col.iloc[:, 1].values

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, random_state=1)
linreg.fit(X_train, Y_train)
Y_pred = linreg.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

1.5825422528853578e-15


***Removing Odor to determine using sci-kit whether or not it can predict edibility:***

In [33]:
X = mushroom_col.iloc[:, 10:18].values
Y = mushroom_col.iloc[:, 1].values

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, random_state=1)
linreg.fit(X_train, Y_train)
Y_pred = linreg.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

0.20727884581498132


**Since ODOR square root mean error is lesser than cap color square root mean error, ODOR feature can be used to predict edible or poisonous mushroom. To elaborate if we remove the odor there will be a bigger error in predicting wheter the mushroom edible or not. (This because while removing cap color the rms error is 1.5e^-15 which is .000068 and while removing odor the rms increases to .207 and indicates chances of error is high) So it's vital to have odor while predicting than the cap color. We can use same Square root mean error on other features to find out which has less mean error to find out which feature can be used to make an accurate prediction.**