In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

# Mushroom Classification

<iframe src="https://giphy.com/embed/l0HlVnPVsBxctfm4o" width="480" height="320" frameBorder="0" class="giphy-embed" allowFullScreen></iframe><p><a href="https://giphy.com/gifs/cute-mushroom-mushy-l0HlVnPVsBxctfm4o">via GIPHY</a></p>

We are building a logistic regression model that can predict whether a mushroom is edible or poisonous based on several mushroom characteristics (Attribute information below). 

**Attribute Information:**

* Target Variable classes: edible=e, poisonous=p
* cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
* cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
* cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
* bruises: bruises=t,no=f
* odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
* gill-attachment: attached=a,descending=d,free=f,notched=n
* gill-spacing: close=c,crowded=w,distant=d
* gill-size: broad=b,narrow=n
* gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
* stalk-shape: enlarging=e,tapering=t
* stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
* stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* veil-type: partial=p,universal=u
* veil-color: brown=n,orange=o,white=w,yellow=y
* ring-number: none=n,one=o,two=t
* ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
* spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
* population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
* habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

In [None]:
# importing libraries and magic functions

import matplotlib.pyplot as plt
import seaborn as sns


from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to display several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"
# Render inline figures in the high-resolution 'retina' format
%config InlineBackend.figure_format ='retina'
# Draw matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the mushroom dataset from the Kaggle input directory
DATA_PATH = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(DATA_PATH)

# First look at the data: leading rows, dtypes / non-null counts, column names
df.head()
df.info()
df.columns

In [None]:
# `class` is a reserved keyword in Python, so the target column cannot be
# reached with attribute access (df.class). Rename it to `poison`.
df = df.rename({"class": "poison"}, axis="columns")

In [None]:
# Distribution of the target variable: class counts as a table, then as a bar chart
df['poison'].value_counts()
# Turn off the frame (box) around the current axes
plt.box(False)
# Pass the column by keyword. seaborn 0.12+ treats the first positional
# argument as `data`, not `x`, so countplot(df['poison']) no longer plots
# the per-class counts there.
sns.countplot(x='poison', data=df)
plt.title("Distribution of Target Variable", fontweight='bold')

The target variable is approximately equally distributed. This is important to ensure that the classification model is not biased toward one of the two classes.

In [None]:
# Number of missing values in each column
df.isnull().sum(axis=0)

# Rows that exactly repeat an earlier row (every column equal)
is_duplicate = df.duplicated()
duplicate_df = df[is_duplicate]
duplicate_df

In [None]:
# Number of distinct values observed in each column (attribute)
df.nunique(axis=0)

In [None]:
# veil-type reports a single distinct value above; list the values it takes
df.loc[:, 'veil-type'].unique()

Because every mushroom in the dataset has the same veil-type, this feature carries no information that could help separate edible from poisonous mushrooms, so we drop it from the dataframe.

In [None]:
# Remove the constant veil-type column from the dataframe
df = df.drop(columns=['veil-type'])

In [None]:
# test-train data split

from sklearn.model_selection import train_test_split

# Separate the feature columns (X) from the target column (y)
X = df.drop(columns=['poison'])
y = df['poison']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=29
)

# Report the shape of each split
for split_name, split_X, split_y in (
    ('Train', X_train, y_train),
    ('Test', X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

In [None]:
# Defining the encoders: categorical features -> ordinal integer codes, target -> integer labels

from sklearn.preprocessing import OrdinalEncoder
# prepare input data
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the features of both splits.

    The encoder is fitted on the training features only and then applied
    to both the training and the test features.

    Returns
    -------
    tuple
        (encoded training features, encoded test features)
    """
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)

from sklearn.preprocessing import LabelEncoder
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the target of both splits.

    The encoder is fitted on the training target only and then applied
    to both the training and the test target.

    Returns
    -------
    tuple
        (encoded training target, encoded test target)
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

In [None]:
# Encode the train and test splits as integers

# features: ordinal codes learned from the training features
X_train_enc, X_test_enc = prepare_inputs(X_train=X_train, X_test=X_test)
# target: label codes learned from the training target
y_train_enc, y_test_enc = prepare_targets(y_train=y_train, y_test=y_test)

In [None]:
# Feature Selection

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def select_features(X_train, y_train, X_test, k=7):
    """Keep the k features with the highest chi-squared score.

    The selector is fitted on the training data only and then applied to
    both splits, so the test set has no influence on which features are kept.

    Parameters
    ----------
    X_train : array-like
        Encoded training features (chi2 requires non-negative values).
    y_train : array-like
        Encoded training target.
    X_test : array-like
        Encoded test features, with the same columns as X_train.
    k : int or "all", default=7
        Number of top-scoring features to keep; forwarded to SelectKBest.

    Returns
    -------
    tuple
        (X_train_fs, X_test_fs, fs): the reduced training features, the
        reduced test features and the fitted SelectKBest instance.
    """
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)

# Pair every feature name with its chi-squared score. The columns of X are in
# the same order as the columns of the encoded training matrix.
dfscores = pd.DataFrame(fs.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # feature name, chi-squared score

# Print exactly the features the selector kept, instead of repeating k by hand
n_selected = int(fs.get_support().sum())
print(featureScores.nlargest(n_selected, 'Score'))

# Label the bars with the feature names rather than the integer row index
featureScores.plot(
    kind='bar',
    x='Specs',
    y='Score',
    legend=False,
    title='Chi-squared score per feature',
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier on the selected training features
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_fs, y_train_enc)

# Predict the held-out test set and score the predictions
yhat = lr.predict(X_test_fs)
accuracy = accuracy_score(y_true=y_test_enc, y_pred=yhat)

# Report accuracy as a percentage
print('Accuracy: %.2f' % (100 * accuracy))