In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id
0,p,x,s,n,t,p,f,c,n,k,...,w,w,p,w,o,p,k,s,u,0
1,e,x,s,y,t,a,f,c,b,k,...,w,w,p,w,o,p,n,n,g,1
2,e,b,s,w,t,l,f,c,b,n,...,w,w,p,w,o,p,n,n,m,2
3,p,x,y,w,t,p,f,c,n,n,...,w,w,p,w,o,p,k,s,u,3
4,e,x,s,g,f,n,f,w,b,k,...,w,w,p,w,o,e,n,a,g,4
5,e,x,y,y,t,a,f,c,b,n,...,w,w,p,w,o,p,k,n,g,5
6,e,b,s,w,t,a,f,c,b,g,...,w,w,p,w,o,p,k,n,m,6
7,e,b,y,w,t,l,f,c,b,n,...,w,w,p,w,o,p,n,s,m,7
8,p,x,y,w,t,p,f,c,n,p,...,w,w,p,w,o,p,k,v,g,8
9,e,b,s,y,t,a,f,c,b,g,...,w,w,p,w,o,p,k,s,m,9


In [4]:
test_df.head(10)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id
0,x,y,y,t,a,f,c,b,n,e,...,w,w,p,w,o,p,k,s,m,11
1,b,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,s,m,20
2,b,y,w,t,a,f,c,b,w,e,...,w,w,p,w,o,p,n,n,m,23
3,f,s,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,n,v,g,25
4,f,f,n,f,n,f,c,n,k,e,...,w,w,p,w,o,p,k,y,u,28
5,x,s,y,t,a,f,w,n,n,t,...,w,w,p,w,o,p,n,v,d,29
6,b,s,y,t,l,f,c,b,g,e,...,w,w,p,w,o,p,n,n,m,30
7,s,f,g,f,n,f,c,n,k,e,...,w,w,p,w,o,p,k,v,u,36
8,x,y,y,t,l,f,c,b,n,e,...,w,w,p,w,o,p,k,y,p,41
9,x,y,n,t,a,f,c,b,w,e,...,w,w,p,w,o,p,k,s,g,50


In [5]:
train_df['class'].value_counts()

e    2956
p    2701
Name: class, dtype: int64

The class distribution of the two classes are relevant balanced, so we don't need to rebalance data

In [6]:
train_df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
Id                          0
dtype: int64

In [7]:
test_df.isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
Id                          0
dtype: int64

We can see that, no field in any rows are null, 

we don't need extra processing on missing values

Next, we convert the one-character class features into number labels

In [8]:
from sklearn.preprocessing import LabelEncoder

for col in train_df.columns:
    if col == 'Id':
        continue
    labelencoder = LabelEncoder()
    labelencoder.fit(train_df[col])
    
    train_df[col] = labelencoder.transform(train_df[col])
    
    if col in test_df.columns:
        test_df[col] = labelencoder.transform(test_df[col])

In [9]:
train_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id
0,1,5,2,4,1,6,1,0,1,4,...,7,7,0,2,1,4,2,3,5,0
1,0,5,2,9,1,0,1,0,0,4,...,7,7,0,2,1,4,3,2,1,1
2,0,0,2,8,1,3,1,0,0,5,...,7,7,0,2,1,4,3,2,3,2
3,1,5,3,8,1,6,1,0,1,5,...,7,7,0,2,1,4,2,3,5,3
4,0,5,2,3,0,5,1,1,0,4,...,7,7,0,2,1,0,3,0,1,4


In [10]:
test_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id
0,5,3,9,1,0,1,0,0,5,0,...,7,7,0,2,1,4,2,3,3,11
1,0,2,9,1,0,1,0,0,4,0,...,7,7,0,2,1,4,3,3,3,20
2,0,3,8,1,0,1,0,0,10,0,...,7,7,0,2,1,4,3,2,3,23
3,2,2,8,1,6,1,0,1,5,0,...,7,7,0,2,1,4,3,4,1,25
4,2,0,4,0,5,1,0,1,4,0,...,7,7,0,2,1,4,2,5,5,28


In [11]:
train_df['class'].value_counts()

0    2956
1    2701
Name: class, dtype: int64

Now class 0 is e and class 1 is p.

In [13]:
train_df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id
count,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,...,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0,5657.0
mean,0.477462,3.365388,1.818101,4.503977,0.417889,4.131872,0.974898,0.162984,0.305993,4.790525,...,5.789818,5.780803,0.0,1.96606,1.068588,2.287255,3.581757,3.645925,1.497967,4028.688351
std,0.499536,1.599579,1.23236,2.54125,0.493255,2.083291,0.156448,0.369384,0.460867,3.516399,...,1.925187,1.923492,0.0,0.241358,0.269694,1.801903,2.370047,1.249596,1.724902,2334.652887
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,3.0,0.0,2.0,1.0,0.0,0.0,2.0,...,6.0,6.0,0.0,2.0,1.0,0.0,2.0,3.0,0.0,2007.0
50%,0.0,3.0,2.0,4.0,0.0,5.0,1.0,0.0,0.0,5.0,...,7.0,7.0,0.0,2.0,1.0,2.0,3.0,4.0,1.0,4012.0
75%,1.0,5.0,3.0,8.0,1.0,5.0,1.0,0.0,1.0,7.0,...,7.0,7.0,0.0,2.0,1.0,4.0,7.0,4.0,2.0,6042.0
max,1.0,5.0,3.0,9.0,1.0,8.0,1.0,1.0,1.0,11.0,...,8.0,8.0,0.0,3.0,2.0,4.0,8.0,5.0,6.0,8123.0


In [22]:
features = [column for column in train_df.columns]
features.remove('Id')
features.remove('class')
print('Total number of features = %d' % len(features))

Total number of features = 22


In [25]:
X_all = train_df[features]
y_all = train_df['class']

In [26]:
X_all.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [46]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=4)

In [47]:
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

In [48]:
model_RR.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [49]:
y_pred = model_RR.predict(X_val) # This will give you positive class prediction probabilities 
model_RR.score(X_val, y_val)

1.0

In [52]:
accuracy = 0
total = len(y_pred)
val_labels = y_val.values

for i in range(total):
    if y_pred[i] == val_labels[i]:
        accuracy = accuracy + 1

print('Accuracy = %d/%d' % (accuracy, total))

Accuracy = 1132/1132


### Crazily, 100% validation accuracy is obtained from Random Forest Model. This cannot be improved any more!

In [70]:
test_data = test_df[features]
test_pred = model_RR.predict(test_data)
test_pred

array([0, 0, 0, ..., 1, 1, 1])

In [71]:
test_df['Id'].head()

0    11
1    20
2    23
3    25
4    28
Name: Id, dtype: int64

In [72]:
class_map = {0: 'e', 1: 'p'}
test_pred = [class_map[i] for i in test_pred]
test_pred[:10]

['e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e']

In [73]:
test_df['class'] = test_pred
test_df.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Id,class
0,5,3,9,1,0,1,0,0,5,0,...,7,0,2,1,4,2,3,3,11,e
1,0,2,9,1,0,1,0,0,4,0,...,7,0,2,1,4,3,3,3,20,e
2,0,3,8,1,0,1,0,0,10,0,...,7,0,2,1,4,3,2,3,23,e
3,2,2,8,1,6,1,0,1,5,0,...,7,0,2,1,4,3,4,1,25,p
4,2,0,4,0,5,1,0,1,4,0,...,7,0,2,1,4,2,5,5,28,e


In [74]:
submission_df = test_df[['Id', 'class']]
submission_df.to_csv('submission.csv', index=False)