In [78]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [47]:
data=pd.read_csv("car.csv")
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [48]:
data.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,more,big,high,unacc
freq,432,432,432,576,576,576,1210


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [50]:
data.isnull().any()

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
class       False
dtype: bool

In [51]:
data['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [52]:
data['buying'].value_counts()

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

In [53]:
for i in data.keys():
    print(i)
    print(data[i].unique())
    print(data[i].value_counts())

buying
['vhigh' 'high' 'med' 'low']
vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
maint
['vhigh' 'high' 'med' 'low']
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
doors
['2' '3' '4' '5more']
2        432
5more    432
4        432
3        432
Name: doors, dtype: int64
persons
['2' '4' 'more']
more    576
2       576
4       576
Name: persons, dtype: int64
lug_boot
['small' 'med' 'big']
big      576
small    576
med      576
Name: lug_boot, dtype: int64
safety
['low' 'med' 'high']
high    576
med     576
low     576
Name: safety, dtype: int64
class
['unacc' 'acc' 'vgood' 'good']
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


In [54]:
y=data['class']

In [55]:
data.loc[data['doors']=='5more']='5'

In [56]:
data['doors'].unique()

array(['2', '3', '4', '5'], dtype=object)

In [57]:
data.loc[data['persons']=='more']='5'

In [58]:
num_cols=['doors','persons']
num_data=data[num_cols].apply(lambda x:pd.to_numeric(x))
num_data.head()

Unnamed: 0,doors,persons
0,2,2
1,2,2
2,2,2
3,2,2
4,2,2


In [59]:
data=data.drop('class',axis=1)

In [60]:
data=data.drop(num_cols,axis=1)

In [61]:
data.head()

Unnamed: 0,buying,maint,lug_boot,safety
0,vhigh,vhigh,small,low
1,vhigh,vhigh,small,med
2,vhigh,vhigh,small,high
3,vhigh,vhigh,med,low
4,vhigh,vhigh,med,med


In [62]:
data=pd.get_dummies(data)

In [63]:
data.head()

Unnamed: 0,buying_5,buying_high,buying_low,buying_med,buying_vhigh,maint_5,maint_high,maint_low,maint_med,maint_vhigh,lug_boot_5,lug_boot_big,lug_boot_med,lug_boot_small,safety_5,safety_high,safety_low,safety_med
0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0
1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1
2,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0
3,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1


In [64]:
data=pd.concat([data,num_data],axis=1)

In [65]:
data.columns

Index(['buying_5', 'buying_high', 'buying_low', 'buying_med', 'buying_vhigh',
       'maint_5', 'maint_high', 'maint_low', 'maint_med', 'maint_vhigh',
       'lug_boot_5', 'lug_boot_big', 'lug_boot_med', 'lug_boot_small',
       'safety_5', 'safety_high', 'safety_low', 'safety_med', 'doors',
       'persons'],
      dtype='object')

In [66]:
x=data.values

In [67]:
y=y.values

# Model

In [68]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
import plotLearningCurves

In [69]:
x_train,x_test,y_train,y_test=train_test_split(data,y,test_size=.25)

# Testing model based on cross validation score since the dataset is small


In [70]:
clf=GaussianNB()
score=cross_val_score(clf,x_train,y_train,cv=10)
print("GaussianNB: ",score.mean())

GaussianNB:  0.925173481334


In [71]:
clf=RandomForestClassifier()
score=cross_val_score(clf,x_train,y_train)
print("RandomForestClassifier: ",score.mean())

RandomForestClassifier:  0.956035768201


In [72]:
clf=SVC()
score=cross_val_score(clf,x_train,y_train)
print("SVM: ",score.mean())

SVM:  0.940631950316


In [73]:
clf=KNeighborsClassifier()
score=cross_val_score(clf,x_train,y_train)
print("KNN: ",score.mean())

KNN:  0.944450795238


# Grid Search

In [97]:
from sklearn.grid_search import GridSearchCV

In [105]:
params={'n_estimators': [10,15,20] , 'criterion':['entropy','gini'] , 'min_samples_split': [2,5,10]}
x_train,x_test,y_train,y_test= train_test_split(data,y,test_size=.25,random_state=10)
model=RandomForestClassifier()
clf=GridSearchCV(model,params)
clf.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 15, 20], 'min_samples_split': [2, 5, 10], 'criterion': ['entropy', 'gini']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [112]:
print(clf.score(x_test,y_test))

0.983796296296


In [114]:
print(clf.get_params())

{'verbose': 0, 'estimator__oob_score': False, 'estimator__min_impurity_decrease': 0.0, 'estimator__bootstrap': True, 'param_grid': {'n_estimators': [10, 15, 20], 'min_samples_split': [2, 5, 10], 'criterion': ['entropy', 'gini']}, 'cv': None, 'estimator__n_estimators': 10, 'fit_params': {}, 'pre_dispatch': '2*n_jobs', 'scoring': None, 'error_score': 'raise', 'n_jobs': 1, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__max_features': 'auto', 'estimator__min_impurity_split': None, 'estimator__max_depth': None, 'estimator__n_jobs': 1, 'estimator__criterion': 'gini', 'iid': True, 'estimator__verbose': 0, 'estimator__class_weight': None, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 1, 'refit': True, 'estimator__max_leaf_nodes': None, 'estimator__random_state': None, 'estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, mi