In [1]:
import pandas as pd 

## Loading the dataset

In [2]:
df=pd.read_csv("car.data",names=["buying","maint","doors","persons","lug_boot","safety","labels"])

## Data exploration

### Checking amount of data

In [3]:
df.shape

(1728, 7)

### Checking for null or missing values

In [4]:
df.isnull().any()

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
labels      False
dtype: bool

### Exploration about classes to be classified or predicted

In [5]:
df["labels"].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

### Amount of data availaible for each class 

In [6]:
print("unacc -->",df[df["labels"]=="unacc"].shape)
print("acc-->",df[df["labels"]=="acc"].shape)
print("vgood-->",df[df["labels"]=="vgood"].shape)
print("good-->",df[df["labels"]=="good"].shape)

unacc --> (1210, 7)
acc--> (384, 7)
vgood--> (65, 7)
good--> (69, 7)


### Feature Exploration

In [7]:
print("buying-->",df["buying"].unique())
print(" ")
print("maint-->",df["maint"].unique())
print(" ")
print("doors-->",df["doors"].unique())
print(" ")
print("person-->",df["persons"].unique())
print(" ")
print("lug_boot-->",df["lug_boot"].unique())
print(" ")
print("safety-->",df["safety"].unique())

buying--> ['vhigh' 'high' 'med' 'low']
 
maint--> ['vhigh' 'high' 'med' 'low']
 
doors--> ['2' '3' '4' '5more']
 
person--> ['2' '4' 'more']
 
lug_boot--> ['small' 'med' 'big']
 
safety--> ['low' 'med' 'high']


## Transforming the Categorical features into number 

In [None]:
## extracting input and output features

In [8]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1:]

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
OneHotEncoder=OneHotEncoder()

In [11]:
OneHotEncoder.fit_transform(X)

<1728x21 sparse matrix of type '<class 'numpy.float64'>'
	with 10368 stored elements in Compressed Sparse Row format>

### One hot encoder vs Label encoder 

In [None]:
# One hot encoder  ## Input->["India","Nepal"] then it creates two collumns with India->[1,0] and nepal with[0,1] values 
## Label Encoder   ##Input-> ["India","Nepal"] then it return one array with [0,1] meaning 0 for India and 1 for Nepal 
## Label Encoder is used for cases EX: Input-->["low","medium","high"]  which returns us [0,1,2]
## where there is some kind of relationshiop between values
## so we will ditch One Hot Encoder in this case and use Labe Encoder for this 

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
LE=LabelEncoder()

In [14]:
import numpy as np ## importing numpy for saving of Encoders for new incoming test data
# np.load("safety.npy",allow_pickle=True) ## for encoding new test data

### Transforming Every Categorical Variables and saving an encoder for future inputs

In [15]:
X.loc[:,'safety']=LE.fit_transform(X["safety"])
np.save("safety.npy",LE.classes_)

X.loc[:,'buying']=LE.fit_transform(X["buying"])
np.save("buying.npy",LE.classes_)

X.loc[:,'lug_boot']=LE.fit_transform(X["lug_boot"])
np.save("lug_boot.npy",LE.classes_)

X.loc[:,'maint']=LE.fit_transform(X["maint"])
np.save("maint.npy",LE.classes_)

X.loc[:,'doors']=LE.fit_transform(X["doors"])
np.save("doors.npy",LE.classes_)

X.loc[:,'persons']=LE.fit_transform(X["persons"])
np.save("persons.npy",LE.classes_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


### Encoding classes 

In [16]:
Y["labels"]=LE.fit_transform(Y)
np.save("labels.npy",LE.classes_)

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Data before Encoding 

In [17]:
print("buying-->",df["buying"].unique())
print(" ")
print("maint-->",df["maint"].unique())
print(" ")
print("doors-->",df["doors"].unique())
print(" ")
print("person-->",df["persons"].unique())
print(" ")
print("lug_boot-->",df["lug_boot"].unique())
print(" ")
print("safety-->",df["safety"].unique())

buying--> ['vhigh' 'high' 'med' 'low']
 
maint--> ['vhigh' 'high' 'med' 'low']
 
doors--> ['2' '3' '4' '5more']
 
person--> ['2' '4' 'more']
 
lug_boot--> ['small' 'med' 'big']
 
safety--> ['low' 'med' 'high']


## Data after Encoding

In [18]:
print("buying-->",X["buying"].unique())
print(" ")
print("maint-->",X["maint"].unique())
print(" ")
print("doors-->",X["doors"].unique())
print(" ")
print("person-->",X["persons"].unique())
print(" ")
print("lug_boot-->",X["lug_boot"].unique())
print(" ")
print("safety-->",X["safety"].unique())

buying--> [3 0 2 1]
 
maint--> [3 0 2 1]
 
doors--> [0 1 2 3]
 
person--> [0 1 2]
 
lug_boot--> [2 1 0]
 
safety--> [1 2 0]


## Checking the Saved Encoders for future inputs

In [19]:
print("buying-->",np.load("buying.npy",allow_pickle=True))
print(" ")
print("maint-->",np.load("maint.npy",allow_pickle=True))
print(" ")
print("doors-->",np.load("doors.npy",allow_pickle=True))
print(" ")
print("person-->",np.load("persons.npy",allow_pickle=True))
print(" ")
print("lug_boot-->",np.load("lug_boot.npy",allow_pickle=True))
print(" ")
print("safety-->",np.load("safety.npy",allow_pickle=True))
print("///")
print("labels-->",np.load("labels.npy",allow_pickle=True))

buying--> ['high' 'low' 'med' 'vhigh']
 
maint--> ['high' 'low' 'med' 'vhigh']
 
doors--> ['2' '3' '4' '5more']
 
person--> ['2' '4' 'more']
 
lug_boot--> ['big' 'med' 'small']
 
safety--> ['high' 'low' 'med']
///
labels--> ['acc' 'good' 'unacc' 'vgood']


## Dividing the data into train and test

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train,X_test,y_train,y_test= train_test_split(X,Y,test_size=0.20,random_state=42)

## Diffrent Algorithms for classification

### Support Vector machines

In [25]:
from sklearn.svm import SVC

In [137]:
## You can try differene kernels for svm like linear rvf sigmoid and poly which tries to capture more information
## about data points in higher dimenstions without actually projecting them into higher dimension. 
# RBF has worked best for me in this  problem statement.

In [129]:
classifier_A=SVC(kernel='rbf',decision_function_shape='ovr',probability=True)

In [138]:
classifier_A.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [139]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [143]:
y_pred_svc=classifier_A.predict(X_test)

In [144]:
accuracy_score(y_test,y_pred_svc)

0.9132947976878613

In [3]:
##Confusin matrix is the preferred evaluation metric for multiclass classification.
# Working:
##              predicted class 
# ground truth  0[0   1]--> O predicted as 0 or 1 
#               1[0   1]--> 1 predicted as 0 or 1

#     likewise 
# 0->[0 1 2 3]
# 1->[0 1 2 3]
# 2->[0 1 2 3]
# 3->[0 1 2 3]

In [145]:
confusion_matrix(y_test, y_pred_svc)

array([[ 63,   0,  20,   0],
       [  5,   5,   0,   1],
       [  2,   0, 233,   0],
       [  2,   0,   0,  15]], dtype=int64)

### Random Forest Classifier

In [146]:
## tree based algorithms are suggested for catgorical data so let's try Random forest for this one.

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
classifier_B = RandomForestClassifier(random_state=42,criterion='gini',bootstrap=True)
classifier_B.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [118]:
y_pred_rf=classifier_B.predict(X_test)

In [119]:
accuracy_score(y_test,y_pred_rf)

0.9739884393063584

In [120]:
confusion_matrix(y_test, y_pred_rf)

array([[ 75,   6,   2,   0],
       [  0,  11,   0,   0],
       [  0,   0, 235,   0],
       [  1,   0,   0,  16]], dtype=int64)

In [122]:
## Once you have finalized the model just dump it for further predictions

In [151]:
import joblib

In [153]:
joblib.dump(classifier_B,"classifier.pkl")

['classifier.pkl']