# Multiclass Classification

The dataset we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle such as the motor's displacement, the weight of the car, the miles per gallon, and how fast the car accelerates. Here are the columns in the dataset:

- `mpg` -- Miles per gallon, Continuous.
- `cylinders` -- Number of cylinders in the motor, Integer, Ordinal, and Categorical.
- `displacement` -- Size of the motor, Continuous.
- `horsepower` -- Horsepower produced, Continuous.
- `weight` -- Weight of the car, Continuous.
- `acceleration` -- Acceleration, Continuous.
- `year` -- Year the car was built, Integer and Categorical.
- `origin` -- Integer and Categorical. 1: North America, 2: Europe, 3: Asia.
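- `car_name` -- Name of the car. (Note: this column is not present in the `auto.csv` file loaded below, which contains only the eight columns listed above.)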

In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

### Exploring the Data

In [2]:
# Load the car dataset; "auto.csv" is resolved relative to the notebook's working directory.
cars = pd.read_csv("auto.csv")

In [3]:
cars.shape

(392, 8)

In [4]:
cars.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin'],
      dtype='object')

In [5]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [6]:
# Distinct origin codes, in order of first appearance in the data (not sorted).
unique_regions = pd.unique(cars["origin"])
unique_regions

array([1, 3, 2], dtype=int64)

### Using dummy variables for columns with categorical values

In [7]:
# One-hot encode the cylinder count (columns named cyl_<n>) and append the
# indicator columns to the right of the existing frame.
dummy_cyl = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat(
    [cars, dummy_cyl],
    axis="columns",
)

In [8]:
cars.shape

(392, 13)

In [9]:
# One-hot encode the model year (columns named year_<yy>) and append the
# indicator columns to the right of the existing frame.
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat(
    [cars, dummy_years],
    axis="columns",
)

In [10]:
cars.shape

(392, 26)

In [11]:
# The raw categorical columns are now redundant with their one-hot encodings,
# so remove both in a single call.
cars = cars.drop(["year", "cylinders"], axis=1)

In [12]:
cars.shape

(392, 24)

In [13]:
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### One-versus-all Multiclass Classification

In [16]:
# Shuffle the rows before splitting into train/test sets.
#
# - A dedicated, seeded RandomState makes the shuffle (and therefore every
#   result derived from the split) reproducible across "Restart & Run All",
#   without touching NumPy's global random state.
# - The permutation is over row *positions* (0..len-1) because `.iloc` is
#   positional; permuting `cars.index` labels is only equivalent when the
#   frame has a default RangeIndex.
SHUFFLE_SEED = 1
shuffled_rows = np.random.RandomState(SHUFFLE_SEED).permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

In [18]:
shuffled_cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
16,18.0,199.0,97.0,2774.0,15.5,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
208,19.0,156.0,108.0,2930.0,15.5,3,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
225,19.0,225.0,100.0,3630.0,17.7,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
190,22.0,250.0,105.0,3353.0,14.5,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
245,39.4,85.0,70.0,2070.0,18.6,3,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [22]:
# Row position that separates the first 70% of rows (training) from the rest (test).
ind = int(len(cars) * 0.70)
ind

274

In [23]:
# Positional split of the shuffled rows: everything before `ind` trains the
# models, everything from `ind` onward is held out for testing.
train, test = shuffled_cars.iloc[:ind], shuffled_cars.iloc[ind:]

In [24]:
train.shape

(274, 24)

In [25]:
test.shape

(118, 24)

### Training a multiclass Logistic Regression model

In [31]:
train.columns

Index(['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'origin',
       'cyl_3', 'cyl_4', 'cyl_5', 'cyl_6', 'cyl_8', 'year_70', 'year_71',
       'year_72', 'year_73', 'year_74', 'year_75', 'year_76', 'year_77',
       'year_78', 'year_79', 'year_80', 'year_81', 'year_82'],
      dtype='object')

In [32]:
# Predictors: only the one-hot cylinder indicators (3, 4, 5, 6, 8 cylinders)
# followed by the one-hot model-year indicators (70 through 82).
features = (
    ["cyl_{}".format(n) for n in (3, 4, 5, 6, 8)]
    + ["year_{}".format(yr) for yr in range(70, 83)]
)

In [33]:
# Distinct origin codes in ascending order; these become the model keys and
# the probability-table columns.
unique_origins = np.sort(cars["origin"].unique())
unique_origins

array([1, 2, 3], dtype=int64)

In [34]:
# One-versus-all training: fit one binary logistic regression per origin,
# where the positive class is "this car's origin equals `orig`".
lr_models = {}

# The feature matrix is the same for every model, so select it once.
X = train[features]

for orig in unique_origins:
    # Boolean target: True for rows from this origin, False for all others.
    y = train["origin"].eq(orig)

    lr = LogisticRegression()
    lr_models[orig] = lr.fit(X, y)  # fit() returns the fitted estimator itself

lr_models

{1: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 2: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 3: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

### Testing the models

In [35]:
# Empty table with one column per origin code; no rows yet.
testing_probs = pd.DataFrame(columns=pd.Index(unique_origins))
testing_probs

Unnamed: 0,1,2,3


In [38]:
# The held-out feature matrix is identical for every model, so select it once.
x_test = test[features]

for orig in unique_origins:
    # Each model was fit on a boolean target, so predict_proba's columns follow
    # classes_ = [False, True]; column 1 is P(origin == orig).
    # NOTE: the NumPy array is assigned positionally, so `testing_probs` ends up
    # indexed 0..n-1 rather than by `test`'s shuffled row labels.
    positive_class_probs = lr_models[orig].predict_proba(x_test)[:, 1]
    testing_probs[orig] = positive_class_probs

testing_probs

Unnamed: 0,1,2,3
0,0.958446,0.029247,0.031171
1,0.953498,0.027057,0.038820
2,0.810496,0.123791,0.052420
3,0.373997,0.301688,0.311468
4,0.260727,0.216309,0.525711
5,0.613106,0.179167,0.222829
6,0.354183,0.349193,0.286167
7,0.285043,0.322874,0.383896
8,0.957262,0.032730,0.030759
9,0.851432,0.035766,0.135649


### Classify Observations

In [39]:
# For each test row, predict the origin whose one-vs-all model reports the
# highest probability (the column label of the row-wise maximum).
predicted_origins = testing_probs.idxmax(axis="columns")
predicted_origins

0      1
1      1
2      1
3      1
4      3
5      1
6      1
7      3
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     3
18     1
19     1
20     1
21     1
22     1
23     3
24     1
25     1
26     3
27     3
28     3
29     3
      ..
88     2
89     3
90     1
91     2
92     1
93     1
94     1
95     1
96     1
97     3
98     1
99     1
100    1
101    2
102    2
103    1
104    2
105    1
106    1
107    3
108    1
109    2
110    1
111    1
112    3
113    1
114    3
115    3
116    3
117    1
Length: 118, dtype: int64