In [20]:
#Data source https://github.com/buswedg/dataquest/blob/master/Machine%20Learning%20in%20Python/Multiclass%20Classification/data/auto.txt

#Import file and check unique values of regions
import pandas as pd
cars = pd.read_csv("auto.csv", sep = " ")

unique_regions = cars['origin'].unique()
print (unique_regions)

[1 3 2]


In [21]:
#Create dummy columns according to number of cylinders
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)
print(cars.head())

#Create dummy columns according to year column
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_years], axis=1)
cars.drop(labels = 'year', axis = 1, inplace = True)
cars.drop(labels = 'cylinders', axis = 1, inplace = True)
print (cars.head())

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0      130.0  3504.0          12.0    70   
1  15.0          8         350.0      165.0  3693.0          11.5    70   
2  18.0          8         318.0      150.0  3436.0          11.0    70   
3  16.0          8         304.0      150.0  3433.0          12.0    70   
4  17.0          8         302.0      140.0  3449.0          10.5    70   

   origin                   car_name  cyl_3  cyl_4  cyl_5  cyl_6  cyl_8  
0       1  chevrolet chevelle malibu      0      0      0      0      1  
1       1          buick skylark 320      0      0      0      0      1  
2       1         plymouth satellite      0      0      0      0      1  
3       1              amc rebel sst      0      0      0      0      1  
4       1                ford torino      0      0      0      0      1  
    mpg  displacement horsepower  weight  acceleration  origin  \
0  18.0         307.0      130.0  3504.

In [22]:
import numpy as np

#Shuffle dataset
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]

#Split data into training and testing set
train = shuffled_cars.iloc[0:int(np.floor(len(cars)*0.7))]
test = shuffled_cars.iloc[int(np.floor(len(cars)*0.7)):len(cars)]

In [37]:
#Train logistic model to classify origin given number of cylinders and year

from sklearn.linear_model import LogisticRegression

unique_origins = cars["origin"].unique()
unique_origins.sort()

models = {}

features = [c for c in train.columns if c.startswith("cyl") or c.startswith("year")]

x = train [features]

for i in unique_origins:
    models[i] = LogisticRegression()
    y = (train['origin'] == i)
    models[i].fit(x,y)




In [38]:
#Predict %chance that test entries came from each origin
testing_probs = pd.DataFrame(columns=unique_origins)

x = test [features]

for i in unique_origins:
    prediction = models[i].predict_proba(x)[:,1]
    testing_probs [i]= prediction


In [39]:
#Classify test entries by max %chance

predicted_origins = testing_probs.idxmax( axis = 1)
print (predicted_origins)

0      1
1      3
2      3
3      1
4      2
5      1
6      1
7      1
8      1
9      2
10     1
11     3
12     3
13     3
14     1
15     1
16     1
17     2
18     2
19     1
20     3
21     1
22     1
23     1
24     2
25     1
26     1
27     1
28     2
29     1
      ..
90     1
91     2
92     3
93     1
94     1
95     1
96     1
97     1
98     1
99     2
100    2
101    1
102    1
103    1
104    1
105    2
106    1
107    1
108    1
109    1
110    2
111    1
112    1
113    3
114    1
115    2
116    1
117    3
118    1
119    3
dtype: int64


In [26]:
train['origin']

386    1
249    1
127    1
198    3
23     2
20     2
130    1
228    1
285    1
125    1
225    1
186    2
48     1
344    1
131    3
67     1
256    1
333    3
64     1
114    2
169    1
162    1
50     2
31     3
44     1
110    3
276    2
232    1
123    3
298    1
      ..
395    1
215    1
181    3
68     1
92     1
93     1
0      1
233    2
207    2
160    1
220    3
253    1
52     2
219    1
306    1
301    1
13     1
74     1
177    2
72     1
89     1
237    1
199    1
98     1
103    1
312    3
106    1
348    3
132    1
205    3
Name: origin, dtype: int64

In [27]:
unique_origins

array([1, 2, 3], dtype=int64)