In [21]:
import pandas as pd
import numpy as np
# Remote location of the auto dataset (CSV); kept in one named constant so the
# data source is easy to spot and swap.
AUTO_CSV_URL = "https://dsserver-prod-resources-1.s3.amazonaws.com/24/auto.csv?versionId=76hpAhUMzqghjulnvc1J1nPIgi49AwAT"

cars = pd.read_csv(AUTO_CSV_URL)

# Preview the first five rows to confirm the columns loaded as expected.
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [22]:
# Distinct origin codes, listed in first-seen order (the result is not sorted)
unique_regions = pd.unique(cars["origin"])
unique_regions

array([1, 3, 2], dtype=int64)

#### Creating dummy variables

In [23]:
# One-hot encode the two categorical columns: one indicator column per distinct
# value, named "cyl_<value>" and "year_<value>".
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
dummy_years = pd.get_dummies(cars["year"], prefix="year")

# Append the indicator columns (cylinders first, then years) and remove the
# original columns they replace.
# NOTE: this rebinds `cars`, so the cell cannot be re-run on its own -- the
# second run fails with a KeyError because "cylinders"/"year" are already gone.
cars = pd.concat([cars, dummy_cylinders, dummy_years], axis=1).drop(
    ["year", "cylinders"], axis=1
)
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Multiclass classification with one-versus-all method

* One-versus-all method: a technique where we choose a single category as the Positive case and group all of the remaining categories together as the Negative case. We're essentially splitting the problem into multiple binary classification problems, one per category. For each observation, every binary model outputs the probability that the observation belongs to that model's category.
* In the one-vs-all approach, we're essentially converting an n-class (in our case n is 3) classification problem into n binary classification problems. 



In [24]:
# Fixed seed so the shuffle -- and therefore the train/test split and every
# number derived from it -- is identical on each Restart & Run All.
SPLIT_SEED = 1
split_rng = np.random.RandomState(SPLIT_SEED)

# Permute row *positions* (0 .. n-1) rather than index labels: the result is
# passed to .iloc below, which is strictly positional. Permuting `cars.index`
# only happens to work while the index is the default 0 .. n-1 range.
shuffled_rows = split_rng.permutation(len(cars))

# DataFrame with its rows in shuffled order (each row keeps its original label)
shuffled_cars = cars.iloc[shuffled_rows]

# The first 70% of the shuffled rows train the models; the rest is held out.
highest_train_row = int(cars.shape[0] * .70)
train = shuffled_cars.iloc[:highest_train_row]
test = shuffled_cars.iloc[highest_train_row:]

# Sanity check: the split must cover every row exactly once.
assert len(train) + len(test) == len(cars)

In [25]:

from sklearn.linear_model import LogisticRegression

# Distinct origin codes in ascending order; one binary model is trained per code
unique_origins = np.sort(cars["origin"].unique())

# Feature columns: every dummy column created for cylinders and model year
features = [c for c in train.columns if c.startswith(("cyl", "year"))]

# The feature matrix is the same for every one-vs-all model, so select it once
X_train = train[features]

# Maps each origin code -> the logistic model fitted for "is this origin?"
models = {}

for origin in unique_origins:
    # Boolean target: True for rows of this origin, False for all other origins
    y_train = train["origin"] == origin

    model = LogisticRegression()
    model.fit(X_train, y_train)

    models[origin] = model


print(models)

{1: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 2: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 3: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)}




In [30]:
# Testing features are identical for every model, so select them once.
X_test = test[features]

# One column per origin code. Column 1 of predict_proba is the probability of
# the True class, i.e. that the row belongs to that origin.
# Rows are numbered 0 .. n-1 in the order of `test`; they do not carry the
# index labels of `test`.
testing_probs = pd.DataFrame(
    {origin: models[origin].predict_proba(X_test)[:, 1] for origin in unique_origins}
)

testing_probs.head(10)

Unnamed: 0,1,2,3
0,0.274457,0.138923,0.591288
1,0.838123,0.088172,0.074896
2,0.285947,0.379999,0.327604
3,0.834546,0.117958,0.052206
4,0.214343,0.310237,0.483431
5,0.946443,0.042916,0.02649
6,0.782968,0.165597,0.048454
7,0.946443,0.042916,0.02649
8,0.346233,0.339971,0.30092
9,0.540737,0.121382,0.351168


In [28]:
# Classify each observation as the origin whose one-vs-all model assigned it
# the highest probability.

# idxmax along the columns returns, for every row, the label of the column
# (the origin code) that holds that row's maximum value.
predicted_origins = testing_probs.idxmax(axis="columns")
predicted_origins.head(10)

0    3
1    1
2    2
3    1
4    3
5    1
6    1
7    1
8    1
9    1
dtype: int64