In [18]:
import pandas as pd
import numpy as np
import pickle as p

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Compile the training data

In [2]:
meta_data = pd.read_csv('/home/ec2-user/training_data_meta.csv')

In [3]:
#Join cleaned class data with training metadata

class_data = pd.read_csv('/home/ec2-user/stanford_labels_cleaned.csv')

In [4]:
meta_data = pd.merge(meta_data, class_data, on = 'class', how = 'left')

In [5]:
meta_data['Body Type'].value_counts()

Sedan          2075
SUV            1558
Coupe          1540
Convertible    1036
Crew Cab        381
Hatchback       380
Cab             297
Van             291
Wagon           253
Minivan         250
Quad Cab         44
Club Cab         39
Name: Body Type, dtype: int64

In [6]:
meta_data = meta_data.loc[meta_data['Body Type'].isin(['Coupe', 'Sedan'])].copy()

In [7]:
meta_data['is_sedan_target'] = (meta_data['Body Type'] == 'Sedan').astype(int)

In [8]:
image_dict = p.load(open('/home/ec2-user/scaled_grayscale_dict.p', 'rb'))

In [24]:
training_data = []
for i in meta_data[['is_sedan_target', 'fname']].iterrows():
    row = [i[1]['is_sedan_target']]
    row.extend(image_dict[i[1]['fname']].flatten())
    training_data.append(row)
training_data = np.array(training_data).astype(float)

Fit the model

In [26]:
X_train, X_test, y_train, y_test = train_test_split(training_data[:,1:], 
                                                    training_data[:,0], 
                                                    test_size=0.33, 
                                                    random_state=42)

In [27]:
scaler = StandardScaler()

scaler.fit(X_train)

In [29]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [85]:
model = LogisticRegression(penalty = 'l1', class_weight = 'balanced', solver = 'liblinear',
                           max_iter=100000, verbose = 4, C = 0.01
                          )

model.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100000,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=4, warm_start=False)

In [86]:
model.classes_

array([0., 1.])

In [87]:
print('training auroc : %f'%(roc_auc_score(y_train, model.predict_proba(X_train)[:,1])))

training auroc : 0.577371


In [88]:
print('testing auroc : %f'%(roc_auc_score(y_test, model.predict_proba(X_test)[:,1])))


testing auroc : 0.527468
