In [11]:
import pandas as pd
import numpy as np

#For Warnings 
import warnings 
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score
from sklearn.metrics import mean_absolute_error

df = pd.read_csv('vehicles_us_5000.csv')
del df['is_4wd']
df = df.dropna()
df

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,date_posted,days_listed
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,2019-02-07,79
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,2019-01-07,68
...,...,...,...,...,...,...,...,...,...,...,...,...
5091,7300,2013.0,nissan altima,excellent,6.0,gas,7100.0,automatic,sedan,black,2018-06-17,63
5092,17999,2014.0,ram 1500,like new,8.0,gas,154000.0,automatic,pickup,white,2018-06-21,11
5093,4800,2012.0,volkswagen jetta,good,4.0,gas,138000.0,automatic,sedan,silver,2018-07-19,39
5094,12000,2005.0,chevrolet silverado 2500hd,good,8.0,diesel,228000.0,automatic,pickup,silver,2018-08-18,52


In [12]:
df = df.rename(columns={'price': 'target'})

df['target'].value_counts

<bound method IndexOpsMixin.value_counts of 2        5500
4       14900
5       14990
6       12990
7       15990
        ...  
5091     7300
5092    17999
5093     4800
5094    12000
5096    11000
Name: target, Length: 3031, dtype: int64>

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3031 entries, 2 to 5096
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   target        3031 non-null   int64  
 1   model_year    3031 non-null   float64
 2   model         3031 non-null   object 
 3   condition     3031 non-null   object 
 4   cylinders     3031 non-null   float64
 5   fuel          3031 non-null   object 
 6   odometer      3031 non-null   float64
 7   transmission  3031 non-null   object 
 8   type          3031 non-null   object 
 9   paint_color   3031 non-null   object 
 10  date_posted   3031 non-null   object 
 11  days_listed   3031 non-null   int64  
dtypes: float64(3), int64(2), object(7)
memory usage: 307.8+ KB


In [14]:
df['model'].value_counts(dropna=False)
df['condition'].value_counts(dropna=False)
df['fuel'].value_counts(dropna=False)
df['transmission'].value_counts(dropna=False)
df['type'].value_counts(dropna=False)
df['paint_color'].value_counts(dropna=False)
df['date_posted'].value_counts(dropna=False)

2019-03-11    19
2019-03-01    16
2018-06-30    16
2018-08-06    16
2018-10-22    16
              ..
2019-02-11     2
2018-12-11     2
2018-08-10     2
2018-05-05     1
2018-08-08     1
Name: date_posted, Length: 354, dtype: int64

In [15]:
df = pd.get_dummies(df, columns=['model', 'condition', 'fuel', 'transmission', 'type', 'paint_color', 'date_posted'], drop_first=True)

df

Unnamed: 0,target,model_year,cylinders,odometer,days_listed,model_bmw x5,model_buick enclave,model_cadillac escalade,model_chevrolet camaro,model_chevrolet camaro lt coupe 2d,...,date_posted_2019-04-10,date_posted_2019-04-11,date_posted_2019-04-12,date_posted_2019-04-13,date_posted_2019-04-14,date_posted_2019-04-15,date_posted_2019-04-16,date_posted_2019-04-17,date_posted_2019-04-18,date_posted_2019-04-19
2,5500,2013.0,4.0,110000.0,79,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14900,2017.0,4.0,80903.0,28,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,14990,2014.0,6.0,57954.0,15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,12990,2015.0,4.0,79212.0,73,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,15990,2013.0,6.0,109473.0,68,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5091,7300,2013.0,6.0,7100.0,63,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5092,17999,2014.0,8.0,154000.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5093,4800,2012.0,4.0,138000.0,39,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5094,12000,2005.0,8.0,228000.0,52,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
numeric_cols = ['model_year', 'cylinders', 'odometer', 'days_listed']
cat_cols = list(set(df.columns) - set(numeric_cols) - {'target'})
cat_cols.sort()

print(numeric_cols)
print(cat_cols)

['model_year', 'cylinders', 'odometer', 'days_listed']
['condition_fair', 'condition_good', 'condition_like new', 'condition_new', 'condition_salvage', 'date_posted_2018-05-02', 'date_posted_2018-05-03', 'date_posted_2018-05-04', 'date_posted_2018-05-05', 'date_posted_2018-05-06', 'date_posted_2018-05-07', 'date_posted_2018-05-08', 'date_posted_2018-05-09', 'date_posted_2018-05-10', 'date_posted_2018-05-11', 'date_posted_2018-05-12', 'date_posted_2018-05-13', 'date_posted_2018-05-14', 'date_posted_2018-05-15', 'date_posted_2018-05-16', 'date_posted_2018-05-17', 'date_posted_2018-05-18', 'date_posted_2018-05-19', 'date_posted_2018-05-20', 'date_posted_2018-05-21', 'date_posted_2018-05-22', 'date_posted_2018-05-23', 'date_posted_2018-05-24', 'date_posted_2018-05-25', 'date_posted_2018-05-26', 'date_posted_2018-05-27', 'date_posted_2018-05-28', 'date_posted_2018-05-29', 'date_posted_2018-05-30', 'date_posted_2018-05-31', 'date_posted_2018-06-01', 'date_posted_2018-06-02', 'date_posted_201

In [17]:
random_seed = 888
df_train, df_test = train_test_split(df, test_size=0.2, random_state=random_seed)


print(df_train.shape)
print(df_test.shape)
print()
print(df_train['target'].value_counts(normalize=True))
print()
print(df_test['target'].value_counts(normalize=True))

(2424, 489)
(607, 489)

6995     0.020215
4500     0.014439
3500     0.014439
8995     0.013614
6500     0.013201
           ...   
3890     0.000413
26600    0.000413
16990    0.000413
16700    0.000413
7250     0.000413
Name: target, Length: 756, dtype: float64

3500     0.016474
6995     0.016474
5500     0.016474
4995     0.016474
7995     0.014827
           ...   
15750    0.001647
10990    0.001647
14088    0.001647
6600     0.001647
35798    0.001647
Name: target, Length: 345, dtype: float64


In [18]:
scaler = StandardScaler()
scaler.fit(df_train[numeric_cols])

def get_features_and_target_arrays(df, numeric_cols, cat_cols, scaler):
    X_numeric_scaled = scaler.transform(df[numeric_cols])
    X_categorical = df[cat_cols].to_numpy()
    X = np.hstack((X_categorical, X_numeric_scaled))
    y = df['target']
    return X, y

X, y = get_features_and_target_arrays(df_train, numeric_cols, cat_cols, scaler)

In [19]:
clf = LogisticRegression(penalty='none') # logistic regression with no penalty term in the cost function.

clf.fit(X, y)

In [20]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

In [21]:
test_prob = clf.predict_proba(X_test)[:, 1]
test_pred = clf.predict(X_test)

In [22]:
print('Accuracy = ', accuracy_score(y_test, test_pred))
print('Precision = ', precision_score(y_test, test_pred, average='micro'))
print('Recall = ', recall_score(y_test, test_pred, average='micro'))
print('F1 score = ', f1_score(y_test, test_pred, average='micro'))

print('\nClassification Report')
print(classification_report(y_test, test_pred))

Accuracy =  0.05930807248764415
Precision =  0.05930807248764415
Recall =  0.05930807248764415
F1 score =  0.05930807248764415

Classification Report
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
         111       0.00      0.00      0.00         1
         176       0.00      0.00      0.00         1
         196       0.00      0.00      0.00         1
         289       0.00      0.00      0.00         0
         295       1.00      1.00      1.00         1
         600       0.00      0.00      0.00         1
         850       0.00      0.00      0.00         1
         900       0.00      0.00      0.00         1
        1000       0.00      0.00      0.00         2
        1200       0.00      0.00      0.00         2
        1300       0.00      0.00      0.00         1
        1400       0.00      0.00      0.00         1
        1500       0.00      0.00      0.00         2
        1599       0.00      0.00      