In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns
# Apply seaborn's "ticks" style globally and remap matplotlib's shorthand
# colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(
    color_codes=True,
    style="ticks",
)

In [None]:
from collections import Counter

from sklearn.tree import DecisionTreeRegressor

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, precision_score, recall_score, f1_score


from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.base import clone

In [None]:
# Read the training CSV; its first row supplies the column names.
df = pd.read_csv(
    filepath_or_buffer='../Test_Dataset/Training_DataSet.csv',
    header=0,
)
df.head()

In [None]:
# Keep only the XT5 listings. The explicit .copy() makes df_xt5 an independent
# frame, so later in-place edits of df_xt5 neither trigger pandas'
# SettingWithCopyWarning nor depend on the raw `df`.
df_xt5 = df.loc[df['VehModel'] == 'XT5'].copy()
df_xt5.shape

In [None]:
# Print the value counts of every column that has fewer than five distinct
# non-NaN values (value_counts() skips NaN by default).
for col_name in df_xt5.columns:
    counts = df_xt5[col_name].value_counts()
    if len(counts) >= 5:
        continue
    print('\n')
    print(counts)


In [None]:
# 'ListingID', 
df_xt5.drop(columns=['SellerIsPriv', 'VehType', 'VehBodystyle', 'VehFuel', 'VehMake', 'VehModel'], inplace=True)
df_xt5.head()

In [None]:
df_xt5.drop(columns=['VehType', 'VehBodystyle', 'VehFuel', 'VehMake', 'VehModel'], inplace=True) # XT5
# 'VehTransmission', 'VehEngine' as in US there is only one option
df_xt5.head()

In [None]:
df_xt5.isna().sum(axis=0)

In [None]:
df_xt5.drop(columns=['VehColorInt'], inplace=True) # nan-cleaning
df_xt5.head()

In [None]:
nan_indices = df_xt5.isna().sum(axis=1).sort_values(ascending=False)
nan_indices[nan_indices > 0].value_counts()

In [None]:
nan_cols = df_xt5.isna().sum(axis=0).sort_values(ascending=False)
nan_cols
# nan_indices[nan_indices > 0].value_counts()

In [None]:
nan_indices[nan_indices > 0].shape

In [None]:
# Keep only the rows without any missing value, in their original order.
# dropna() selects exactly the rows whose NaN count is zero and avoids boolean
# indexing with the re-sorted nan_indices Series, which makes pandas emit a
# "Boolean Series key will be reindexed to match DataFrame index" warning.
ndf_xt5 = df_xt5.dropna(axis=0, how='any')
ndf_xt5.shape

In [None]:
ndf_xt5.describe()

In [None]:
ndf_xt5['VehYear'].value_counts()

In [None]:
ndf_xt5['SellerZip'].unique().shape

In [None]:
ndf_xt5['VehDriveTrain'].value_counts()

In [None]:
ndf_xt5['VehHistory'].value_counts()

In [None]:
# Flatten every 'VehHistory' entry (tags joined by ', ') into one list of tags.
# Non-string entries (e.g. NaN) are skipped. The per-row debug print was
# removed: it emitted one output line for every listing.
history_lst = [
    tag
    for hist in ndf_xt5['VehHistory']
    if isinstance(hist, str)
    for tag in hist.split(', ')
]

In [None]:
Counter(history_lst).most_common(12)

In [None]:
ndf_xt5['VehPriceLabel'].value_counts()

In [None]:
ndf_xt5['VehSellerNotes'][:5]

In [None]:
ndf_xt5.columns

In [None]:
ndf_xt5['VehYear'].value_counts()

In [None]:
ndf_xt5['Vehicle_Trim'].value_counts()

In [None]:
# Names of the continuous listing columns.
cont_cols = [
    "SellerRating",
    "SellerRevCnt",
    "VehListdays",
    "VehMileage",
]
cont_cols

In [None]:
ndf_xt5["VehCertified"].value_counts()

In [None]:
ndf_xt5["VehYear"].value_counts()

In [None]:
ndf_xt5["VehFuel"].value_counts()

In [None]:
# Pairwise correlation (pandas default: Pearson) between the continuous
# columns and the listing price.
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
    "Dealer_Listing_Price",
]
ndf_xt5.loc[:, cont_cols].corr()

In [None]:
# 80/20 random split of row positions. A seeded generator makes the split
# identical on every "Restart & Run All"; the unseeded global RNG did not.
_ind = np.random.default_rng(42).permutation(ndf_xt5.shape[0])
_n = int(0.8 * ndf_xt5.shape[0])
train_index, test_index = _ind[:_n], _ind[_n:]
train_index.shape, test_index.shape

In [None]:
# Distinct history tags; displayed next to the total number of tag occurrences.
history_set = {*history_lst}
(len(history_lst), len(history_set))

In [None]:
# One integer indicator column per history tag, initialised to 0 and aligned
# with the rows of ndf_xt5.
# sorted() pins the column order: iterating the set directly depends on string
# hashing, which changes between interpreter runs, so the feature order (and
# anything that samples features by position) was not reproducible.
df_hist = pd.DataFrame(
    data=np.zeros(shape=(ndf_xt5.shape[0], len(history_set)), dtype=np.int_),
    columns=[f"history_{h}" for h in sorted(history_set)],
    index=ndf_xt5.index,
)
df_hist.head()

In [None]:
# Set the indicator of every tag that occurs in a row's 'VehHistory' entry.
# The entry is split on ', ' -- the same tokenisation that produced the tags --
# so a tag only matches as a whole token; the previous substring test
# (`h in hist`) could also match a tag embedded in a longer one.
for ind, hist in ndf_xt5['VehHistory'].items():
    if not isinstance(hist, str):
        # Non-string entries (e.g. NaN) carry no tags.
        continue
    for h in hist.split(', '):
        col = f"history_{h}"
        # Only touch existing indicator columns; never create new ones here.
        if col in df_hist.columns:
            df_hist.at[ind, col] = 1

df_hist.head()

In [None]:
# One-hot encode the categorical columns. Categories unseen at fit time are
# encoded as all zeros at transform time (handle_unknown='ignore').
# fit_transform() fits and encodes in one pass; the separate enc.fit() call
# that used to precede it only fitted the encoder a second time.
# NOTE(review): the encoder is fitted on every row of ndf_xt5, i.e. before any
# train/test split.
cats = ["SellerState", "VehYear", 'VehPriceLabel', "Vehicle_Trim"]
enc = OneHotEncoder(handle_unknown='ignore')
df_ohe = pd.DataFrame(
    data=enc.fit_transform(ndf_xt5[cats]).toarray(),
    index=ndf_xt5.index,
    columns=enc.get_feature_names_out(),
)
df_ohe.head()


In [None]:
enc.get_feature_names_out()

In [None]:

df = pd.concat([ndf_xt5[['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage']], df_ohe, df_hist], axis=1)
df.head()


In [None]:
df.shape, df.isna().sum(axis=0).max(), df.isna().sum(axis=1).max()

In [None]:
# 80/20 random split of row positions of the feature frame. The seeded
# generator makes the split -- and every score derived from it -- reproducible
# across kernel restarts; the unseeded global RNG did not.
_ind = np.random.default_rng(42).permutation(df.shape[0])
_n = int(0.8 * df.shape[0])
train_index, test_index = _ind[:_n], _ind[_n:]
train_index.shape, test_index.shape, df.shape, ndf_xt5.shape

In [None]:
# Feature rows of each split, selected by position.
X_train = df.iloc[train_index]
X_test = df.iloc[test_index]
# Matching targets: the listing price at the same row positions of ndf_xt5.
y_train = ndf_xt5["Dealer_Listing_Price"].iloc[train_index]
y_test = ndf_xt5["Dealer_Listing_Price"].iloc[test_index]

In [None]:
# Exhaustive search over decision-tree hyper-parameters with 4-fold
# cross-validation; any failing fit raises immediately (error_score='raise').
params = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "splitter": ["best", "random"],
    "max_depth": [None, 5, 7, 10, 11, 12, 13, 14,  15, 17],
    "min_samples_leaf": [2, 3, 5],
    "max_features": [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8]
}
grid = GridSearchCV(
    # random_state pins the randomness used by splitter="random" and by
    # max_features sub-sampling, so the selected parameters and scores are
    # reproducible from run to run.
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=params,
    cv=4,
    # 1680 candidates x 4 folds: spread the fits over all available cores.
    n_jobs=-1,
    error_score='raise',
)

grid.fit(X=X_train, y=y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score of that combination. No `scoring` was passed to
# GridSearchCV, so the estimator's own score() is used (R^2 for scikit-learn
# regressors).
grid.best_score_

In [None]:
from sklearn.tree import plot_tree

# Draw the top of the selected tree on a large canvas with real feature names.
# The default-sized, full-depth plot is illegible for deep trees, and leaving
# plot_tree() as the last expression prints a long list of Annotation objects.
# Set max_depth=None to draw the whole tree.
fig, ax = plt.subplots(figsize=(24, 12))
plot_tree(
    grid.best_estimator_,
    max_depth=3,
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=9,
    ax=ax,
)
ax.set_title("Best decision tree (top 3 levels)");

In [None]:
# Predict prices for the held-out rows. GridSearchCV.predict delegates to the
# refitted best estimator, and X_test holds the rows df.iloc[test_index].
y_pred = grid.predict(X_test)
y_pred

In [None]:
# Largest absolute prediction error on the held-out rows. The signed maximum
# `np.max(y_pred - y_test)` only reports the worst over-prediction and hides
# under-predictions of any size.
np.max(np.abs(y_pred - y_test))