In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

import seaborn as sns
# Global seaborn theme: "ticks" axes style, with matplotlib's shorthand colour
# codes ("b", "g", "r", ...) remapped to the seaborn palette.
sns.set(
    style="ticks",
    color_codes=True,
)

In [None]:
from collections import Counter

from sklearn.tree import DecisionTreeRegressor

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, precision_score, recall_score, f1_score


from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.base import clone

In [None]:
# Read the raw listings table; the first row of the CSV holds the column names.
df = pd.read_csv(
    '../Test_Dataset/Training_DataSet.csv',
    header=0,
)
# Preview the first rows.
df.head()

In [None]:
# Keep only the rows whose VehModel is 'Grand Cherokee'.
# An explicit copy is taken because later cells drop columns from df_wk2 in
# place; doing that on a boolean-indexed slice of `df` triggers pandas'
# SettingWithCopyWarning.
df_wk2 = df.loc[df['VehModel'] == 'Grand Cherokee'].copy()
df_wk2.shape

In [None]:
# Print the value distribution of every column that has fewer than five
# distinct values (value_counts ignores NaN by default), to spot columns that
# are (near-)constant within this subset.
for column in df_wk2.columns:
    counts = df_wk2[column].value_counts()
    if len(counts) >= 5:
        continue
    print('\n', counts, sep='\n')


In [None]:
# Drop columns judged uninformative for this subset (presumably the
# low-cardinality ones listed by the previous cell - confirm against its output).
# 'ListingID' was commented out of the original drop list and is therefore kept.
# The result is re-assigned instead of using inplace=True, and missing columns
# are ignored, so the cell can be re-run without raising KeyError and never
# mutates a slice of `df`.
df_wk2 = df_wk2.drop(
    columns=['SellerIsPriv', 'VehType', 'VehBodystyle', 'VehFuel', 'VehMake', 'VehModel'],
    errors='ignore',
)
df_wk2.head()

In [None]:
# Drop 'VehTransmission' and 'VehEngine': per the original note, in the US there
# is only one option for each. (The original comment also carried an "XT5" tag
# whose meaning is not evident from this notebook.)
# 'SellerIsPriv' is no longer listed here: it was already removed by the
# preceding drop, so naming it again raised KeyError. errors='ignore' plus
# re-assignment keeps the cell safe to re-run.
df_wk2 = df_wk2.drop(columns=['VehTransmission', 'VehEngine'], errors='ignore')
df_wk2.head()

In [None]:
# Number of missing values in each remaining column.
df_wk2.isnull().sum()

In [None]:
# NaN cleaning: remove the 'VehColorInt' column altogether.
# Re-assigning (rather than inplace=True) and ignoring an already-missing column
# makes the cell idempotent on re-run.
df_wk2 = df_wk2.drop(columns=['VehColorInt'], errors='ignore')
df_wk2.head()

In [None]:
# Per-row count of missing cells, most incomplete rows first.
# Note: despite the name, `nan_indices` holds NaN *counts* keyed by row label.
nan_indices = df_wk2.isnull().sum(axis=1).sort_values(ascending=False)
# How many rows have 1, 2, ... missing cells (rows without NaN are excluded).
has_nan = nan_indices > 0
nan_indices.loc[has_nan].value_counts()

In [None]:
# Shape of the selection of rows containing at least one missing value.
nan_indices.loc[nan_indices.gt(0)].shape

In [None]:
# Keep only the rows with no missing value in any column.
# The original indexed df_wk2 with `nan_indices == 0`, but nan_indices is sorted
# by NaN count, so its index order differs from df_wk2's and pandas had to
# re-align the mask (emitting a "Boolean Series key will be reindexed" warning).
# dropna selects the same rows directly and returns an independent frame.
ndf_wk2 = df_wk2.dropna(axis=0, how='any')
ndf_wk2.shape

In [None]:
# Summary statistics of the NaN-free subset (pandas describes the numeric
# columns by default when the frame has mixed dtypes).
ndf_wk2.describe()

In [None]:
# Number of listings per model year.
ndf_wk2.loc[:, 'VehYear'].value_counts()

In [None]:
# Shape of the array of distinct seller ZIP codes (i.e. how many there are).
pd.unique(ndf_wk2['SellerZip']).shape

In [None]:
# Frequency of each drivetrain label.
ndf_wk2.loc[:, 'VehDriveTrain'].value_counts()

In [None]:
# Frequency of each complete VehHistory string (before splitting into items).
ndf_wk2.loc[:, 'VehHistory'].value_counts()

In [None]:
# Flatten every VehHistory entry into its individual ', '-separated items.
# Non-string entries are skipped, as in the original isinstance guard.
# The original also printed every value and its type, a debugging leftover that
# emitted one line per row; it has been removed.
history_lst = [
    item
    for hist in ndf_wk2['VehHistory']
    if isinstance(hist, str)
    for item in hist.split(', ')
]

In [None]:
# The twelve most frequent individual history items and their counts.
history_counts = Counter(history_lst)
history_counts.most_common(12)

In [None]:
# Frequency of each price label.
ndf_wk2.loc[:, 'VehPriceLabel'].value_counts()

In [None]:
# Peek at the first five seller-note texts.
ndf_wk2['VehSellerNotes'].iloc[:5]

In [None]:
# Columns that remain in the cleaned subset.
ndf_wk2.columns

In [None]:
# Listings per model year (same summary as shown earlier in the notebook).
ndf_wk2.loc[:, 'VehYear'].value_counts()

In [None]:
# Frequency of each trim level.
ndf_wk2.loc[:, 'Vehicle_Trim'].value_counts()

In [None]:
# Columns treated as continuous variables in the pair plots below.
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
]
cont_cols

In [None]:
# Lower-triangular pair grid of the continuous columns:
#   * diagonal panels: 50-bin histogram of one variable;
#   * panels below the diagonal: scatter of the row variable (x) against the
#     column variable (y), coloured by Dealer_Listing_Price.
# Panels above the diagonal are left empty.
n_vars = len(cont_cols)
fig, axs = plt.subplots(nrows=n_vars, ncols=n_vars, figsize=(25, 25))
for row, row_var in enumerate(cont_cols):
    for col, col_var in enumerate(cont_cols[:row]):
        sns.scatterplot(
            data=ndf_wk2,
            x=row_var,
            y=col_var,
            hue="Dealer_Listing_Price",
            legend=True,
            ax=axs[row, col],
        )
    sns.histplot(data=ndf_wk2, x=row_var, bins=50, ax=axs[row, row])

In [None]:
# Same lower-triangular pair grid, now including the listing price as a
# variable and colouring every panel by VehPriceLabel (three tab10 colours).
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
    "Dealer_Listing_Price",
]
n_vars = len(cont_cols)
fig, axs = plt.subplots(nrows=n_vars, ncols=n_vars, figsize=(25, 25))
for row, row_var in enumerate(cont_cols):
    # Scatter panels left of the diagonal: row variable on x, column variable on y.
    for col, col_var in enumerate(cont_cols[:row]):
        sns.scatterplot(
            data=ndf_wk2,
            x=row_var,
            y=col_var,
            hue="VehPriceLabel",
            palette=sns.color_palette("tab10", 3),
            legend=True,
            ax=axs[row, col],
        )
    # Diagonal panel: per-label histogram of the row variable.
    sns.histplot(
        data=ndf_wk2,
        x=row_var,
        hue="VehPriceLabel",
        palette=sns.color_palette("tab10", 3),
        bins=50,
        ax=axs[row, row],
    )

In [None]:
# Frequency of the VehCertified flag.
# Counted on ndf_wk2 (the NaN-free rows) rather than df_wk2 so that the numbers
# describe the same rows that the VehCertified-coloured plots below use.
ndf_wk2["VehCertified"].value_counts()

In [None]:
# Lower-triangular pair grid coloured by VehCertified (two tab10 colours);
# the diagonal histograms use 20 bins here.
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
    "Dealer_Listing_Price",
]
n_vars = len(cont_cols)
fig, axs = plt.subplots(nrows=n_vars, ncols=n_vars, figsize=(25, 25))
for row, row_var in enumerate(cont_cols):
    # Scatter panels left of the diagonal: row variable on x, column variable on y.
    for col, col_var in enumerate(cont_cols[:row]):
        sns.scatterplot(
            data=ndf_wk2,
            x=row_var,
            y=col_var,
            hue="VehCertified",
            palette=sns.color_palette("tab10", 2),
            legend=True,
            ax=axs[row, col],
        )
    # Diagonal panel: histogram of the row variable split by certification flag.
    sns.histplot(
        data=ndf_wk2,
        x=row_var,
        hue="VehCertified",
        palette=sns.color_palette("tab10", 2),
        bins=20,
        ax=axs[row, row],
    )

In [None]:
# Listings per model year; the next plot allots five palette colours to VehYear.
ndf_wk2.loc[:, "VehYear"].value_counts()

In [None]:
# Lower-triangular pair grid coloured by VehYear (five tab10 colours).
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
    "Dealer_Listing_Price",
]
n_vars = len(cont_cols)
fig, axs = plt.subplots(nrows=n_vars, ncols=n_vars, figsize=(25, 25))
for row, row_var in enumerate(cont_cols):
    # Scatter panels left of the diagonal: row variable on x, column variable on y.
    for col, col_var in enumerate(cont_cols[:row]):
        sns.scatterplot(
            data=ndf_wk2,
            x=row_var,
            y=col_var,
            hue="VehYear",
            palette=sns.color_palette("tab10", 5),
            legend=True,
            ax=axs[row, col],
        )
    # Diagonal panel: per-year histogram of the row variable.
    sns.histplot(
        data=ndf_wk2,
        x=row_var,
        hue="VehYear",
        palette=sns.color_palette("tab10", 5),
        bins=50,
        ax=axs[row, row],
    )

In [None]:
# Frequency of each fuel type among the NaN-free rows.
# 'VehFuel' was dropped from df_wk2 (and therefore from ndf_wk2) earlier in the
# notebook, so ndf_wk2["VehFuel"] raised KeyError. Read the column from the raw
# frame `df` instead, restricted to the surviving row labels. This relies on
# `df` still being the raw CSV frame at this point (true when running the
# notebook top to bottom).
df.loc[ndf_wk2.index, "VehFuel"].value_counts()

In [None]:
# Lower-triangular pair grid coloured by VehFuel.
# Two defects of the original are fixed here:
#   * 'VehFuel' had been dropped from ndf_wk2, so hue="VehFuel" failed; the
#     column is re-attached from the raw frame `df` by row label (this assumes
#     `df` is still the raw CSV frame, as in a top-to-bottom run);
#   * the scatter panels were coloured by "VehYear" (copy-paste from the
#     previous cell) while the histograms used "VehFuel"; both now use VehFuel.
# The palette is sized to the number of fuel types actually present instead of
# a hard-coded 5.
cont_cols = ['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage', "Dealer_Listing_Price"]
fuel_df = ndf_wk2 if "VehFuel" in ndf_wk2.columns else ndf_wk2.join(df["VehFuel"])
fuel_palette = sns.color_palette("tab10", fuel_df["VehFuel"].nunique())

fig, axs = plt.subplots(ncols=len(cont_cols), nrows=len(cont_cols), figsize=(25, 25))
for i in range(len(cont_cols)):
    for j in range(i + 1):
        if i == j:
            # Diagonal: histogram of one variable, split by fuel type.
            sns.histplot(
                data=fuel_df,
                x=cont_cols[i],
                hue="VehFuel",
                palette=fuel_palette,
                bins=50,
                ax=axs[i][i]
            )
        else:
            # Below the diagonal: row variable on x, column variable on y.
            sns.scatterplot(
                data=fuel_df,
                x=cont_cols[i],
                y=cont_cols[j],
                hue="VehFuel",
                palette=fuel_palette,
                legend=True,
                ax=axs[i][j]
            )

In [None]:
# Pairwise correlation matrix of the continuous columns, including the price.
cont_cols = [
    'SellerRating',
    'SellerRevCnt',
    'VehListdays',
    'VehMileage',
    "Dealer_Listing_Price",
]
ndf_wk2.loc[:, cont_cols].corr()

In [None]:
# Random 90/10 split of row *positions* (for use with .iloc) of ndf_wk2.
# A fixed seed makes the split, and every metric derived from it, reproducible
# across kernel restarts; the original used the unseeded global RNG.
_rng = np.random.default_rng(42)
_ind = _rng.permutation(ndf_wk2.shape[0])
_n = int(0.9 * ndf_wk2.shape[0])
train_index, test_index = _ind[:_n], _ind[_n:]
train_index.shape, test_index.shape

In [None]:
def _feats_to_text(raw):
    """Turn a VehFeats string into space-separated feature text.

    The outer first/last characters are stripped, the remainder is split on
    ', ', and the first/last character of every piece is stripped as well
    (presumably list brackets and quotes of a stringified list - confirm
    against the data).
    """
    pieces = raw[1:-1].split(', ')
    return ' '.join(piece[1:-1] for piece in pieces)


# Text/categorical inputs for the trim classifier, indexed like ndf_wk2.
trim_df = pd.DataFrame({
    'VehFeats': ndf_wk2['VehFeats'].map(_feats_to_text),
    'VehSellerNotes': ndf_wk2['VehSellerNotes'],
    'VehColorExt': ndf_wk2['VehColorExt'],
    'VehDriveTrain': ndf_wk2['VehDriveTrain'],
})

# Target: Vehicle_Trim encoded as ordinal codes (floats, one code per category).
trim_codes = OrdinalEncoder().fit_transform(ndf_wk2[['Vehicle_Trim']])
trim_df['y'] = trim_codes
trim_df['y'].value_counts()


In [None]:
from itertools import chain, combinations
col_names = ['VehFeats', 'VehSellerNotes', 'VehColorExt', 'VehDriveTrain']

# Try every non-empty subset of the text columns: concatenate them into one
# document per row, fit a bag-of-words -> TF-IDF -> multinomial naive Bayes
# pipeline on the training rows, and print the confusion matrix on the held-out
# rows.
for cols in chain.from_iterable(combinations(col_names, r) for r in range(1, len(col_names)+1)):
    print(cols)
    trim_df['data'] = ''
    for col in cols:
        trim_df['data'] += trim_df[col] + ' '

    train_df = trim_df.iloc[train_index]
    test_df = trim_df.iloc[test_index]

    # Vocabulary and IDF weights are learned from the training rows only.
    count_vector = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    train_counts = count_vector.fit_transform(train_df['data'])
    train_tfidf = tfidf_transformer.fit_transform(train_counts)

    clf = MultinomialNB().fit(train_tfidf, train_df['y'])

    test_counts = count_vector.transform(test_df['data'])
    test_tfidf = tfidf_transformer.transform(test_counts)
    y_pred = clf.predict(test_tfidf)

    # scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
    # true classes, columns the predicted ones. The original passed the
    # arguments in the opposite order, which transposed the matrix.
    print(confusion_matrix(test_df['y'], y_pred))


In [None]:
# Distinct history items, and how many items there are in total vs. distinct.
history_set = set()
history_set.update(history_lst)
(len(history_lst), len(history_set))

In [None]:
# Zero-initialised indicator matrix: one row per listing in ndf_wk2 and one
# "history_<item>" column per distinct history item.
# Columns are created in sorted order: iterating a set of strings directly
# yields an order that can change between kernel sessions (string hashing is
# randomised), which would shuffle the feature columns from run to run.
df_hist = pd.DataFrame(
    data=np.zeros(
        shape=(ndf_wk2.shape[0], len(history_set)),
        dtype=np.int_
        ),
    columns=[f"history_{h}" for h in sorted(history_set)],
    index=ndf_wk2.index
    )
df_hist.head()

In [None]:
# Set history_<item> = 1 for every item listed in a row's VehHistory.
# Items are matched as whole ', '-separated tokens. The original used a
# substring test (`h in hist`), which also fires when one item is merely
# contained in another (e.g. "1 Owner" inside "11 Owners").
# Non-string entries are skipped, mirroring the guard used when history_lst
# was built.
for ind in ndf_wk2.index:
    hist = ndf_wk2.at[ind, 'VehHistory']
    if not isinstance(hist, str):
        continue
    for h in history_set.intersection(hist.split(', ')):
        df_hist.at[ind, f"history_{h}"] = 1

df_hist.head()

In [None]:
# One-hot encode the categorical columns into a dense frame aligned with
# ndf_wk2's index; categories unseen at fit time are encoded as all zeros
# (handle_unknown='ignore').
# The encoder is fitted once via fit_transform; the original called fit() and
# then fit_transform() on the same data, fitting twice for no benefit.
cats = ["SellerState", "VehYear", 'VehPriceLabel', "Vehicle_Trim"]
enc = OneHotEncoder(handle_unknown='ignore')
df_ohe = pd.DataFrame(
    # .toarray() assumes the encoder's default sparse output.
    data=enc.fit_transform(ndf_wk2[cats]).toarray(),
    index=ndf_wk2.index,
    columns=enc.get_feature_names_out()
)
df_ohe.head()


In [None]:
# Names of the one-hot columns generated by the fitted encoder.
enc.get_feature_names_out()

In [None]:

# Assemble the regression design matrix: continuous columns + one-hot
# categoricals + history indicators, joined column-wise on the row index.
# NOTE(review): this rebinds `df`, which until now held the raw CSV frame, so
# earlier cells that read `df` must not be re-run after this one without
# reloading the data.
feature_parts = [
    ndf_wk2[['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage']],
    df_ohe,
    df_hist,
]
df = pd.concat(feature_parts, axis=1)
df.head()


In [None]:
# Sanity check: matrix shape, worst per-column NaN count, worst per-row NaN count.
_na_mask = df.isna()
(df.shape, _na_mask.sum(axis=0).max(), _na_mask.sum(axis=1).max())

In [None]:
# Random 80/20 split of row *positions* (for use with .iloc) of the design
# matrix. A fixed seed keeps the split, and hence the grid-search result and
# test error, reproducible; the original used the unseeded global RNG.
_rng = np.random.default_rng(42)
_ind = _rng.permutation(df.shape[0])
_n = int(0.8 * df.shape[0])
train_index, test_index = _ind[:_n], _ind[_n:]
train_index.shape, test_index.shape

In [None]:
# Exhaustive 3-fold grid search over decision-tree hyper-parameters for
# predicting Dealer_Listing_Price from the design matrix.
params = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "splitter": ["best", "random"],
    "max_depth": [5, 7, 10, 11, 12, 13, 14,  15, 17],
    "min_samples_leaf": [2, 3, 5],
    "max_features": [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8]
}
grid = GridSearchCV(
    # random_state pins the randomness used by splitter="random" and by feature
    # sub-sampling (max_features), so that repeated runs select the same model;
    # the original left the estimator unseeded.
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=params,
    cv=3,
    # Fail loudly if any parameter combination cannot be fitted.
    error_score='raise'
    )

# Rows of df and ndf_wk2 correspond positionally (df was built on ndf_wk2's index).
grid.fit(X=df.iloc[train_index], y=ndf_wk2.iloc[train_index]["Dealer_Listing_Price"])

In [None]:
# Hyper-parameter combination with the best mean cross-validation score.
grid.best_params_

In [None]:
# Mean cross-validated score of the best combination (no `scoring` was given,
# so this is the estimator's default score).
grid.best_score_

In [None]:
from sklearn.tree import plot_tree

# Draw the top of the selected tree. The original call used the default figure
# size and no feature names, which for trees up to 17 levels deep produces an
# illegible plot with nodes labelled X[i]. Here the figure is enlarged, only
# the first levels are drawn, and the design-matrix column names are shown.
fig, ax = plt.subplots(figsize=(25, 12))
plot_tree(
    grid.best_estimator_,
    max_depth=3,
    feature_names=list(df.columns),
    filled=True,
    ax=ax,
);

In [None]:
# List the attributes exposed by the fitted estimator's low-level tree_ object.
dir(grid.best_estimator_.tree_)

In [None]:
# Print every node of the fitted tree.
# The sklearn Tree object is not iterable (the original `for n in tree_` raised
# TypeError); nodes are addressed by integer id through its parallel arrays.
_tree = grid.best_estimator_.tree_
for node_id in range(_tree.node_count):
    n_samples = _tree.n_node_samples[node_id]
    # A node whose left and right child ids coincide is a leaf.
    if _tree.children_left[node_id] == _tree.children_right[node_id]:
        print(f"node {node_id}: leaf, value={_tree.value[node_id].ravel()}, samples={n_samples}")
    else:
        # feature[] holds column positions of the matrix the tree was fitted on (df).
        split_col = df.columns[_tree.feature[node_id]]
        print(
            f"node {node_id}: {split_col} <= {_tree.threshold[node_id]:.4f} "
            f"-> left {_tree.children_left[node_id]}, right {_tree.children_right[node_id]}, "
            f"samples={n_samples}"
        )

In [None]:
# Predict prices for the held-out rows with the best tree found by the search.
X_test = df.iloc[test_index]
best_tree = grid.best_estimator_
y_pred = best_tree.predict(X_test)
y_pred

In [None]:
# Worst-case prediction error on the held-out rows.
# The absolute value is taken so that large under-predictions count too; the
# original signed maximum only reported the largest over-prediction.
np.max(np.abs(y_pred - ndf_wk2.iloc[test_index]['Dealer_Listing_Price']))