In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import re
import numbers
%matplotlib inline

import seaborn as sns
sns.set(style="ticks", color_codes=True)

In [None]:
from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    MetaEstimatorMixin,
    RegressorMixin,
    TransformerMixin,
    clone,
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score
from sklearn.base import clone

In [None]:
def get_features(st: str) -> str:
    """Flatten a stringified list of quoted items into one space-separated string.

    The first and last characters of ``st`` are discarded (``st[1:-1]``) and
    every single-quoted fragment found in the remainder is collected in order
    and joined with single spaces.

    Args:
        st: Text to parse. Any non-``str`` value (e.g. ``None`` or ``NaN``)
            is accepted and treated as "no features".

    Returns:
        The quoted fragments joined by spaces, or ``''`` when ``st`` is not a
        string or contains no single-quoted fragment.
    """
    if not isinstance(st, str):
        return ''
    inner = st[1:-1]
    quoted_items = re.findall(r'\'(.*?)\'', inner)
    # Joining an empty list already yields '', so no separate empty check is needed.
    return ' '.join(quoted_items)


class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that rewrites the ``VehFeats`` column via ``get_features``."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface.

        Returns:
            ``self``.
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``VehFeats`` mapped through ``get_features``.

        Args:
            X: DataFrame containing a ``VehFeats`` column.
            y: Ignored.

        Returns:
            A new DataFrame; the caller's frame is left untouched.

        Raises:
            KeyError: If ``X`` has no ``VehFeats`` column.
        """
        # Work on a copy: X is frequently a boolean-mask slice of a larger
        # frame, and assigning into it in place both mutates the caller's data
        # and triggers pandas' SettingWithCopyWarning.
        X = X.copy()
        X["VehFeats"] = X["VehFeats"].map(get_features)
        return X

class MainCleanerTransformer(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns, then drop every row that still has a missing value."""

    def fit(self, X, y=None):
        """Record the column lists used by this cleaner.

        Returns:
            ``self``.
        """
        # Columns removed by ``transform`` when present.
        self.col = ['SellerIsPriv', 'VehType', 'VehBodystyle', 'VehFuel', 'VehMake']
        # NOTE(review): ``nan_col`` is stored but never read by ``transform``;
        # rows are dropped when ANY column is missing, not just these. Confirm
        # whether restricting the NaN filter to these columns was intended.
        self.nan_col = ['Vehicle_Trim', 'Dealer_Listing_Price', 'VehMileage']

        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns and without incomplete rows.

        Args:
            X: Input DataFrame. It is not modified.
            y: Ignored.

        Returns:
            A DataFrame holding only the rows of ``X`` that have no missing
            value in any remaining column, in their original order.
        """
        present = [name for name in self.col if name in X.columns]
        # ``drop`` without ``inplace`` leaves the caller's frame intact, and
        # ``dropna`` selects the zero-NaN rows directly instead of indexing
        # with a re-sorted boolean Series (which forced pandas to reindex the
        # mask and emit a warning).
        return X.drop(columns=present).dropna(axis=0, how='any')

class NanCleanerTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values column by column.

    * continuous columns (``cont_cols``) -> column mean
    * categorical columns (``cat_cols``) -> empty string
    * ``VehYear``                        -> constant 2018

    Imputation statistics are learned in ``fit`` and re-applied in
    ``transform`` so that a fitted instance fills new data with the values
    learned from the data it was fitted on.
    """

    def fit(self, X, y=None):
        """Fit one ``SimpleImputer`` per known column that is present in ``X``.

        Args:
            X: DataFrame to learn the fill values from.
            y: Ignored.

        Returns:
            ``self``.
        """
        self.cont_cols = ['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage']
        self.cat_cols = ['SellerListSrc', 'SellerName','VehColorExt', 'VehDriveTrain', 'VehFeats', 'VehFuel', 'VehHistory', 'VehPriceLabel', 'VehSellerNotes', 'VehType']

        # Maps column name -> fitted imputer for that single column.
        self.imputers_ = {}
        for cont in self.cont_cols:
            if cont in X.columns:
                self.imputers_[cont] = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X[[cont]])
        for cat in self.cat_cols:
            if cat in X.columns:
                self.imputers_[cat] = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='').fit(X[[cat]])
        if "VehYear" in X.columns:
            self.imputers_["VehYear"] = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=2018).fit(X[["VehYear"]])
        # The previous implementation additionally re-imputed ``SellerRevCnt``
        # with the constant 2018 and ``VehListdays`` with the mean. Both ran
        # after the mean imputation above had already removed every NaN, so
        # they could never change a value and have been removed.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the fitted fill values applied.

        Args:
            X: DataFrame to impute. It is not modified.
            y: Ignored.

        Returns:
            A new DataFrame. Columns seen during ``fit`` but absent from ``X``
            are skipped.
        """
        X = X.copy()
        for name, imputer in self.imputers_.items():
            if name in X.columns:
                # Each imputer handles exactly one column; take that column
                # back out of the 2-D result.
                X[name] = imputer.transform(X[[name]])[:, 0]
        return X

class XT5CleanerTransformer(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from a frame."""

    def fit(self, X, y=None):
        """Record the columns to remove.

        Returns:
            ``self``.
        """
        self.col = ['SellerIsPriv', 'VehTransmission', 'VehEngine']
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns.

        Args:
            X: Input DataFrame. It is not modified.
            y: Ignored.

        Returns:
            A DataFrame without any of ``self.col`` that were present in ``X``.
        """
        present = [name for name in self.col if name in X.columns]
        # Non-in-place drop: the caller's frame (possibly a slice of a larger
        # frame) must not change as a side effect of transforming it.
        return X.drop(columns=present)

class WK2CleanerTransformer(BaseEstimator, TransformerMixin):
    """Remove a fixed list of columns from a frame."""

    def fit(self, X, y=None):
        """Record the columns to remove.

        Returns:
            ``self``.
        """
        self.col = ['SellerIsPriv', 'VehTransmission', 'VehEngine']
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns.

        Args:
            X: Input DataFrame. It is not modified.
            y: Ignored.

        Returns:
            A DataFrame without any of ``self.col`` that were present in ``X``.
        """
        present = [name for name in self.col if name in X.columns]
        # Non-in-place drop: the caller's frame (possibly a slice of a larger
        # frame) must not change as a side effect of transforming it.
        return X.drop(columns=present)

class HistoryTransformer(BaseEstimator, TransformerMixin):
    """Expand the ``', '``-separated ``VehHistory`` text into 0/1 indicator columns.

    ``fit`` collects every distinct token seen in ``VehHistory``;
    ``transform`` replaces ``VehHistory`` with one ``history_<token>`` column
    per learned token, set to 1.0 where the row's text contains that token.
    """

    def fit(self, X, y=None):
        """Collect the vocabulary of history tokens.

        Args:
            X: DataFrame with a ``VehHistory`` column. Non-string entries are
                skipped.
            y: Ignored.

        Returns:
            ``self``.
        """
        # Empty tokens (e.g. from an imputed '' history) are skipped so that
        # no nameless ``history_`` column is produced.
        self.history_lst = [
            token
            for hist in X['VehHistory'] if isinstance(hist, str)
            for token in hist.split(', ') if token
        ]
        self.history_set = set(self.history_lst)
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``VehHistory`` replaced by indicator columns.

        Args:
            X: DataFrame with a ``VehHistory`` column. It is not modified.
            y: Ignored.

        Returns:
            ``X`` without ``VehHistory``, concatenated column-wise with one
            float column per learned token. Tokens not seen during ``fit``
            are ignored.
        """
        # Sort the vocabulary: set iteration order for strings varies between
        # interpreter runs, which would shuffle the output columns.
        tokens = sorted(self.history_set)
        position = {token: j for j, token in enumerate(tokens)}

        # ``np.float64`` rather than the ``np.float_`` alias, which NumPy 2.0 removed.
        flags = np.zeros(shape=(X.shape[0], len(tokens)), dtype=np.float64)
        # Previously the matrix was created but never filled, so every
        # indicator column was constantly zero.
        for i, hist in enumerate(X['VehHistory']):
            if not isinstance(hist, str):
                continue
            for token in hist.split(', '):
                j = position.get(token)
                if j is not None:
                    flags[i, j] = 1.0

        df_hist = pd.DataFrame(
            data=flags,
            columns=[f"history_{h}" for h in tokens],
            index=X.index,
        )
        return pd.concat([X.drop(columns='VehHistory'), df_hist], axis=1)

class OHETransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a fixed set of categorical columns, keeping the others as they are."""

    def fit(self, X, y=None):
        """Fit a ``OneHotEncoder`` on the candidate columns present in ``X``.

        Args:
            X: DataFrame to learn the categories from.
            y: Ignored.

        Returns:
            ``self``.
        """
        cats = ["SellerState", "VehYear", 'VehPriceLabel', "Vehicle_Trim"]
        self.cats = [c for c in cats if c in X.columns]
        # Categories unseen at fit time encode as all-zero rows instead of raising.
        self.enc = OneHotEncoder(handle_unknown='ignore', )
        self.enc.fit(X[self.cats])
        return self

    def transform(self, X, y=None):
        """Return a new frame with the fitted columns replaced by their one-hot columns.

        Args:
            X: DataFrame containing every column in ``self.cats``. It is not
                modified.
            y: Ignored.

        Returns:
            ``X`` without ``self.cats``, concatenated column-wise with the
            dense one-hot columns named by ``get_feature_names_out()``.
        """
        encoded = self.enc.transform(X[self.cats])
        df_ohe = pd.DataFrame(
            data=encoded.toarray(),
            index=X.index,
            columns=self.enc.get_feature_names_out()
        )
        # Non-in-place drop so the caller's frame is left untouched.
        return pd.concat([X.drop(columns=self.cats), df_ohe], axis=1)


# Mixin listed before BaseEstimator, as scikit-learn's developer guide asks,
# so the classifier mixin takes precedence in the MRO.
class TrimEstimator(ClassifierMixin, BaseEstimator):
    """Predict a label from the ``VehFeats`` text column.

    Pipeline: ``CountVectorizer`` -> ``TfidfTransformer`` -> ``MultinomialNB``,
    with labels mapped to numeric codes by an ``OrdinalEncoder``.
    """

    def __init__(self):
        self.count_vector = CountVectorizer()
        self.tfidf_transformer = TfidfTransformer()
        self.clf = MultinomialNB()
        self.encoder = OrdinalEncoder()

    def fit(self, X, y=None):
        """Fit the text pipeline and the label encoder.

        Args:
            X: DataFrame with a ``VehFeats`` column of strings.
            y: Labels, either 1-D or as a single-column 2-D structure
                (e.g. ``df[['Vehicle_Trim']]``).

        Returns:
            ``self``.

        Raises:
            ValueError: If ``y`` is ``None``.
        """
        if y is None:
            raise ValueError("TrimEstimator.fit requires the target labels `y`.")
        # OrdinalEncoder needs a 2-D input; accept 1-D labels as well.
        labels = np.asarray(y).reshape(-1, 1)
        codes = self.encoder.fit_transform(labels)
        counts = self.count_vector.fit_transform(X['VehFeats'])
        tfidf = self.tfidf_transformer.fit_transform(counts)
        # The classifier expects a 1-D target; passing the (n, 1) column
        # vector made scikit-learn emit a DataConversionWarning.
        self.clf.fit(tfidf, codes.ravel())
        return self

    def predict(self, X):
        """Predict labels for the rows of ``X``.

        Args:
            X: DataFrame with a ``VehFeats`` column of strings.

        Returns:
            A 2-D array of shape ``(n_rows, 1)`` holding the decoded labels,
            i.e. the output of ``OrdinalEncoder.inverse_transform``.
        """
        counts = self.count_vector.transform(X['VehFeats'])
        tfidf = self.tfidf_transformer.transform(counts)
        codes = self.clf.predict(tfidf)
        return self.encoder.inverse_transform(codes.reshape(-1, 1))
    
# This wraps a regressor, so it must use RegressorMixin: with ClassifierMixin,
# ``score`` computed accuracy on a continuous target and model-selection
# helpers treated the estimator as a classifier. The mixin is listed before
# BaseEstimator, as scikit-learn's developer guide asks.
class WK2Estimator(RegressorMixin, BaseEstimator):
    """Decision-tree regressor fitted on a name-selected subset of columns.

    A column is used when its name contains any entry of ``cont_cols`` or
    ``cat_cols`` as a substring (which also picks up derived columns such as
    one-hot or ``history_*`` columns).
    """

    def __init__(self):
        self.cont_cols = ['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage']
        self.cat_cols = ["SellerState", "VehYear", 'VehPriceLabel', "Vehicle_Trim", 'history']
        self.params= {'criterion': 'friedman_mse',
            'max_depth': 17,
            'max_features': None,
            'min_samples_leaf': 5,
            'splitter': 'random'}
        self.clf = DecisionTreeRegressor(**self.params)

    def fit(self, X, y=None):
        """Select the feature columns by name and fit the tree on them.

        Args:
            X: DataFrame of features. The selected columns must be numeric.
            y: Regression target passed straight to the tree.

        Returns:
            ``self``.
        """
        wanted = self.cont_cols + self.cat_cols
        # Remember the selection so ``predict`` uses exactly the same columns.
        self.columns = [col for col in X.columns if any(c in col for c in wanted)]
        self.clf.fit(X[self.columns], y)
        return self

    def predict(self, X):
        """Predict with the fitted tree using the columns chosen in ``fit``.

        Args:
            X: DataFrame containing every column selected during ``fit``.

        Returns:
            The array returned by ``DecisionTreeRegressor.predict``.
        """
        return self.clf.predict(X[self.columns])
    
# This wraps a regressor, so it must use RegressorMixin: with ClassifierMixin,
# ``score`` computed accuracy on a continuous target and model-selection
# helpers treated the estimator as a classifier. The mixin is listed before
# BaseEstimator, as scikit-learn's developer guide asks.
class XT5Estimator(RegressorMixin, BaseEstimator):
    """Decision-tree regressor fitted on a name-selected subset of columns.

    A column is used when its name contains any entry of ``cont_cols`` or
    ``cat_cols`` as a substring (which also picks up derived columns such as
    one-hot or ``history_*`` columns).
    """

    def __init__(self):
        self.cont_cols = ['SellerRating', 'SellerRevCnt', 'VehListdays', 'VehMileage']
        self.cat_cols = ["SellerState", "VehYear", 'VehPriceLabel', "Vehicle_Trim", 'history']
        self.params = {'criterion': 'squared_error',
            'max_depth': 14,
            'max_features': None,
            'min_samples_leaf': 3,
            'splitter': 'random'}
        self.clf = DecisionTreeRegressor(**self.params)

    def fit(self, X, y=None):
        """Select the feature columns by name and fit the tree on them.

        Args:
            X: DataFrame of features. The selected columns must be numeric.
            y: Regression target passed straight to the tree.

        Returns:
            ``self``.
        """
        wanted = self.cont_cols + self.cat_cols
        # Remember the selection so ``predict`` uses exactly the same columns.
        self.columns = [col for col in X.columns if any(c in col for c in wanted)]
        self.clf.fit(X[self.columns], y)
        return self

    def predict(self, X):
        """Predict with the fitted tree using the columns chosen in ``fit``.

        Args:
            X: DataFrame containing every column selected during ``fit``.

        Returns:
            The array returned by ``DecisionTreeRegressor.predict``.
        """
        return self.clf.predict(X[self.columns])

    
class XT5_WK2SplitterTransformer(BaseEstimator, TransformerMixin):
    """Split a frame into its ``XT5`` rows and its ``Grand Cherokee`` rows by ``VehModel``."""

    def fit(self, X, y=None):
        """Record the row masks of the fitting frame.

        The masks are kept as attributes for inspection only; ``transform``
        recomputes them for whatever frame it receives.

        Returns:
            ``self``.
        """
        self.xt5_indices = X['VehModel'] == 'XT5'
        self.wk2_indices = X['VehModel'] == 'Grand Cherokee'
        return self

    def transform(self, X, y=None):
        """Return ``(xt5_rows, wk2_rows)`` for ``X``.

        Args:
            X: DataFrame with a ``VehModel`` column.
            y: Ignored.

        Returns:
            A 2-tuple of independent DataFrames: rows whose ``VehModel`` is
            ``'XT5'`` and rows whose ``VehModel`` is ``'Grand Cherokee'``.
        """
        # Masks are derived from the frame being transformed. Reusing the
        # masks stored by ``fit`` only worked when ``transform`` received the
        # very same frame; on any other frame the boolean index would not
        # align with the rows.
        xt5_mask = X['VehModel'] == 'XT5'
        wk2_mask = X['VehModel'] == 'Grand Cherokee'
        # Copies, so later column assignments on the parts do not write into
        # a view of ``X`` (SettingWithCopyWarning).
        return X[xt5_mask].copy(), X[wk2_mask].copy()


In [None]:
# Load the labelled training listings (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('../Test_Dataset/Training_DataSet.csv', header=0)
df.head()

In [None]:
# Load the test listings (path is relative to the notebook's working
# directory) and preview the first rows.
df_test = pd.read_csv('../Test_Dataset/Test_Dataset.csv', header=0)
df_test.head()

In [None]:
# Training frame: column/row cleaning first, then imputation, then split by model.
df = NanCleanerTransformer().fit_transform(
    MainCleanerTransformer().fit_transform(df)
)

df_xt5, df_wk2 = XT5_WK2SplitterTransformer().fit_transform(df)
df_xt5.shape, df_wk2.shape

In [None]:
# Test frame: same cleaning, imputation and split as the training frame,
# using freshly fitted transformer instances.
df_test = NanCleanerTransformer().fit_transform(
    MainCleanerTransformer().fit_transform(df_test)
)

df_test_xt5, df_test_wk2 = XT5_WK2SplitterTransformer().fit_transform(df_test)
df_test_xt5.shape, df_test_wk2.shape

In [None]:
# Flatten the stringified feature lists in ``VehFeats`` for all four frames.
# FeatureTransformer.fit learns nothing, so calling ``transform`` on the test
# frames gives the same result as ``fit_transform``.
ft_xt5 = FeatureTransformer()
ft_wt2 = FeatureTransformer()

df_xt5 = ft_xt5.fit_transform(df_xt5)
df_wk2 = ft_wt2.fit_transform(df_wk2)

df_test_xt5 = ft_xt5.transform(df_test_xt5)
df_test_wk2 = ft_wt2.transform(df_test_wk2)

In [None]:
# Learn Vehicle_Trim from VehFeats on the XT5 training rows, then write the
# predicted trim into the XT5 test rows.
xt5_trim_target = df_xt5[['Vehicle_Trim']]
te_xt5 = TrimEstimator().fit(X=df_xt5, y=xt5_trim_target)
df_test_xt5[['Vehicle_Trim']] = te_xt5.predict(df_test_xt5)

In [None]:
# Learn Vehicle_Trim from VehFeats on the Grand Cherokee training rows, then
# write the predicted trim into the Grand Cherokee test rows.
wk2_trim_target = df_wk2[['Vehicle_Trim']]
te_wk2 = TrimEstimator().fit(X=df_wk2, y=wk2_trim_target)
df_test_wk2[['Vehicle_Trim']] = te_wk2.predict(df_test_wk2)

In [None]:
# Fit the history and one-hot encoders on the XT5 training rows and apply
# them in sequence (history columns first, then one-hot columns).
ht_xt5, ohet_xt5 = HistoryTransformer(), OHETransformer()
df_xt5 = ohet_xt5.fit_transform(ht_xt5.fit_transform(df_xt5))
df_xt5.head()

In [None]:
# Apply the encoders fitted on the XT5 training rows to the XT5 test rows.
df_test_xt5 = ohet_xt5.transform(ht_xt5.transform(df_test_xt5))
df_test_xt5.head()

In [None]:
# Fit the XT5 price regressor on the encoded training rows and predict the
# listing price for the encoded test rows.
xt5_price_target = df_xt5[["Dealer_Listing_Price"]]
xt5_est = XT5Estimator().fit(df_xt5, xt5_price_target)
xt5_est.predict(df_test_xt5)

In [None]:
# The Grand Cherokee frames must be encoded like the XT5 frames before the
# tree can be fitted: WK2Estimator selects columns whose names contain
# 'SellerState', 'VehPriceLabel', 'Vehicle_Trim', ... and those are still raw
# string columns at this point, which DecisionTreeRegressor cannot consume.
ht_wk2 = HistoryTransformer()
ohet_wk2 = OHETransformer()

# Encode copies under new names so this cell can be re-run without the
# inputs having lost their ``VehHistory`` / categorical columns.
df_wk2_enc = ohet_wk2.fit_transform(ht_wk2.fit_transform(df_wk2.copy()))
df_test_wk2_enc = ohet_wk2.transform(ht_wk2.transform(df_test_wk2.copy()))

wk2_est = WK2Estimator()
wk2_est.fit(df_wk2_enc, df_wk2_enc[["Dealer_Listing_Price"]])
wk2_est.predict(df_test_wk2_enc)