In [None]:
# Import libraries and download the NLTK resources used for tokenizing and lemmatizing
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('omw-1.4')
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector

In [None]:
# Load the vehicles CSV into a DataFrame.
# NOTE(review): this is an absolute path inside one user's OneDrive folder, so the
# cell only runs on that machine — consider a configurable data directory.
vehicles_clean = pd.read_csv(r"C:\Users\91886\OneDrive\QMUL Masterclass\vehicles_initialdatacleaning.csv")

In [None]:
# Re-wrap the loaded data in a DataFrame.
# NOTE(review): presumably a no-op, since pd.read_csv already returns a DataFrame — confirm and remove.
vehicles_clean = pd.DataFrame(vehicles_clean)

In [None]:
# Keep only the first 5000 rows for the rest of the notebook.
# NOTE(review): presumably a shortcut to keep the row-wise text processing fast — confirm before a full run.
vehicles_clean = vehicles_clean.head(5000)

In [None]:
# Work on an explicit copy so the column assignments below never write into a
# slice of a previously created frame.
vehicles_clean = vehicles_clean.copy()
# Use the nullable string dtype for the free-text description.
vehicles_clean['description'] = vehicles_clean['description'].astype('string')
# Map 'other' to '1', then keep only the numeric part of each value
# (e.g. '6 cylinders' -> '6'). str.extract replaces the former
# str.rstrip('cylinders'), which treats its argument as a set of characters to
# strip rather than as a suffix and therefore only worked by coincidence.
cylinder_digits = (
    vehicles_clean['cylinders']
    .str.replace('other', '1', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
)
# Convert to float; values without a number (including missing ones) become NaN.
vehicles_clean['cylinders'] = pd.to_numeric(cylinder_digits, errors='coerce').astype(float)
# Drop the condition, id, posting_date and model columns before encoding.
vehicles_clean = vehicles_clean.drop(columns=['condition', 'id', 'posting_date', 'model'])

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# The target is the listing price; every other column is a feature.
y = vehicles_clean['price']
X = vehicles_clean.drop(columns='price')

# Hold out 20% of the rows for testing, then take 25% of what remains
# (0.25 * 0.8 = 20% of all rows) for validation. Both splits use seed 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Report the size of each partition.
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
from sklearn.base import TransformerMixin

class TokenizerTransformer(TransformerMixin):
    """Turn description strings into cleaned token lists.

    For each entry the text is lower-cased, tokenized with NLTK's
    ``word_tokenize``, truncated to ``max_tokens`` tokens, stripped of English
    stop words, lemmatized with ``WordNetLemmatizer`` and finally stripped of
    tokens that consist only of punctuation characters.

    Parameters
    ----------
    max_tokens : int, default 3500
        Maximum number of raw tokens kept per description (the cap is applied
        right after tokenization, before any filtering).
    """

    def __init__(self, max_tokens=3500):
        self.max_tokens = max_tokens

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a Series holding one token list per description.

        Parameters
        ----------
        X : pandas.Series
            Description texts. Entries that are not ``str`` (for example
            missing values) yield an empty token list instead of raising.

        Returns
        -------
        pandas.Series
            Same index as ``X``; each value is a list of tokens.
        """
        # Sets give constant-time membership tests; the stop-word list was
        # previously scanned linearly for every token.
        stop_words = frozenset(stopwords.words('english'))
        punctuation_chars = frozenset(string.punctuation)
        lemmatizer = WordNetLemmatizer()
        max_tokens = self.max_tokens

        def clean(text):
            if not isinstance(text, str):
                return []
            raw_tokens = word_tokenize(text.lower())[:max_tokens]
            lemmas = (
                lemmatizer.lemmatize(token)
                for token in raw_tokens
                if token not in stop_words
            )
            # Drop tokens made up entirely of punctuation. The old
            # `word not in string.punctuation` was a substring test, so
            # multi-character tokens such as '...' slipped through.
            return [
                token for token in lemmas
                if not all(char in punctuation_chars for char in token)
            ]

        return X.apply(clean)

In [None]:
# Define a pipeline whose only step tokenizes and cleans the description text
token_pipeline = Pipeline([
    ('tokenizer', TokenizerTransformer())
])

In [None]:
# Replace the description text of each split with its token list.
# fit_transform is called on the training split only; validation and test reuse transform.
X_train['description'] = token_pipeline.fit_transform(X_train['description'])
X_val['description'] = token_pipeline.transform(X_val['description'])
X_test['description'] = token_pipeline.transform(X_test['description'])

In [None]:
# vehicles_clean['description'] = token_pipeline.fit_transform(vehicles_clean['description'])

# Cylinder

In [None]:
# Tokens that identify a cylinder count inside a description:
# 'i<N>' and 'v<N>' engine codes first, then '<N>cylinder' and '<N>cylinders',
# each for the counts 2, 3, 4, 5, 6, 8, 10 and 12.
cylinder_list = [
    f'{prefix}{count}'
    for prefix in ('i', 'v')
    for count in (2, 3, 4, 5, 6, 8, 10, 12)
] + [
    f'{count}{suffix}'
    for suffix in ('cylinder', 'cylinders')
    for count in (2, 3, 4, 5, 6, 8, 10, 12)
]

In [None]:
class CylindersCleaning(BaseEstimator, TransformerMixin):
    """Fill missing ``cylinders`` values from tokens found in ``description``.

    Parameters
    ----------
    cylinder_list : list of str
        Tokens that encode a cylinder count (e.g. ``'v6'``, ``'4cylinders'``).
        The number is obtained by stripping the characters in
        ``'ivcylinders'`` from both ends of the token; tokens that do not
        reduce to a number are ignored.
    """

    def __init__(self, cylinder_list):
        self.cylinder_list = cylinder_list

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing ``cylinders`` filled where possible.

        Only rows whose ``cylinders`` is null are touched. For such a row every
        token of ``cylinder_list`` is tested with ``token in description``;
        when several match, the one latest in ``cylinder_list`` wins. Rows with
        no match, or whose description is missing, stay NaN.
        """
        # Parse each token once instead of once per row.
        token_counts = []
        for token in self.cylinder_list:
            try:
                token_counts.append((token, float(token.strip('ivcylinders'))))
            except ValueError:
                # Token carries no usable number; it can never fill a value.
                continue

        def infer_cylinders(description):
            # A missing description cannot be searched; previously this raised.
            if not isinstance(description, (str, list, tuple, set, frozenset)):
                return np.nan
            inferred = np.nan
            for token, count in token_counts:
                if token in description:
                    inferred = count
            return inferred

        # Copy so the caller's frame is untouched, and update only the missing
        # rows so the remaining columns keep their dtypes.
        X = X.copy()
        missing = X['cylinders'].isna()
        if missing.any():
            X.loc[missing, 'cylinders'] = X.loc[missing, 'description'].apply(infer_cylinders)
        return X

In [None]:
# Pipeline with a single step: fill missing cylinder counts from the description.
cyl_pipeline = Pipeline([
    ('cylinders_cleaning', CylindersCleaning(cylinder_list)),
])

In [None]:
# Apply the cylinder back-fill to the training split; the result is kept in X_train_t.
X_train_t = cyl_pipeline.fit_transform(X_train)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen subset of columns.

    Parameters
    ----------
    subset : label or list of labels
        Column selection passed to ``X.loc[:, subset]``.
    """

    def __init__(self, subset):
        self.subset = subset

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        # Label-based selection of the configured columns, all rows.
        wanted = self.subset
        return X.loc[:, wanted]

In [None]:
# Two-step cylinder cleaning: first fill missing counts from the description,
# then mean-impute whatever is still missing in 'cylinders'.
# remainder='passthrough' keeps the other columns, which ColumnTransformer
# appends AFTER the imputed 'cylinders' column in its output.
cylclean_pipeline = Pipeline(steps=[('replace_cylinders', CylindersCleaning(cylinder_list)),
                           ('ct', ColumnTransformer(transformers=[('imputer', SimpleImputer(strategy='mean'), 
                                                                   ['cylinders'])],remainder='passthrough'))])

In [None]:
X_train['cylinders'].isna().sum()

In [None]:
# Fit the cylinder-cleaning pipeline on the training split and transform it.
# NOTE(review): the validation and test transforms below are commented out, so
# those two splits are not cylinder-imputed in the cells visible here.
X_train_t = cylclean_pipeline.fit_transform(X_train)
# X_val_t = cylclean_pipeline.transform(X_val)
# X_test_t = cylclean_pipeline.transform(X_test)

In [None]:
X_train_t

In [None]:
# ColumnTransformer outputs the imputed 'cylinders' column first and appends the
# passed-through columns after it, so label the array in that order. Keep the
# original row index (so X_train still lines up with y_train) and then restore
# the original column order.
X_train = pd.DataFrame(
    X_train_t,
    columns=['cylinders'] + [col for col in X_train.columns if col != 'cylinders'],
    index=X_train.index,
)[list(X_train.columns)]
# X_val = pd.DataFrame(X_val_t, columns=X_val.columns)
# X_test = pd.DataFrame(X_test_t, columns=X_test.columns)

In [None]:
X_train['cylinders'].isna().sum()
X_val['cylinders'].isna().sum()
X_test['cylinders'].isna().sum()

In [None]:
X_train

# Drive

In [None]:
# split drive
class SplitDrive(TransformerMixin):
    """Break tokens that contain 'drive' into separate tokens.

    ``transform`` walks an iterable of token sequences and returns a plain
    list of lists. A token without 'drive' is copied unchanged. A token with
    it is split around 'drive', for example:

    * ``'xdrive'``     -> ``'x', 'drive'``
    * ``'drivetrain'`` -> ``'drive', 'train'``
    * ``'drive'``      -> ``'', 'drive'`` (the leading empty piece is kept)
    """

    def fit(self, X, y=None, **fit_params):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        keyword = 'drive'
        result = []
        for tokens in X:
            rebuilt = []
            for token in tokens:
                if keyword not in token:
                    rebuilt.append(token)
                    continue
                pieces = token.split(keyword)
                final_pos = len(pieces) - 1
                for pos, piece in enumerate(pieces):
                    if pos == 0:
                        # Text before the first 'drive' (possibly empty).
                        rebuilt.append(piece)
                    elif piece != '':
                        # Non-empty text after a 'drive': drop a dangling
                        # empty piece, then emit the keyword and the text.
                        if rebuilt[-1] == '':
                            rebuilt.pop()
                        rebuilt += [keyword, piece]
                    elif pos == final_pos:
                        # Token ended with 'drive': emit just the keyword.
                        rebuilt.append(keyword)
            result.append(rebuilt)
        return result



In [None]:
# First level of cleaning - check for 2 drive occurrences
class DriveImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``drive`` values from the tokens between two 'drive' tokens.

    For rows whose ``drive`` is null, the tokens between the first and second
    occurrence of ``'drive'`` in ``description`` are joined into a phrase.
    Every ``drive`` value is then normalised through ``mapping_dict`` and
    finally replaced by NaN unless it is listed in ``drive_master``.
    """

    def __init__(self):
        self.mapping_dict = {'two wheel': 'rwd', 'all wheel': '4wd', '2 wheel': 'rwd', '4 wheel': '4wd','four wheel': '4wd',
                            'awd':'4wd','4x4':'4wd','xdrive':'4wd','quattro':'4wd'}
        self.drive_master = ['rwd','4wd','awd','xdrive','4x4','4matic','fwd','quattro']

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``drive`` filled, mapped and validated."""
        # Copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        nan_rows = X['drive'].isnull()
        X.loc[nan_rows, 'drive'] = X.loc[nan_rows, 'description'].apply(self.get_drive)
        # Map known phrases/aliases; values without a mapping are kept as they are.
        X['drive'] = X['drive'].map(self.mapping_dict).fillna(X['drive'])
        X['drive'] = X['drive'].apply(self.check_drive)
        return X

    def get_drive(self, description):
        """Return the phrase between the first two 'drive' tokens, or NaN.

        Empty-string tokens are skipped when joining: token splitting upstream
        can leave ``''`` entries, which previously produced phrases such as
        ``'four wheel '`` that no ``mapping_dict`` key could match.
        """
        # A missing or untokenized description has no token list to search.
        if not isinstance(description, (list, tuple)):
            return np.nan
        drive_idxs = [i for i, token in enumerate(description) if token == 'drive']
        if len(drive_idxs) < 2:
            return np.nan
        between = description[drive_idxs[0] + 1:drive_idxs[1]]
        return ' '.join(token for token in between if token).lower()

    def check_drive(self, drive):
        """Return ``drive`` if it is a recognised value, otherwise NaN."""
        if drive in self.drive_master:
            return drive
        return np.nan

In [None]:
# Second level of cleaning - check for first drive occurrence


class DriveTransformer(TransformerMixin):
    """Fill missing ``drive`` values from the token that follows 'drive'.

    For each row whose ``drive`` is null, the first ``'drive'`` token in
    ``description`` is located and the next non-empty token is inspected.
    If that token is a recognised drive value it is written to ``drive``
    after normalisation through the mapping; otherwise the row is unchanged.
    """

    # Tokens accepted as a drive value.
    _DRIVE_MASTER = ('rwd', '4wd', 'awd', 'xdrive', '4x4', '4matic', 'fwd',
                     'awdtransmission', 'quattro')
    # Common drive phrases/aliases mapped to their standard value
    # (the duplicated '4x4' entry of the old literal is listed once).
    _MAPPING = {'two wheel': 'rwd', 'all wheel': '4wd', '2 wheel': 'rwd',
                '4 wheel': '4wd', 'four wheel': '4wd', 'awd': '4wd',
                'awdtransmission': '4wd', '4x4': '4wd', 'xdrive': '4wd',
                'quattro': '4wd'}

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing ``drive`` values filled where possible."""
        # Copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        missing = X['drive'].isna()
        # Only rows with a missing drive need to be visited.
        for i, tokens in X.loc[missing, 'description'].items():
            # A missing or untokenized description has no token list to search.
            if not isinstance(tokens, (list, tuple)) or 'drive' not in tokens:
                continue
            first_drive = tokens.index('drive')
            # Only the first non-empty token after 'drive' is considered.
            following = next(
                (token for token in tokens[first_drive + 1:] if token != ''),
                None,
            )
            if following in self._DRIVE_MASTER:
                X.at[i, 'drive'] = self._MAPPING.get(following, following)
        return X

In [None]:
# Apply the year and drive transformers to the full frame, then run `pipeline`.
# NOTE(review): `pipeline` is used on the last line of this cell but is only
# assigned in later cells of this notebook, so this cell relies on out-of-order
# execution and fails on a fresh Restart & Run All.
# NOTE(review): YearTransformer comes from a module named `transformers`;
# presumably a project-local file rather than the Hugging Face package — confirm.
from transformers import YearTransformer
year_transformer = YearTransformer()
drive_imputer = DriveImputer()
vehicles_clean = year_transformer.fit_transform(vehicles_clean)
vehicles_clean = drive_imputer.fit_transform(vehicles_clean)
clean_data = pipeline.fit_transform(vehicles_clean)


In [None]:
from sklearn.pipeline import Pipeline

# Chain the three drive-cleaning steps: split tokens, impute, then transform.
# NOTE(review): SplitDrive.transform iterates X directly and returns a plain
# list of lists, while DriveImputer.transform indexes X['drive'] — confirm what
# input this pipeline is meant to receive.
pipeline = Pipeline([
    ('split_drive', SplitDrive()),
    ('impute_drive', DriveImputer()),
    ('transform_drive', DriveTransformer())
])



In [None]:
class DriveImputer(BaseEstimator, TransformerMixin):
    """Variant of the drive imputer that accepts list-like input and returns a list.

    NOTE(review): this redefines the ``DriveImputer`` name declared earlier in
    the notebook; whichever cell ran last decides which class is in effect.
    """

    def __init__(self):
        self.mapping_dict = {'4wd': 'four_wheel_drive',
                             'fwd': 'front_wheel_drive',
                             'rwd': 'rear_wheel_drive',
                             'awd': 'all_wheel_drive'}

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Fill and map ``drive``, returning the rows as a list of lists.

        ``X`` is converted to a DataFrame and must expose ``drive`` and
        ``description`` columns.
        """
        # Convert to a DataFrame and copy, so a frame passed in is not mutated.
        X = pd.DataFrame(X).copy()
        nan_rows = X['drive'].isnull()
        X.loc[nan_rows, 'drive'] = X.loc[nan_rows, 'description'].apply(self.get_drive)
        # Map abbreviations to long names; values without a mapping are kept.
        X['drive'] = X['drive'].map(self.mapping_dict).fillna(X['drive'])
        return X.values.tolist()  # Convert X back to a list

    def get_drive(self, description):
        """Return the phrase between the first two 'drive' tokens, or NaN.

        ``transform`` called this method, but this class did not define it, so
        any row with a missing drive raised AttributeError.
        """
        if not isinstance(description, (list, tuple)):
            return np.nan
        drive_idxs = [i for i, token in enumerate(description) if token == 'drive']
        if len(drive_idxs) < 2:
            return np.nan
        between = description[drive_idxs[0] + 1:drive_idxs[1]]
        return ' '.join(token for token in between if token).lower()


In [None]:
# Run the drive-cleaning pipeline on the full frame and keep the result in clean_data.
clean_data = pipeline.fit_transform(vehicles_clean)

# Paint

In [None]:
X_train['description'].iloc[12]

In [None]:
X_val['description'].iloc[100]

In [None]:
class SplitExteriorInterior(TransformerMixin):
    """Break tokens that contain 'exterior' or 'interior' into separate tokens.

    ``transform`` walks an iterable of token sequences and returns a plain
    list of lists. A token containing 'exterior' is split around that word;
    otherwise, a token containing 'interior' is split around that word;
    every other token is copied unchanged. A token containing both words is
    split on 'exterior' only.
    """

    @staticmethod
    def _split_around(token, keyword, rebuilt):
        """Append the pieces of ``token`` split around ``keyword`` to ``rebuilt``."""
        pieces = token.split(keyword)
        final_pos = len(pieces) - 1
        for pos, piece in enumerate(pieces):
            if pos == 0:
                # Text before the first keyword (possibly empty).
                rebuilt.append(piece)
            elif piece != '':
                # Non-empty text after a keyword: drop a dangling empty
                # piece, then emit the keyword followed by the text.
                if rebuilt[-1] == '':
                    rebuilt.pop()
                rebuilt += [keyword, piece]
            elif pos == final_pos:
                # Token ended with the keyword: emit just the keyword.
                rebuilt.append(keyword)

    def fit(self, X, y=None, **fit_params):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        result = []
        for tokens in X:
            rebuilt = []
            for token in tokens:
                if 'exterior' in token:
                    self._split_around(token, 'exterior', rebuilt)
                elif 'interior' in token:
                    self._split_around(token, 'interior', rebuilt)
                else:
                    rebuilt.append(token)
            result.append(rebuilt)
        return result

In [None]:
# Pipeline with a single step: split 'exterior'/'interior' out of compound tokens.
split_ext_int_pipeline = Pipeline([
    ('split_ext_int', SplitExteriorInterior())
])

In [None]:
# Split 'exterior'/'interior' tokens in the description column of the full frame.
# NOTE(review): in the cells visible here vehicles_clean['description'] is never
# tokenized (that line is commented out earlier), and SplitExteriorInterior
# iterates over each row's elements — on raw text that means character by
# character. Confirm this is intended.
vehicles_clean['description'] = split_ext_int_pipeline.fit_transform(vehicles_clean['description'])

In [None]:
# Split 'exterior'/'interior' tokens in the training descriptions.
X_train['description'] = split_ext_int_pipeline.fit_transform(X_train['description'])

In [None]:
X_train['description'].iloc[12]

In [None]:
# Apply the same token split to the validation descriptions (transform only).
X_val['description'] = split_ext_int_pipeline.transform(X_val['description'])

In [None]:
X_val['description'].iloc[100]

In [None]:
# Apply the same token split to the test descriptions (transform only).
X_test['description'] = split_ext_int_pipeline.transform(X_test['description'])

## paint_color cleaning

In [None]:
# Tokens accepted as a paint colour when read from a description.
# 'midnightblue' is added next to the existing 'midnightbblue' entry, which
# looks like a typo; the old spelling is kept so nothing that matched before
# stops matching.
paint_master = ['white', 'blue', 'red', 'black', 'silver', 'grey', 'beige','brown', 'burgundy', 
               'gold', 'yellow', 'orange', 'green','purple', 'tan', 'charcoal','anvil', 
               'maroon','gray','champagne','olive','darkblue','darkgreen','lightblue','lightgray',
               'lightgrey','darkgray','darkgrey','teal','sapphireblue','midnightbblue','midnightblue',
               'charcoalgray',
               'bronze','copper','pearlwhite','pearlblack','rossored','brilliantsilve','cyan','magenta',
                'aliceblue','antiquewhite']

In [None]:
# Normalise colour variants to a base colour. Colours without an entry are
# stored unchanged by the imputer.
# Added for consistency with the existing entries: the 'grey' spellings of
# light/dark gray, 'pearlblack' (parallel to 'pearlwhite') and the midnight-blue
# tokens (parallel to the other '<shade>blue' entries).
mapping_dict = {
    'gray': 'grey', 'lightgray': 'grey', 'darkgray': 'grey',
    'lightgrey': 'grey', 'darkgrey': 'grey',
    'whiteinterior': 'white', 'pearlwhite': 'white', 'antiquewhite': 'white',
    'pearlblack': 'black',
    'brilliantsilve': 'silver',
    'sapphireblue': 'blue', 'darkblue': 'blue', 'lightblue': 'blue',
    'aliceblue': 'blue', 'midnightbblue': 'blue', 'midnightblue': 'blue',
    'darkgreen': 'green',
}

In [None]:
from sklearn.base import TransformerMixin

class PaintColorImputer(TransformerMixin):
    """Fill missing ``paint_color`` values from an 'exterior color <x>' pattern.

    Parameters
    ----------
    paint_master : list of str
        Tokens accepted as a colour.
    mapping_dict : dict
        Maps colour variants to the value that should be stored; colours
        without an entry are stored unchanged.
    """

    def __init__(self, paint_master, mapping_dict):
        self.paint_master = paint_master
        self.mapping_dict = mapping_dict

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def _infer_color(self, tokens):
        """Return the normalised exterior colour found in ``tokens``, or None.

        A match is a ``'color'`` token whose preceding token contains
        ``'exterior'`` and whose following token is in ``paint_master``.
        Every ``'color'`` occurrence is examined, not only the first one.
        """
        # A missing or untokenized description has no token list to search.
        if not isinstance(tokens, (list, tuple)):
            return None
        # Start at 1 and stop before the last token so both neighbours exist;
        # this also stops index 0 wrapping around to the last token via [-1].
        for idx in range(1, len(tokens) - 1):
            if tokens[idx] != 'color':
                continue
            previous, candidate = tokens[idx - 1], tokens[idx + 1]
            if (isinstance(previous, str) and 'exterior' in previous
                    and candidate in self.paint_master):
                return self.mapping_dict.get(candidate, candidate)
        return None

    def transform(self, X):
        """Return a copy of ``X`` with missing ``paint_color`` filled where possible."""
        X_new = X.copy()
        missing = X_new['paint_color'].isna()
        # Only rows with a missing colour need to be visited.
        for i, tokens in X_new.loc[missing, 'description'].items():
            color = self._infer_color(tokens)
            if color is not None:
                X_new.at[i, 'paint_color'] = color
        return X_new

In [None]:
# Pipeline with a single step: back-fill paint_color from the description tokens.
# NOTE(review): this rebinds the name `pipeline`, which an earlier cell uses for
# the drive-cleaning pipeline.
pipeline = Pipeline(steps=[('paint_color_imputer', PaintColorImputer(paint_master, mapping_dict))])

In [None]:
vehicles_clean['paint_color'].isna().sum()

In [None]:
# Back-fill missing paint colours on the full frame.
vehicles_clean = pipeline.fit_transform(vehicles_clean)

In [None]:
vehicles_clean['paint_color'].isna().sum()

In [None]:
# color_pipeline = Pipeline([
#     ('paint_color_imputer', PaintColorImputer(paint_master, mapping_dict))
# ])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen subset of columns.

    NOTE(review): a class with this name and the same body is already declared
    earlier in this notebook.

    Parameters
    ----------
    subset : label or list of labels
        Column selection passed to ``X.loc[:, subset]``.
    """

    def __init__(self, subset):
        self.subset = subset

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        # Label-based selection of the configured columns, all rows.
        wanted = self.subset
        return X.loc[:, wanted]

In [None]:
# full_pipeline = Pipeline(steps=[('paint_color_imputer', PaintColorImputer(paint_master, mapping_dict)),
#                            ('ct', ColumnTransformer(transformers=[('imputer', SimpleImputer(strategy='mean'), 
#                                                                    ['paint_color'])],remainder='passthrough'))])

In [None]:
# Back-fill paint_color from the description, then impute what is still missing
# with the most frequent colour.
# remainder='drop' means the ColumnTransformer outputs ONLY the imputed
# paint_color column; every other column is discarded from its result.
full_pipeline = Pipeline(steps=[
    ('paint_color_imputer', PaintColorImputer(paint_master, mapping_dict)),
    ('ct', ColumnTransformer(
        transformers=[
            ('imputer', SimpleImputer(strategy='most_frequent'), ['paint_color'])], 
        remainder='drop'))
])

In [None]:
vehicles_clean['paint_color'].isna().sum()

In [None]:
# full_pipeline returns a single-column array holding only the imputed
# paint_color values (remainder='drop'). Write that column back into the frame
# instead of replacing the whole frame with the array, so vehicles_clean stays
# a DataFrame for the cells that follow.
vehicles_clean = vehicles_clean.assign(
    paint_color=full_pipeline.fit_transform(vehicles_clean).ravel()
)

In [None]:
vehicles_clean['paint_color'].isna().sum()

In [None]:
vehicles_clean

In [None]:
# Same two paint_color steps as full_pipeline, but remainder='passthrough' keeps
# the other columns. ColumnTransformer places the imputed paint_color column
# first and appends the passed-through columns after it.
# NOTE(review): this rebinds the name `pipeline` once more.
pipeline = Pipeline(steps=[('paint_color_imputer', PaintColorImputer(paint_master, mapping_dict)),
                           ('ct', ColumnTransformer(transformers=[('imputer', SimpleImputer(strategy='most_frequent'), 
                                                                   ['paint_color'])],remainder='passthrough'))])

In [None]:
# Run the paint_color pipeline on the full frame; the result is kept in vehicles_clean_t.
vehicles_clean_t = pipeline.fit_transform(vehicles_clean)

In [None]:
# ColumnTransformer outputs the imputed 'paint_color' column first and appends
# the passed-through columns after it, so label the array in that order, keep
# the original row index, and then restore the original column order.
vehicles_clean_t = pd.DataFrame(
    vehicles_clean_t,
    columns=['paint_color'] + [col for col in vehicles_clean.columns if col != 'paint_color'],
    index=vehicles_clean.index,
)[list(vehicles_clean.columns)]

In [None]:
vehicles_clean_t['paint_color'].isna().sum()