In [52]:
import joblib
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

### Testing assets

In [71]:
scaler_path = r"../assets/normalizer.save"
encoder_path = r"../assets/encoder.save"
columns_used_path = r"../assets/columns_used.save"
imputer_path = r"../assets/imputer.save"
model_path = r"../assets/model.save"

scaler = joblib.load(scaler_path)
encoder = joblib.load(encoder_path)
columns_used = joblib.load(columns_used_path)
imputer = joblib.load(imputer_path)
model = joblib.load(model_path)

assets = {"scaler":scaler, "encoder":encoder, "columns_used":columns_used, "imputer":imputer, "model":model}

### Load testing data

In [2]:
testing_data_path = r"../datasets/DatiumTest.rpt"
testing_df = pd.read_csv(testing_data_path, delimiter="\t")

  testing_df = pd.read_csv(testing_data_path, delimiter="\t")


In [3]:
testing_df.shape

(11488, 130)

In [6]:
[print(f"{col} : {null_count}") for col, null_count in testing_df.isnull().sum().items() if null_count>0]

Series : 165
SeriesModelYear : 5822
BadgeDescription : 813
BadgeSecondaryDescription : 10157
BodyConfigDescription : 9057
WheelBaseConfig : 11095
Roofline : 11288
ExtraIdentification : 10550
GearLocationDescription : 4
GearNum : 9
CamDescription : 9
FuelCapacity : 15
MethodOfDeliveryDescription : 12
GrossCombinationMAss : 5723
GrossVehicleMass : 3333
VIN : 54
WheelBase : 4
Height : 13
Length : 18
Width : 10
KerbWeight : 1126
TareMass : 1543
PayLoad : 4110
Power : 4
PowerRPMFrom : 11148
PowerRPMTo : 4
Torque : 3
TorqueRPMFrom : 8486
TorqueRPMTo : 12
RonRating : 4138
ModelCode : 1944
ValvesCylinder : 4
EngineConfigurationDescription : 7
EngineNum : 213
Acceleration : 9148
FrontTyreSize : 18
RearTyreSize : 18
FrontRimDesc : 10
RearRimDesc : 10
TowingBrakes : 519
TowingNoBrakes : 627
WarrantyCustAssist : 5952
FreeScheduledService : 11411
WarrantyYears : 14
WarrantyKM : 36
FirstServiceKM : 1759
FirstServiceMonths : 2537
RegServiceMonths : 322
AltEngEngineType : 11190
AltEngBatteryType : 111

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [45]:
def clean_categorical_columns(input_df:pd.DataFrame):
    """
        Handle the categorical columns by dropping, converting data type

        - drop Model as could be data duplication of other cols
        - drop MakeCode, FamilyCode due to repeated col in Make,Model
        - drop Description due to length of time need to process
        - drop Series as indetifier col
        - drop VIN as identifier
        - drop EngineNum as identifier
        
        - Tyre ratings are in the format of: {width(mm)}/{ratio}R{rim size(inches)}
        - format FrontTyreSize, split into FrontTyreWidth(mm), FrontTyreRatio, FrontTyreRimSize(inches)
        - format RearTyreSize, split into RearTyreWidth(mm), RearTyreRatio, RearTyreRimSize(inches)
        - convert EngineDescription to float 

        - Drop FrontRimDesc, RearRimDesc, will have high correlation to Tyre size
        - Drop Colour, too many variations to format in limited time

        - format Sold_Date, Compliance_Date
    """

    df = input_df.copy()

    cols_to_drop = ['Model', 'MakeCode', 'FamilyCode', 'Description', 'Series', 'VIN', 'EngineNum', ]
    df.drop(cols_to_drop, axis=1, inplace=True)

    #tyre size contains two different formats
    #uncertain what format "'7.50 R16 C 6PR'" belongs to, treat as -1
    #only extract for format like 225/60 R16
    def is_standard_tyre_format(tyre_info):
        
        re_pattern = r"^\d{3}/\d{2}\s{1}R{1}\d{2}"
        pattern = re.compile(re_pattern)
        is_match = pattern.match(tyre_info)

        return is_match
        
    df['RearTyreRadius(inches)'] = df['RearTyreSize'].map(lambda tire_size: int(re.compile(r"R\d{2}").search(tire_size).group().strip().strip('R')[-1]) if is_standard_tyre_format(tire_size) else -1)
    df['RearTyreWidth(mm)'] = df['RearTyreSize'].map(lambda tire_size : int(tire_size.strip().split('/')[0]) if is_standard_tyre_format(tire_size) else -1)
    df['RearTyreRatio'] = df['RearTyreSize'].map(lambda tire_size : int(tire_size.strip().split('/')[1].split('R')[0].strip()) if is_standard_tyre_format(tire_size) else -1)
    df.drop('RearTyreSize', axis=1)

    df['FrontTyreRadius(inches)'] = df['FrontTyreSize'].map(lambda tire_size: int(re.compile(r"R\d{2}").search(tire_size).group().strip().strip('R')[-1]) if is_standard_tyre_format(tire_size) else -1)
    df['FrontTyreWidth(mm)'] = df['FrontTyreSize'].map(lambda tire_size : int(tire_size.strip().split('/')[0]) if is_standard_tyre_format(tire_size) else -1)
    df['FrontTyreRatio'] = df['FrontTyreSize'].map(lambda tire_size : int(tire_size.strip().split('/')[1].split('R')[0].strip()) if is_standard_tyre_format(tire_size) else -1)
    df.drop('FrontTyreSize', axis=1)

    df.loc[df[df['EngineDescription'] == '13B'].index,'EngineDescription'] = 3.0 #cheat here
    df['EngineDescription'] = df['EngineDescription'].astype(float)

    df.drop(['FrontRimDesc', 'RearRimDesc'], axis=1, inplace= True)

    df.drop('Colour', axis=1, inplace=True)

    #"2015-11-03 00:00:00.000"
    datetime_format = "%Y-%m-%d"
    df['Sold_Date'] = pd.to_datetime(df['Sold_Date'].map(lambda dt : dt[:10]), format=datetime_format)
    df['Sold_Year'] = df['Sold_Date'].dt.year
    df['Sold_Month'] = df['Sold_Date'].dt.month
    df.drop('Sold_Date', inplace=True, axis=1)
    
    #02/2008
    datetime_format = "%m/%Y"
    df['Compliance_Date'] = pd.to_datetime(df['Compliance_Date'], format=datetime_format)
    df['Compliance_Year'] = df['Compliance_Date'].dt.year
    df.drop('Compliance_Date', inplace=True, axis=1)

    return df

In [75]:
def normalize_data(input_df:pd.DataFrame, scaler:MinMaxScaler):
    """
        Normalise the input data
    """

    df = input_df.copy()

    SC = scaler
    numerical_df = df.select_dtypes(include=np.number)
    non_numerical_df = df.select_dtypes(exclude=np.number)
    normalized_data = SC.transform(numerical_df)
    normalized_df = pd.DataFrame(normalized_data)
    normalized_df.columns = numerical_df.columns

    normalized_df.reset_index(drop=True, inplace=True)
    non_numerical_df.reset_index(drop=True, inplace=True)

    joined_df = pd.concat([normalized_df,non_numerical_df], axis=1)

    normalize_data.scaler = SC

    return joined_df

In [69]:
def one_hot_encode(input_df:pd.DataFrame, encoder:OneHotEncoder):
    """
        One hot encode on the categorical cols
    """

    df = input_df.copy()

    non_numerical_cols_df = df[encoder.feature_names_in_]
    numerical_cols_df = df.drop(encoder.feature_names_in_, axis=1)
    
    non_numerical_cols_df.astype(str)

    en = encoder
    one_hot_encoded = en.transform(non_numerical_cols_df)

    one_hot_encode.transformer = en
    one_hot_df = pd.DataFrame(one_hot_encoded)
    one_hot_encode.one_hot_df = one_hot_df
    one_hot_df.columns = en.get_feature_names_out()
    numerical_cols_df.reset_index(drop=True, inplace=True)
    one_hot_df.reset_index(drop=True, inplace=True)
    joined_df = pd.concat([numerical_cols_df, one_hot_df], 1)

    return joined_df

In [62]:
def clean_data(input_df:pd.DataFrame, assets:dict):
    """
        Clean the testing data
    """

    df = input_df.copy()
    features = df[assets["columns_used"]]

    imputer = assets["imputer"]
    imputed_vals = imputer.transform(features)
    imputed_df = pd.DataFrame(imputed_vals)
    imputed_df.columns = features.columns

    formatted_df = clean_categorical_columns(imputed_df)

    normalized_df= normalize_data(formatted_df, assets['scaler'])

    encoded_df = one_hot_encode(normalized_df, assets['encoder'])
    

    

    return encoded_df

    

In [66]:
encoder = assets['encoder']

In [68]:
encoder.feature_names_in_

array(['Make', 'CurrentRelease', 'ImportFlag', 'LimitedEdition',
       'BodyStyleDescription', 'DriveDescription', 'DriveCode',
       'GearTypeDescription', 'GearLocationDescription',
       'FuelTypeDescription', 'InductionDescription', 'OptionCategory',
       'CamDescription', 'EngineTypeDescription',
       'FuelDeliveryDescription', 'MethodOfDeliveryDescription',
       'BuildCountryOriginDescription', 'EngineCycleDescription',
       'EngineConfigurationDescription', 'EngineLocation',
       'FrontTyreSize', 'RearTyreSize', 'VFactsClass', 'VFactsSegment',
       'IsPPlateApproved', 'Branch', 'SaleCategory'], dtype=object)

In [72]:
cleaned_features_df =  clean_data(testing_df, assets)

  joined_df = pd.concat([numerical_cols_df, one_hot_df], 1)


In [73]:
predictions = model.predict(cleaned_features_df.values)



In [74]:
cleaned_features_df.head()

Unnamed: 0,EngineDescription,RearTyreRadius(inches),RearTyreWidth(mm),RearTyreRatio,FrontTyreRadius(inches),FrontTyreWidth(mm),FrontTyreRatio,Sold_Year,Sold_Month,Compliance_Year,...,Branch_Tamworth (NSW),Branch_Townsville (QLD),Branch_Tullamarine (VIC),Branch_Welshpool (WA),SaleCategory_Auction,SaleCategory_Dealer Only Auction,SaleCategory_Fixed Price,SaleCategory_Pickles Online,SaleCategory_Special Fixed Price,SaleCategory_Tender
0,0.006687,0.7,0.836601,0.876543,0.7,0.864865,0.876543,0.5,0.636364,0.974359,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.012483,1.0,0.803922,0.691358,1.0,0.831081,0.691358,0.5,0.363636,0.991453,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.008025,0.8,0.705882,0.62963,0.8,0.72973,0.62963,0.0,0.818182,0.957265,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.005796,0.8,0.705882,0.753086,0.8,0.72973,0.753086,0.0,0.181818,0.974359,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.008025,0.7,0.673203,1.0,0.7,0.695946,1.0,0.0,1.0,0.948718,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
