In [9]:
from feature_engine.pipeline import Pipeline as pipeline
from feature_engine.encoding import RareLabelEncoder as rare_encoder
from feature_engine.encoding import MeanEncoder as target_encoder
from feature_engine.imputation import MeanMedianImputer as median_imputer
from feature_engine.imputation import CategoricalImputer as categorical_imputer
from feature_engine.outliers import Winsorizer as winsorizer

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
from xgboost import XGBRegressor as xgbr
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [5]:
# Load the raw training data into a DataFrame.
train_csv_path = "/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/train.csv"
df = pd.read_csv(train_csv_path)

In [6]:
def understand_df_compact(df):
    """Print a compact overview of ``df`` to stdout.

    The report has four sections, each introduced by a separator line:
    the shape, the ``describe()`` summary, the number of duplicated rows,
    and a per-column table (unique values, percentage of missing values,
    dtype) sorted by the percentage of missing values, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    separator = "———————————————————————————————————————————————————————————"

    # Shape of the frame
    print(separator)
    print("Dimension of the dataset is", df.shape, "\n")

    # describe() summary
    print(separator)
    print("Summary of the dataset is \n", df.describe(), "\n")

    # Count of duplicated rows
    print(separator)
    print("Number of duplicates:")
    print(df.duplicated().sum())

    # One record per column: name, distinct values, % missing, dtype
    row_count = df.shape[0]
    per_column = [
        (name, df[name].nunique(), df[name].isnull().sum() * 100 / row_count, df[name].dtype)
        for name in df.columns
    ]
    stats_df = pd.DataFrame(
        per_column,
        columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Type'],
    )
    ranked = stats_df.sort_values('Percentage of missing values', ascending=False)

    print(separator)
    print(f"Statistics of the dataset are \n {ranked}\n\n")

In [7]:
# Print the shape, describe() summary, duplicate count and per-column statistics of the raw frame.
understand_df_compact(df)

———————————————————————————————————————————————————————————
Dimension of the dataset is (188533, 13) 

———————————————————————————————————————————————————————————
Summary of the dataset is 
                   id     model_year         milage         price
count  188533.000000  188533.000000  188533.000000  1.885330e+05
mean    94266.000000    2015.829998   65705.295174  4.387802e+04
std     54424.933488       5.660967   49798.158076  7.881952e+04
min         0.000000    1974.000000     100.000000  2.000000e+03
25%     47133.000000    2013.000000   24115.000000  1.700000e+04
50%     94266.000000    2017.000000   57785.000000  3.082500e+04
75%    141399.000000    2020.000000   95400.000000  4.990000e+04
max    188532.000000    2024.000000  405000.000000  2.954083e+06 

———————————————————————————————————————————————————————————
Number of duplicates:
0
———————————————————————————————————————————————————————————
Statistics of the dataset are 
          Feature  Unique_values  Percentage of

In [8]:
# Remove the columns that are not part of the feature set used further down.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError for columns
# that have already been dropped.
df = df.drop(columns=["id", "ext_col", "int_col", "clean_title"], errors="ignore")

In [10]:
def discover_categorical_and_numerical(df, discrete_threshold=20):
  """Split the columns of ``df`` into categorical, discrete and continuous lists.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect. NOTE: columns classified as discrete are re-cast to
      object dtype **in place**, so the caller's frame is modified.
  discrete_threshold : int, default 20
      A numeric column is treated as discrete when it has fewer than this many
      distinct values (as counted by ``Series.unique()``).

  Returns
  -------
  tuple[list, list, list]
      ``(categorical, discrete, numerical)`` column-name lists, where
      ``numerical`` holds only the numeric columns that are not discrete.
  """
  categorical = df.select_dtypes(include=object).columns.to_list()
  numeric_columns = df.select_dtypes(include=np.number).columns.to_list()

  # numeric columns with only a handful of distinct values are treated as discrete
  discrete = [var for var in numeric_columns if len(df[var].unique()) < discrete_threshold]

  # categorical encoders work only with object-typed variables, so numeric
  # columns that should be handled as categories are re-cast; skipped when
  # there is nothing to re-cast
  if discrete:
    df[discrete] = df[discrete].astype('O')

  # whatever is numeric but not discrete is continuous
  numerical = [var for var in numeric_columns if var not in discrete]

  print(f"There are {len(categorical)} categorical, {len(discrete)} discrete numerical, and {len(numerical)} continuous numerical variables / features in the dataset")
  return categorical, discrete, numerical

In [11]:
# Classify the columns of df; the helper also prints a one-line summary of the counts.
categorical, discrete, numerical = discover_categorical_and_numerical(df)
# Echo the three column-name lists as the cell output.
categorical, discrete, numerical

There are 6 categorical, 0 discrete numerical, and 3 continuous numerical variables / features in the dataset


(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'accident'],
 [],
 ['model_year', 'milage', 'price'])

In [12]:
# Separate the feature columns from the 'price' column, which is the training target.
x_train = df.drop(columns=['price'])
y_train = df['price']

In [15]:
# 'price' is the target, so it must not be listed among the numerical feature
# columns. Filtering (rather than list.remove) keeps the cell re-runnable:
# list.remove raises ValueError once 'price' is no longer in the list.
numerical = [col for col in numerical if col != "price"]

In [24]:
# Write the features and the target to CSV.
# index=False: the frame still carries the default index produced by read_csv,
# which holds no information and would otherwise be read back by pd.read_csv
# as an extra 'Unnamed: 0' column.
x_train.to_csv("/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/x_train.csv", index=False)
y_train.to_csv("/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/y_train.csv", index=False)

In [18]:
# Preprocessing steps, applied in the order listed.
preprocessing_steps = [
    # numerical features only: fill missing values with the median of the feature
    ('median_imputation', median_imputer(imputation_method='median', variables=numerical)),

    # cap outliers instead of removing them, which preserves the useful
    # information in the data and keeps the distribution relatively intact
    ('outlier_treatment', winsorizer(capping_method='iqr', variables=numerical)),

    # fill NA / missing values in the categorical features
    ('missing_val_encoding', categorical_imputer(variables=categorical, return_object=False, ignore_format=False)),

    # group rare values within a categorical feature under a common label
    ('rare_val_encoding', rare_encoder(variables=categorical, tol=0.05, n_categories=10)),

    # replace each category with the average target value of its group
    ('target_encoder', target_encoder(variables=categorical)),
]
processor = pipeline(preprocessing_steps)

# Fit every step on the training data and transform it in one pass.
x_train_transformed = processor.fit_transform(x_train, y_train)
x_train_transformed.head(2)



Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,accident
0,39350.16225,43878.016178,2007,202327.5,43360.013782,43878.016178,31711.97914,49024.804144
1,39350.16225,43878.016178,2002,143250.0,43360.013782,43878.016178,31711.97914,25334.071475


In [None]:
# Display x_train as the cell output.
x_train

In [20]:
import pickle

In [23]:
# Create the XGBoost regressor and fit it on the preprocessed features.
model = xgbr(n_jobs=-1)
model.fit(x_train_transformed, y_train)

# Pickle the fitted model; the with-block closes the file handle afterwards.
filename = '/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/xgbr_model.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [25]:
# Pickle the preprocessing pipeline into the same directory as the model file.
preprocessor_path = '/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/preprocessor.pkl'
with open(preprocessor_path, mode='wb') as preprocessor_file:
    pickle.dump(processor, preprocessor_file)

In [26]:
# Column names (and their order) of the transformed feature frame.
x_train_transformed.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'accident'],
      dtype='object')

In [30]:
# Reload the saved features from disk and keep only the feature columns, in a fixed order.
feature_columns = ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine', 'transmission', 'accident']
x_train = pd.read_csv('/Users/owner/Desktop/data_science/ml_course/aipi510/assignment9/x_train.csv')[feature_columns]

In [36]:
# Number of distinct values in the 'model' column.
x_train["model"].nunique()

1897

In [32]:
# Print the distinct values of every column, one array per column.
for column_name in x_train.columns:
    distinct_values = x_train[column_name].unique()
    print(distinct_values)

['MINI' 'Lincoln' 'Chevrolet' 'Genesis' 'Mercedes-Benz' 'Audi' 'Ford'
 'BMW' 'Tesla' 'Cadillac' 'Land' 'GMC' 'Toyota' 'Hyundai' 'Volvo'
 'Volkswagen' 'Buick' 'Rivian' 'RAM' 'Hummer' 'Alfa' 'INFINITI' 'Jeep'
 'Porsche' 'McLaren' 'Honda' 'Lexus' 'Dodge' 'Nissan' 'Jaguar' 'Acura'
 'Kia' 'Mitsubishi' 'Rolls-Royce' 'Maserati' 'Pontiac' 'Saturn' 'Bentley'
 'Mazda' 'Subaru' 'Ferrari' 'Aston' 'Lamborghini' 'Chrysler' 'Lucid'
 'Lotus' 'Scion' 'smart' 'Karma' 'Plymouth' 'Suzuki' 'FIAT' 'Saab'
 'Bugatti' 'Mercury' 'Polestar' 'Maybach']
['Cooper S Base' 'LS V8' 'Silverado 2500 LT' ... 'e-Golf SE'
 'Integra w/A-Spec Tech Package' 'IONIQ Plug-In Hybrid SEL']
[2007 2002 2017 2021 2018 2016 2020 2015 2011 2013 2023 2019 2012 2014
 2008 2009 2022 2003 2005 2001 2006 2000 2010 2004 1997 1998 1999 1994
 1993 1996 1995 2024 1974 1992]
[213000 143250 136731 ...   4721   2922 134603]
['Gasoline' 'E85 Flex Fuel' nan 'Hybrid' 'Diesel' 'Plug-In Hybrid' '–'
 'not supported']
['172.0HP 1.6L 4 Cylinder Engine Gas

In [None]:
# Brand names as printed by the unique-values cell for the 'brand' column.
# The commas are essential: adjacent string literals without them are
# concatenated by Python into a single string, which would leave this list
# with one element instead of one entry per brand.
manufacturers = [
    'MINI', 'Lincoln', 'Chevrolet', 'Genesis', 'Mercedes-Benz', 'Audi', 'Ford',
    'BMW', 'Tesla', 'Cadillac', 'Land', 'GMC', 'Toyota', 'Hyundai', 'Volvo',
    'Volkswagen', 'Buick', 'Rivian', 'RAM', 'Hummer', 'Alfa', 'INFINITI', 'Jeep',
    'Porsche', 'McLaren', 'Honda', 'Lexus', 'Dodge', 'Nissan', 'Jaguar', 'Acura',
    'Kia', 'Mitsubishi', 'Rolls-Royce', 'Maserati', 'Pontiac', 'Saturn', 'Bentley',
    'Mazda', 'Subaru', 'Ferrari', 'Aston', 'Lamborghini', 'Chrysler', 'Lucid',
    'Lotus', 'Scion', 'smart', 'Karma', 'Plymouth', 'Suzuki', 'FIAT', 'Saab',
    'Bugatti', 'Mercury', 'Polestar', 'Maybach',
]

