<a href="https://colab.research.google.com/github/richlee-Lee/richlee-code-book/blob/main/chapter8/sklearn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the train / validation / test CSV splits into the working
# directory; -q keeps wget from printing progress output.
!wget -q https://storage.googleapis.com/low-code-ai-book/car_prices_train.csv
!wget -q https://storage.googleapis.com/low-code-ai-book/car_prices_valid.csv
!wget -q https://storage.googleapis.com/low-code-ai-book/car_prices_test.csv

In [None]:
import pandas as pd

# Read the train / validation / test CSVs from the working directory,
# one DataFrame per split.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./car_prices_train.csv',
                     './car_prices_valid.csv',
                     './car_prices_test.csv')
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Summary statistics for the training split; include='all' also reports
# the non-numeric columns.
train_df.describe(include='all')

In [None]:
# Same summary for the validation split, for comparison with training.
valid_df.describe(include='all')

In [None]:
# Same summary for the test split.
test_df.describe(include='all')

In [None]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Separate the regression target ('sellingprice') from the feature columns.
X_train = train_df.drop(columns='sellingprice')
y_train = train_df['sellingprice']

def drop_columns(df, columns):
    """Return a new DataFrame equal to ``df`` minus the listed columns.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Column label or list of labels to remove.

    Returns:
        A DataFrame without ``columns``.
    """
    trimmed = df.drop(columns=columns)
    return trimmed

# First pipeline stage: remove 'Unnamed: 0' and 'mmr' so they are never
# used as features.
preproc_cols = FunctionTransformer(
    func=drop_columns,
    kw_args={"columns": ['Unnamed: 0', 'mmr']},
)

categorical_columns = ['make', 'model', 'trim', 'body',
                       'transmission', 'state', 'color', 'interior']
numeric_columns = ['year', 'condition', 'odometer']

# Second stage: one-hot encode the categorical columns and rescale the
# numeric ones with min-max scaling.
col_transformer = ColumnTransformer(transformers=[
    ('ohe',
     OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist'),
     categorical_columns),
    ('minmax', MinMaxScaler(), numeric_columns),
])

# Final stage: ordinary least-squares regression on the encoded features.
model = LinearRegression()

pipeline = Pipeline(steps=[
    ('preproc_cols', preproc_cols),
    ('col_transformer', col_transformer),
    ('model', model),
])

pipeline.fit(X_train, y_train)


In [None]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Validation split: target and features, used to score the fitted pipeline.
y_valid = valid_df['sellingprice']
X_valid = valid_df.drop('sellingprice', axis=1)

# Predict once and reuse the result for both error metrics instead of
# running the whole pipeline over X_valid separately for each one.
y_valid_pred = pipeline.predict(X_valid)

print('R2:', pipeline.score(X_valid, y_valid))
print('RMSE:', math.sqrt(mean_squared_error(y_valid, y_valid_pred)))
print('MAE:', mean_absolute_error(y_valid, y_valid_pred))


In [None]:
import pandas as pd

from sklearn.preprocessing import (OneHotEncoder, MinMaxScaler,
                                   FunctionTransformer,
                                   KBinsDiscretizer)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Separate the regression target ('sellingprice') from the feature columns.
X_train = train_df.drop(columns='sellingprice')
y_train = train_df['sellingprice']

def preproc_cols(df, drop_cols):
    """Return a new DataFrame equal to ``df`` minus the ``drop_cols`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        drop_cols: Column label or list of labels to remove.

    Returns:
        A DataFrame without ``drop_cols``.
    """
    remaining = df.drop(columns=drop_cols)
    return remaining

# Stage 1: drop 'Unnamed: 0' and 'mmr' so they are never used as features.
drop_cols = FunctionTransformer(preproc_cols, kw_args={"drop_cols":['Unnamed: 0', 'mmr']})
ohe = OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist')
minmax = MinMaxScaler()
# 'condition' is split into 10 equal-width bins and one-hot encoded
# instead of being min-max scaled with the other numeric columns.
bucket_cond = KBinsDiscretizer(n_bins=10, encode='onehot', strategy='uniform')

numeric_columns = ['year', 'odometer']
categorical_columns = ['make', 'model', 'trim', 'body',
                       'transmission', 'state', 'color', 'interior']

col_transformer = ColumnTransformer(
  [('ohe', ohe, categorical_columns),
   ('minmax', minmax, numeric_columns),
   ('bucket_cond', bucket_cond, ['condition'])])

# Create the estimator here: the rest of this cell is self-contained, and
# the pipeline should not depend on a `model` object left behind by an
# earlier cell.
model = LinearRegression()

pipeline = Pipeline(steps=[('drop_cols' , drop_cols),
                           ('col_transformer', col_transformer),
                           ('model', model)])

pipeline.fit(X_train, y_train)


In [None]:
# Predict once and reuse the result for both error metrics instead of
# running the whole pipeline over X_valid separately for each one.
y_valid_pred = pipeline.predict(X_valid)

print('R2:', pipeline.score(X_valid, y_valid))
print('RMSE:', math.sqrt(mean_squared_error(y_valid, y_valid_pred)))
print('MAE:', mean_absolute_error(y_valid, y_valid_pred))

In [None]:
# Number of distinct 'trim' values that occur exactly once in the training split.
train_df['trim'].value_counts().eq(1).sum()

In [None]:
import pandas as pd

from sklearn.preprocessing import (OneHotEncoder, MinMaxScaler,
                                   FunctionTransformer,
                                   KBinsDiscretizer)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Separate the regression target ('sellingprice') from the feature columns.
X_train = train_df.drop(columns='sellingprice')
y_train = train_df['sellingprice']

def preproc_cols(df, drop_cols):
    """Return a new DataFrame equal to ``df`` minus the ``drop_cols`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        drop_cols: Column label or list of labels to remove.

    Returns:
        A DataFrame without ``drop_cols``.
    """
    remaining = df.drop(columns=drop_cols)
    return remaining

# Stage 1: drop 'Unnamed: 0' and 'mmr' so they are never used as features.
drop_cols = FunctionTransformer(preproc_cols, kw_args={"drop_cols":['Unnamed: 0', 'mmr']})
# NOTE(review): with an integer threshold of 1, no category seen in training
# can have a count below it - confirm this is the intended value.
ohe = OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist', min_frequency=1)
minmax = MinMaxScaler()
# 'condition' is split into 10 equal-width bins and one-hot encoded.
bucket_cond = KBinsDiscretizer(n_bins=10, encode='onehot', strategy='uniform')

numeric_columns = ['year', 'odometer']
categorical_columns = ['make', 'model', 'trim', 'body',
                       'transmission', 'state', 'color', 'interior']

col_transformer = ColumnTransformer(
  [('ohe', ohe, categorical_columns),
   ('minmax', minmax, numeric_columns),
   ('bucket_cond', bucket_cond, ['condition'])])

# Create the estimator here: the rest of this cell is self-contained, and
# the pipeline should not depend on a `model` object left behind by an
# earlier cell.
model = LinearRegression()

pipeline = Pipeline(steps=[('drop_cols' , drop_cols),
                           ('col_transformer', col_transformer),
                           ('model', model)])

pipeline.fit(X_train, y_train)


In [None]:
# Predict once and reuse the result for both error metrics instead of
# running the whole pipeline over X_valid separately for each one.
y_valid_pred = pipeline.predict(X_valid)

print('R2:', pipeline.score(X_valid, y_valid))
print('RMSE:', math.sqrt(mean_squared_error(y_valid, y_valid_pred)))
print('MAE:', mean_absolute_error(y_valid, y_valid_pred))

In [None]:
import pandas as pd

from sklearn.preprocessing import (OneHotEncoder, MinMaxScaler,
                                   FunctionTransformer,
                                   KBinsDiscretizer)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Separate the regression target ('sellingprice') from the feature columns.
X_train = train_df.drop(columns='sellingprice')
y_train = train_df['sellingprice']

def preproc_cols(df, drop_cols):
    """Add the crossed categorical features, then drop unwanted columns.

    Two columns are derived: ``model_trim`` (``model`` + ``trim``) and
    ``color_interior`` (``color`` + ``interior``), both lower-cased.

    The derived columns are built with ``DataFrame.assign`` so the result is
    a new frame. Assigning onto ``df`` directly would add the columns to the
    caller's DataFrame (e.g. ``X_train`` / ``X_valid``) as a side effect of
    every fit and predict call.

    Args:
        df: Input DataFrame containing 'model', 'trim', 'color' and
            'interior'; it is not modified.
        drop_cols: Column label or list of labels to remove from the result.

    Returns:
        A new DataFrame with the two crossed columns appended and
        ``drop_cols`` removed.
    """
    crossed = df.assign(
        model_trim=(df['model'] + df['trim']).str.lower(),
        color_interior=(df['color'] + df['interior']).str.lower(),
    )
    return crossed.drop(drop_cols, axis=1)

# Stage 1: add the crossed columns and drop 'Unnamed: 0' and 'mmr'
# (both done inside preproc_cols).
drop_cols = FunctionTransformer(preproc_cols, kw_args={"drop_cols":['Unnamed: 0', 'mmr']})
ohe = OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist', min_frequency=1)
minmax = MinMaxScaler()
# 'condition' is split into 10 equal-width bins and one-hot encoded.
bucket_cond = KBinsDiscretizer(n_bins=10, encode='onehot', strategy='uniform')

numeric_columns = ['year', 'odometer']
# 'model_trim' and 'color_interior' are created by preproc_cols, so they
# exist by the time the ColumnTransformer selects them.
categorical_columns = ['make', 'model', 'trim', 'model_trim', 'body',
                       'transmission', 'state', 'color', 'interior',
                       'color_interior']

col_transformer = ColumnTransformer(
  [('ohe', ohe, categorical_columns),
   ('minmax', minmax, numeric_columns),
   ('bucket_cond', bucket_cond, ['condition'])])

# Create the estimator here: the rest of this cell is self-contained, and
# the pipeline should not depend on a `model` object left behind by an
# earlier cell.
model = LinearRegression()

pipeline = Pipeline(steps=[('drop_cols' , drop_cols),
                           ('col_transformer', col_transformer),
                           ('model', model)])

pipeline.fit(X_train, y_train)

In [None]:
# Predict once and reuse the result for both error metrics instead of
# running the whole pipeline over X_valid separately for each one.
y_valid_pred = pipeline.predict(X_valid)

print('R2:', pipeline.score(X_valid, y_valid))
print('RMSE:', math.sqrt(mean_squared_error(y_valid, y_valid_pred)))
print('MAE:', mean_absolute_error(y_valid, y_valid_pred))

In [None]:
import pandas as pd

from sklearn.preprocessing import (OneHotEncoder, MinMaxScaler,
                                   FunctionTransformer,
                                   KBinsDiscretizer)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Separate the regression target ('sellingprice') from the feature columns.
X_train = train_df.drop(columns='sellingprice')
y_train = train_df['sellingprice']

def preproc_cols(df, drop_cols):
    """Add the crossed categorical features, then drop unwanted columns.

    Two columns are derived: ``model_trim`` (``model`` + ``trim``) and
    ``color_interior`` (``color`` + ``interior``), both lower-cased.

    The derived columns are built with ``DataFrame.assign`` so the result is
    a new frame. Assigning onto ``df`` directly would add the columns to the
    caller's DataFrame (e.g. ``X_train`` / ``X_valid``, or the fold slices a
    cross-validated search passes in) as a side effect of every call.

    Args:
        df: Input DataFrame containing 'model', 'trim', 'color' and
            'interior'; it is not modified.
        drop_cols: Column label or list of labels to remove from the result.

    Returns:
        A new DataFrame with the two crossed columns appended and
        ``drop_cols`` removed.
    """
    crossed = df.assign(
        model_trim=(df['model'] + df['trim']).str.lower(),
        color_interior=(df['color'] + df['interior']).str.lower(),
    )
    return crossed.drop(drop_cols, axis=1)

# Stage 1: add the crossed columns and drop 'Unnamed: 0' and 'mmr'
# (both done inside preproc_cols).
drop_cols = FunctionTransformer(preproc_cols, kw_args={"drop_cols":['Unnamed: 0', 'mmr']})
ohe = OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist', min_frequency=1)
minmax = MinMaxScaler()
# One discretizer per numeric feature so each bin count can be tuned
# independently; n_bins=10 is only the starting value, the grid below
# overrides it.
bucket_cond = KBinsDiscretizer(n_bins=10, encode='onehot',
    strategy='uniform')
bucket_odo = KBinsDiscretizer(n_bins=10, encode='onehot',
    strategy='quantile')
bucket_year = KBinsDiscretizer(n_bins=10, encode='onehot',
    strategy='uniform')

# Defined here explicitly: the 'minmax' step below needs it, and this cell
# previously only worked because the name was left over from an earlier cell.
numeric_columns = ['year', 'odometer']
categorical_columns = ['make', 'model', 'trim', 'model_trim', 'body',
                       'transmission', 'state', 'color', 'interior',
                       'color_interior']

# 'year' and 'odometer' are fed in both min-max scaled and bucketized form.
col_transformer = ColumnTransformer(
  [('ohe', ohe, categorical_columns),
   ('minmax', minmax, numeric_columns),
   ('bucket_cond', bucket_cond, ['condition']),
   ('bucket_odo', bucket_odo, ['odometer']),
   ('bucket_year', bucket_year, ['year'])]
   )

# Create the estimator here so the pipeline does not depend on a `model`
# object left behind by an earlier cell.
model = LinearRegression()

pipeline = Pipeline(steps=[('drop_cols' , drop_cols),
                           ('col_transformer', col_transformer),
                           ('model', model)])

# Search space, addressed as <pipeline step>__<transformer name>__<parameter>.
# min_frequency starts at 1: recent scikit-learn releases validate an integer
# min_frequency as >= 1, so a candidate of 0 would fail to fit.
# NOTE(review): confirm against the scikit-learn version in use.
grid_params = {'col_transformer__bucket_cond__n_bins': range(8,13),
               'col_transformer__bucket_odo__n_bins': range(8,13),
               'col_transformer__bucket_year__n_bins': range(8,13),
               'col_transformer__ohe__min_frequency': range(1, 5)
              }

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Successive-halving search over grid_params, 3-fold CV, ranked by MAE
# (negated because scikit-learn scorers are maximized).
# random_state is fixed so that any random subsampling done by the search
# is repeatable and the reported best parameters do not change between runs.
# NOTE(review): the seed value itself is arbitrary.
grid_search = HalvingGridSearchCV(pipeline, grid_params,
                                  cv=3, verbose=0,
                                  scoring='neg_mean_absolute_error',
                                  random_state=0)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
# Rebuild the validation target and features from valid_df.
y_valid = valid_df['sellingprice']
X_valid = valid_df.drop('sellingprice', axis=1)

# grid_search.predict delegates to the best estimator found by the search;
# predict once and reuse the result for both metrics.
y_valid_pred = grid_search.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, y_valid_pred))
# The square root of the mean squared error is the RMSE, so it is labelled
# as such (it was previously printed under a second 'MAE:' label).
print('RMSE:', math.sqrt(mean_squared_error(y_valid, y_valid_pred)))