# Homework 12

https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning

* Implement a mini batch functionality to train a regressor.
    - (Optional) If anyone want to do this in a pipeline can do this: https://koaning.github.io/tokenwiser/api/pipeline.html

* Save model, load the model again and test it on `X_test` __Do NOT commit the pickle file__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def test_df():
    url = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv'
    df = pd.read_csv(url, low_memory=False)

    df = df.sample(5000, random_state=100).reset_index(drop=True)
    
    y = df['sellingprice']
    X = df.drop('sellingprice', axis=1)
    
    return X, y

def partial_df():
    url = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/car_prices/car_prices.csv'
    chunk_size = 100
    df_chunk = pd.read_csv(url, low_memory=False, chunksize=chunk_size)
   
    for chunk in df_chunk:
        yield chunk

gen = partial_df()


In [3]:
X_test, y_test = test_df()

In [4]:
# each time you call this you will get a new slice of the dataframe.
next(gen)

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639,white,black,"kia motors america, inc",20500,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393,white,beige,"kia motors america, inc",20800,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331,gray,black,financial services remarketing (lease),31900,30000,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282,white,black,volvo na rep/world omni,27500,27750,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641,gray,black,financial services remarketing (lease),66000,67000,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2014,BMW,6 Series,650i,Convertible,automatic,wbayp9c53ed169288,ca,3.7,8891,silver,black,the hertz corporation,68000,67200,Wed Dec 17 2014 12:30:00 GMT-0800 (PST)
96,2014,Chevrolet,Cruze,2LT,Sedan,automatic,1g1pe5sb8e7420706,ca,2.0,8154,silver,black,enterprise vehicle exchange / tra / rental / t...,14150,10000,Thu Dec 18 2014 12:00:00 GMT-0800 (PST)
97,2014,Chevrolet,Silverado 1500,LT,Crew Cab,automatic,3gcpcrecxeg363552,ca,3.7,6726,black,black,repo remarketing/visterra credit union,26900,30250,Wed Dec 17 2014 12:30:00 GMT-0800 (PST)
98,2014,Chevrolet,Cruze,2LT,Sedan,automatic,1g1pe5sb4e7109323,ca,3.7,36771,silver,black,avis rac/san leandro,13000,13600,Wed Dec 17 2014 12:30:00 GMT-0800 (PST)


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
def train_regressor_with_mini_batch(n, batch_size=100, num_epochs=10):
    

    # Define preprocessing steps for numerical and categorical columns
    numerical_preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_preprocessor = OneHotEncoder(handle_unknown='ignore')

    # Define column transformer to apply different preprocessing steps to different columns
    column_transformer = ColumnTransformer([
        ('numerical', numerical_preprocessor, ['year', 'odometer']),
        ('categorical', categorical_preprocessor, ['make', 'model', 'trim', 'body', 'transmission', 'vin', 'state', 'condition', 'color', 'interior', 'seller', 'mmr', 'saledate']),
    ])

    # Define the pipeline
    pipeline = Pipeline([
        ('preprocess', column_transformer),
        ('regressor', SGDRegressor(learning_rate='constant', eta0=0.01))
    ])

    for epoch in range(num_epochs):

        for i in range(n// batch_size):
            df_batch = next(gen)
            X_batch = df_batch.drop('sellingprice', axis=1)
            y_batch = df_batch['sellingprice']

            # Fit the regressor on the mini-batch using fit
            pipeline.fit(X_batch, y_batch)
        print(".", end="")

    return pipeline

In [7]:
n = len(X_test)
regressor = train_regressor_with_mini_batch(n, batch_size=10, num_epochs=10)

..........

In [8]:
import joblib
# Save the trained model
joblib.dump(regressor, 'regressor_model.pkl')

['regressor_model.pkl']

In [9]:
# Load the model from file
loaded_regressor = joblib.load('regressor_model.pkl')

In [10]:
y_pred = loaded_regressor.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)

Mean Squared Error: 49687333.0059129
