In [None]:
import csv
import pandas as pd
import numpy as np
import datetime
import matplotlib.pylab as plt
import os
import importlib

import my_transformers
import utils
import const

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [None]:
# Re-execute the utils module so edits made on disk are picked up without
# restarting the kernel.
importlib.reload(utils)

# NOTE(review): PrepareData lives in utils and is not visible here; presumably
# it loads and prepares the housing data and exposes it as `.df` - confirm.
df = utils.PrepareData().df
# Preview of the first rows (shown as the cell output).
df.head()

In [None]:
# Reload the project transformers first so the names imported on the next line
# come from the freshly re-executed module.
importlib.reload(my_transformers)
from my_transformers import DropColumns, YearTransformer, ColumnToDateFormat, Drop33Rooms

# Cleaning steps applied to the raw frame, in order.
# NOTE(review): the step semantics are defined in my_transformers (not visible
# here); the descriptions are inferred from the step names - confirm.
transform_pipeline = Pipeline([
        ('yr_built_transformer', YearTransformer(column='yr_built')),
        ('33_bedrooms_row_drop', Drop33Rooms()),
        ('clean1', DropColumns(columns=['id', 'date', 'yr_renovated', 'bathrooms'])),
        # ('clean2', DropColumns(columns=['criminal_activities'])),
        ('clean3', DropColumns(columns=['zipcode'])),
        
     ])
# The pipeline is used without a prior fit(); this relies on every step being
# stateless. NOTE(review): newer scikit-learn releases warn when transform() is
# called on an unfitted Pipeline - confirm against the pinned version.
ndf = transform_pipeline.transform(df)
# ndf.head()

# Linear Regression

In [None]:
# Split into train / hold-out sets. 'price' is the regression target; both
# 'price' and 'price_bin' are kept out of the feature matrix
# (presumably price_bin is derived from price - TODO confirm).
train_X, test_X, train_y, test_y = train_test_split(
    ndf.drop(columns=['price', 'price_bin']),
    ndf['price'],
    random_state=const.RANDOM_STATE,
    test_size=const.TEST_SIZE,
)

# Print the four shapes on one line as a quick sanity check of the split.
for v in (train_X, test_X, train_y, test_y):
    print(v.shape, end=' - ')

In [None]:
# Baseline model: ordinary least-squares regression fitted on the training split.
lr = LinearRegression()
# fit() returns the estimator, so the cell output shows the fitted model's repr.
lr.fit(train_X, train_y)


In [None]:
# Inspect the learned weights (one per feature column of train_X) and the intercept.
lr.coef_, lr.intercept_

In [None]:
# Hold-out RMSE of the baseline linear model (displayed as the cell output).
housing_predict = lr.predict(test_X)
lr_mse = mean_squared_error(y_true=test_y, y_pred=housing_predict)
np.sqrt(lr_mse)

In [None]:
# 4-fold cross-validated R^2 of the baseline linear model on the training split.
scores = cross_val_score(estimator=lr, X=train_X, y=train_y, scoring='r2', cv=4)
print('Mean: ', scores.mean())
print('STD: ', scores.std())

In [None]:
# Ridge (L2-regularised) regression, evaluated on the hold-out split.
ridge_reg = Ridge(alpha=1, solver='cholesky')
ridge_reg.fit(train_X, train_y)
# Predict with ridge_reg itself (not the earlier `lr` model) so the RMSE below
# actually measures the ridge fit.
predict = ridge_reg.predict(test_X)
ridge_mse = mean_squared_error(test_y, predict)
# Hold-out RMSE, displayed as the cell output.
np.sqrt(ridge_mse)

In [None]:
# 4-fold cross-validated R^2 of the ridge model on the training split.
scores = cross_val_score(estimator=ridge_reg, X=train_X, y=train_y, scoring='r2', cv=4)
print('Mean: ', scores.mean())
print('STD: ', scores.std())

In [None]:
# Exhaustive hyper-parameter search for Ridge, ranked by cross-validated R^2
# (GridSearchCV's default number of folds is used).
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned version still accepts it.
param_grid = {
    'alpha': [0.0001, 0.01, 0.1, 1, 10, 100],
    'normalize': [True, False],
    'solver': [
        'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga',
    ],
}
grid = GridSearchCV(ridge_reg, param_grid, scoring='r2', n_jobs=-1, verbose=1)
grid_result = grid.fit(train_X, train_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

In [None]:
# Lasso (L1-regularised) regression, evaluated on the hold-out split.
lasso_reg = Lasso(alpha=1)
lasso_reg.fit(train_X, train_y)
# Predict with lasso_reg itself (not the earlier `lr` model) so the RMSE below
# actually measures the lasso fit.
predict = lasso_reg.predict(test_X)
lasso_mse = mean_squared_error(test_y, predict)
# Hold-out RMSE, displayed as the cell output.
np.sqrt(lasso_mse)

In [None]:
# 5-fold cross-validated R^2 of the lasso model on the training split.
scores = cross_val_score(estimator=lasso_reg, X=train_X, y=train_y, scoring='r2', cv=5)
print('Mean: ', scores.mean())
print('STD: ', scores.std())

In [None]:
# Exhaustive hyper-parameter search for Lasso, ranked by cross-validated R^2
# (GridSearchCV's default number of folds is used).
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned version still accepts it.
param_grid = {
    'alpha': [0.1, 1, 5, 10, 20, 30, 50, 100],
    'selection': ['cyclic', 'random'],
    'normalize': [True, False],
}
grid = GridSearchCV(lasso_reg, param_grid, scoring='r2', n_jobs=-1, verbose=1)
grid_result = grid.fit(train_X, train_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)


In [None]:
# Refit Lasso with the best hyper-parameters found by the most recent grid
# search. This cell must run right after the Lasso grid search, because
# `grid_result` is reused by other searches in this notebook.
lasso_reg_opt = Lasso(**grid_result.best_params_)
lasso_reg_opt.fit(train_X, train_y)
# Predict with the tuned model itself (not the earlier `lr` model) so the RMSE
# reflects the tuned lasso fit.
predict = lasso_reg_opt.predict(test_X)
lasso_mse = mean_squared_error(test_y, predict)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('RMSE: ', np.sqrt(lasso_mse))

# 5-fold cross-validated R^2 of the tuned model on the training split.
scores = cross_val_score(lasso_reg_opt,
                         train_X,
                         train_y,
                         cv=5,
                         scoring='r2')
print('Mean: ', np.mean(scores))
print('STD: ', np.std(scores))


In [None]:
# Decision-tree baseline, evaluated on the hold-out split.
# The tree is seeded with the same constant used for the train/test split so
# that Restart & Run All reproduces the same tree and the same scores.
tr = DecisionTreeRegressor(random_state=const.RANDOM_STATE)
tr.fit(train_X, train_y)
housing_predict = tr.predict(test_X)
tr_mse = mean_squared_error(test_y, housing_predict)
# Hold-out RMSE, displayed as the cell output.
np.sqrt(tr_mse)

In [None]:
# Exhaustive hyper-parameter search for the decision tree, ranked by
# cross-validated R^2 (GridSearchCV's default number of folds is used).
# NOTE(review): 'mse' and 'mae' are the legacy criterion names; scikit-learn
# 1.0 renamed them to 'squared_error' / 'absolute_error' and 1.2 removed the
# old spellings - confirm against the pinned version.
param_grid = {
    'criterion': ['mse', 'friedman_mse', 'mae'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 15, 25, None],
}
grid = GridSearchCV(tr, param_grid, scoring='r2', n_jobs=-1, verbose=1)
grid_result = grid.fit(train_X, train_y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)


In [None]:
# 5-fold cross-validated R^2 of the untuned decision tree on the training split.
scores = cross_val_score(estimator=tr, X=train_X, y=train_y, scoring='r2', cv=5)
print('CV Mean: ', scores.mean())
print('STD: ', scores.std())

In [None]:
# Refit the decision tree with the best hyper-parameters found by the most
# recent grid search. This cell must run right after the tree grid search,
# because `grid_result` is reused by other searches in this notebook.
# The tree is seeded so the reported scores are reproducible (the searched grid
# does not contain random_state, so the keyword cannot clash with best_params_).
tr_opt = DecisionTreeRegressor(random_state=const.RANDOM_STATE,
                               **grid_result.best_params_)
tr_opt.fit(train_X, train_y)
# Predict with the tuned tree itself (not the earlier `lr` model) so the RMSE
# reflects the tuned tree.
predict = tr_opt.predict(test_X)
tr_mse = mean_squared_error(test_y, predict)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('RMSE: ', np.sqrt(tr_mse))

# 4-fold cross-validated R^2 of the tuned tree on the training split.
scores = cross_val_score(tr_opt,
                         train_X,
                         train_y,
                         cv=4,
                         scoring='r2')
print('Mean: ', np.mean(scores))
print('STD: ', np.std(scores))