# Introduction
Know the practical flow of machine learning
Complete a model with high generalization performance

In [55]:
# predefines and import
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Prep
Let's copy the baseline model

In [56]:
init_data = pd.read_csv('../Data/Normal/houseprice_train.csv')
house_price = init_data[['GrLivArea','YearBuilt','TotalBsmtSF','OverallQual','SalePrice']]
print('Any Null value?',house_price.isnull().sum().all())

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = house_price[['GrLivArea','YearBuilt','TotalBsmtSF','OverallQual']]
Y = house_price[['SalePrice']]
X = scaler.fit_transform(X)
Y = scaler.fit_transform(Y).reshape((Y.shape[0],))
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)
train = x_train,y_train
test = x_test,y_test


def plot_prediction(xtest,ytest,pred, methodname):
    fig = plt.figure(figsize=[8,8])
    ax = fig.add_subplot(projection='3d')
    ax.set_title(methodname)
    ax.scatter(xtest[:,0],xtest[:,1],ytest,c = 'lightblue')
    ax.scatter(xtest[:,0],xtest[:,1],pred,c = 'red')
    ax.legend(['Real Values', 'Predictions'])
    ax.set_xlabel('LivingRoom Area')
    ax.set_ylabel('Year Built')
    ax.set_zlabel('Sale Price')
    plt.show()

def test_plot_summary(cls, train,test,plot = True):
    x_train,y_train = train
    x_test,y_test = test
    cls.fit(x_train,y_train)
    prediction = cls.predict(x_test)

    method_name = cls.__class__.__name__
    if plot: plot_prediction(x_test,y_test,prediction,method_name)
    return pd.DataFrame(columns=[method_name], index= ['MSE'], data=[[mean_squared_error(y_test,prediction)]])


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

cls = SVR()
summary = test_plot_summary(cls, train,test,plot = False)
print(summary)

Any Null value? False
         SVR
MSE  0.13696


# Problem 1
## Cross Validation

In [57]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
model = SVR()
k = 5
kfold = KFold(n_splits= 5)
score = []
for train_index, test_index in kfold.split(X):
    x_train,x_test = X[train_index], X[test_index]
    y_train,y_test = Y[train_index], Y[test_index]
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    score.append(mean_squared_error(y_test,pred))
print(score)
print('AVG score: ', np.average(score))


[0.14291340718541917, 0.19050930282014564, 0.2401485514475553, 0.11284134542682014, 0.28461876174798084]
AVG score:  0.19420627372558422


# Problem 2
## Grid Search

In [58]:
display(model.get_params())

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [59]:
from sklearn.model_selection import GridSearchCV
param_svr = {
    'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
    'degree' : [3,8],
    'coef0' : [0.01,0.1,0.5],
    }
base_estimator = SVR() 
grid_search_model = GridSearchCV(base_estimator,param_svr,cv = 3,n_jobs = -1, verbose = 2,scoring='neg_mean_squared_error')
grid_search_model.fit(X,Y)
print(grid_search_model.best_params_)
#Model score of the best param 
score0 = grid_search_model.score(X,Y)
print('SVR Error: ', score0)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
{'coef0': 0.01, 'degree': 3, 'kernel': 'rbf'}
SVR Error:  -0.14436565770267767


# Problem 3
## Survey the Kaggle Community

### NOTE: Ideas
**So this assignment is about comparing the 'generalization' of models**

Some modes that can be used (Surveyed Kaggle):
- Lasso Regression
- Random Forest Regressor
- Gradient Boosting Regressor

I'll compare these models with K-fold validation and choose the best performing one.

Parameter tuning if possible 

# Problem 4
## Creating a model with high generalization performance

In [None]:
param_lasso = [
    {
        'alpha' : [0.005, 0.02, 0.03, 0.05, 0.06],
    }
]

from sklearn.linear_model import Lasso
lasso = Lasso()
grid_search_model = GridSearchCV(lasso,param_lasso,scoring='neg_mean_squared_error',cv = 3,n_jobs = -1, verbose = 2)
grid_search_model.fit(X,Y)
print(grid_search_model.best_params_)
score1 = grid_search_model.score(X,Y)
print('Lasso regression score:', score1)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
{'alpha': 0.005}
Lasso regression score: -0.2421882692302284


In [63]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['mse', 'mae']
}
random_forest = RandomForestRegressor(random_state=42)
grid_search_model = GridSearchCV(forest,param_grid,cv = 3,scoring='neg_mean_squared_error',verbose=2, n_jobs=-1)
grid_search_model.fit(X,Y)
print(grid_search_model.best_params_)
score2 = grid_search_model.score(X,Y)
print('Random Forest regression score:', score2)  

Fitting 3 folds for each of 60 candidates, totalling 180 fits
{'criterion': 'mae', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500}
Random Forest regression score: -0.07024430907219678


In [64]:
from sklearn.ensemble import GradientBoostingRegressor
# gradient = GradientBoostingClassifier(max_features = 'sqrt', max_depth = 2, learning_rate = 0.5)
gadient = GradientBoostingRegressor()
param_gradient = {'learning_rate': [0.01,0.05,0.1],
                  'subsample'    : [0.9, 0.5, 0.1],
                  'n_estimators' : [100,500,1000],
                  'max_depth'    : [4,8,10]
                 }

grid_search_model = GridSearchCV(gadient,param_gradient,cv = 3,scoring='neg_mean_squared_error',verbose=2, n_jobs=-1)
grid_search_model.fit(X,Y)
print(grid_search_model.best_params_)
score3 = grid_search_model.score(X,Y)
print('Gradient Boosting score: ',score3)   

Fitting 3 folds for each of 81 candidates, totalling 243 fits
{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.9}
Gradient Boosting score:  -0.07280230776660525


In [67]:
# Comparing all:
score_map = {'SVR': score0, 'Lasso': score1, 'GradientBoost': score3,'RandomForest': score2}
best = -10
mod = None
print(score_map)
for model,value in score_map.items():
    if value > best: 
        best = value
        mod = model
print('Best Model: ', mod)
        


{'SVR': -0.14436565770267767, 'Lasso': -0.2421882692302284, 'GradientBoost': -0.07280230776660525, 'RandomForest': -0.07024430907219678}
Best Model:  RandomForest


# Problem 5
## Final model selection

### NOTE: From the above result
I'll go with the random forest model.

**Model will be re-train on the full dataset to get most of the value**

In [68]:
# best param: {'criterion': 'mae', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500}
final_model = RandomForestRegressor(criterion='mae',max_depth=8, max_features='sqrt', n_estimators=500)
final_model.fit(X,Y)


RandomForestRegressor(criterion='mae', max_depth=8, max_features='sqrt',
                      n_estimators=500)

In [91]:
#Load
test_data = pd.read_csv('../Data/Normal/test_house_price.csv')
test_ids = test_data['Id']

#preprocess
x_test = test_data[['GrLivArea','YearBuilt','TotalBsmtSF','OverallQual']]
print('Null check: ')
display(x_test.isna().sum())
x_test = x_test.fillna(0)
# show test data
print('Test Data')
display(x_test.head())
#predict
prediction = final_model.predict(x_test)
result = scaler.inverse_transform(prediction)
print('result')
display(result[:10])
#put into a frame
submission = pd.DataFrame()
submission['Id'] = test_ids
submission['SalePrice'] = result
display(submission.head())

Null check: 


GrLivArea      0
YearBuilt      0
TotalBsmtSF    1
OverallQual    0
dtype: int64

Test Data


Unnamed: 0,GrLivArea,YearBuilt,TotalBsmtSF,OverallQual
0,896,1961,882.0,5
1,1329,1958,1329.0,6
2,1629,1997,928.0,5
3,1604,1998,926.0,6
4,1280,1992,1280.0,8


result


array([255160.438, 255160.438, 255160.438, 255160.438, 255160.438,
       255160.438, 255160.438, 255160.438, 255160.438, 255160.438])

Unnamed: 0,Id,SalePrice
0,1461,255160.438
1,1462,255160.438
2,1463,255160.438
3,1464,255160.438
4,1465,255160.438


In [None]:
#write csv
submission.to_csv('../Data/Normal/submission_house_price_sprint1.csv',index = False)

## Submission Result
![image](./sprint_submission.PNG)

## Conclusion
Seems that the result is real bad, most of the houses predicted value is quite similar.

Could be because of the fact that i was picking only a few attributes.