## Assignment-B Multivariant Linear Regression
@author: Kai-Ping Wang

## Importing the libraries
Import all required libraries, and also set some libraries options.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

## Importing the dataset

In [2]:
file_url = 'https://raw.githubusercontent.com/aso-uts/applied_ds/master/assignment1/cancer_reg.csv'
df = pd.read_csv(file_url, encoding='ISO-8859-1')

## Data Processing
As we can use all features in the data, we will have to convert all categorical data. Use previous knowledge to remove insignificant features with lots of missing value or dummy value.

In [3]:
df_base = pd.concat([df.TARGET_deathRate,df.drop(columns='TARGET_deathRate')],axis=1)
df_base.AvgHouseholdSize = df_base.AvgHouseholdSize.apply(lambda x: x if x > 1 else x*100)
df_base.MedianAge = df_base.MedianAge.apply(lambda x: x if x < 100 else x/10)


In [4]:
from sklearn.preprocessing import OneHotEncoder

df_base[['county','state']] = df_base['Geography'].str.split(', ',1,expand=True)
enc = OneHotEncoder()
X = enc.fit_transform(df_base[['state']]).toarray()
df_state = pd.DataFrame(X, columns=enc.categories_[0])
df_state.head()

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
mapping = {'[22640, 34218.1]':1,
           '(34218.1, 37413.8]':2,
           '(37413.8, 40362.7]':3,
           '(40362.7, 42724.4]':4,
           '(42724.4, 45201]':5,
           '(45201, 48021.6]':6,
           '(48021.6, 51046.4]':7,
           '(51046.4, 54545.6]':8,
           '(54545.6, 61494.5]':9,
           '(61494.5, 125635]':10,
          }
df_binnedInc = df_base['binnedInc'].map(mapping)
df_binnedInc.value_counts()

6     306
1     306
9     306
8     305
5     305
7     305
2     304
4     304
3     304
10    302
Name: binnedInc, dtype: int64

In [6]:
df_base = df_base.drop(columns=['PctSomeCol18_24','PctPrivateCoverageAlone','PctEmployed16_Over','Geography','state','county','binnedInc']).dropna(how='any')
df_all = pd.concat([df_base,df_state, df_binnedInc],axis=1)
df_all.describe()

Unnamed: 0,TARGET_deathRate,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,MedianAgeFemale,AvgHouseholdSize,PercentMarried,PctNoHS18_24,PctHS18_24,PctBachDeg18_24,PctHS25_Over,PctBachDeg25_Over,PctUnemployed16_Over,PctPrivateCoverage,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,binnedInc
count,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0,3047.0
mean,178.664063,606.338544,185.965868,448.268586,47063.281917,102637.4,16.878175,155.399415,40.906964,39.570725,42.145323,2.529682,51.773679,18.22445,35.002068,6.158287,34.80466,13.282015,7.852412,64.354939,41.196324,36.252642,19.240072,83.645286,9.107978,1.253965,1.983523,51.243872,5.640306,0.020676,0.005907,0.004923,0.024614,0.018707,0.019691,0.002626,0.000985,0.000328,0.021661,0.05087,0.001313,0.013784,0.033476,0.030194,0.032491,0.033476,0.039383,0.021004,0.005251,0.007877,0.004595,0.02724,0.028553,0.026912,0.037742,0.015753,0.026255,0.005579,0.003282,0.006892,0.010502,0.020348,0.032491,0.016738,0.028553,0.025271,0.011815,0.021989,0.001641,0.015097,0.019363,0.031178,0.076469,0.008861,0.004595,0.041024,0.012799,0.018051,0.02363,0.007548,5.497867
std,27.751511,1416.356223,504.134286,54.560733,12040.090836,329059.2,6.409087,529.628366,5.271319,5.226017,5.292849,0.248449,6.896928,8.093064,9.069722,4.529059,7.034924,5.394756,3.452371,10.647057,9.447687,7.841741,6.113041,16.380025,14.534538,2.610276,3.51771,6.572814,1.985816,0.142321,0.076645,0.070002,0.154972,0.13551,0.138961,0.051181,0.031368,0.018116,0.145597,0.219768,0.036214,0.116613,0.179904,0.171148,0.177329,0.179904,0.194536,0.143422,0.072286,0.088415,0.067639,0.162808,0.166573,0.161852,0.190603,0.12454,0.15992,0.074498,0.057203,0.082745,0.101957,0.14121,0.177329,0.128308,0.166573,0.156972,0.10807,0.146671,0.040482,0.121958,0.137821,0.173828,0.26579,0.093731,0.067639,0.198378,0.112427,0.133156,0.151918,0.086567,2.87138
min,59.7,6.0,3.0,201.3,22640.0,827.0,3.2,0.0,22.3,22.4,22.3,1.86,23.1,0.0,0.0,0.0,7.5,2.5,0.4,22.3,13.5,11.2,2.6,10.199155,0.0,0.0,0.0,22.99249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,161.2,76.0,28.0,420.3,38882.5,11684.0,12.15,0.0,37.7,36.35,39.1,2.38,47.75,12.8,29.2,3.1,30.4,9.4,5.5,57.2,34.5,30.9,14.85,77.29618,0.620675,0.254199,0.295172,47.763063,4.521419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
50%,178.1,171.0,61.0,453.549422,45207.0,26643.0,15.9,0.0,41.0,39.6,42.4,2.5,52.4,17.1,34.7,5.4,35.3,12.3,7.6,65.1,41.1,36.3,18.8,90.059774,2.247576,0.549812,0.826185,51.669941,5.381478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
75%,195.2,518.0,149.0,480.85,52492.0,68671.0,20.4,83.650776,43.9,42.5,45.3,2.64,56.4,22.7,40.7,8.2,39.65,16.1,9.7,72.1,47.7,41.55,23.1,95.451693,10.509732,1.221037,2.17796,55.395132,6.493677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
max,362.8,38150.0,14010.0,1206.9,125635.0,10170290.0,47.4,9762.308998,65.3,64.7,65.7,3.97,72.5,64.1,72.5,51.8,54.8,42.2,29.4,92.3,70.7,65.1,46.6,100.0,85.947799,42.619425,41.930251,78.075397,21.326165,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [7]:
df_all.iloc[:,-52:].values

array([[ 0.,  0.,  0., ...,  0.,  0., 10.],
       [ 0.,  0.,  0., ...,  0.,  0.,  7.],
       [ 0.,  0.,  0., ...,  0.,  0.,  7.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  8.],
       [ 0.,  0.,  0., ...,  0.,  0.,  7.],
       [ 0.,  0.,  0., ...,  0.,  0.,  4.]])

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# X_base and y_base are for data prepared/extracted from assignment B
X_base = df_base.iloc[:,1:].values
y_base = df_base.iloc[:,0].values

# X_all and y_all are for X_base and y_base plus one-hot encoded "geography" and mapped "binnedInc"
X_all = df_all.iloc[:,1:].values
y_all = df_all.iloc[:,0].values

# X_ss and y_ss are for StandardScaler
X_ss = StandardScaler().fit_transform(X_all)
y_ss = StandardScaler().fit_transform(y_all.reshape(len(y_all),1))

# X_base_ss and y_base_ss are for StandardScaler
X_base_ss = StandardScaler().fit_transform(X_base)
y_base_ss = StandardScaler().fit_transform(y_base.reshape(len(y_base),1))

## Run Linear Regression
Run the multivariate linear regression on against TARGET_deathRate to get baseline. Then run cross-validation for new baseline as we will be using cross validation plus MSE to measure our model performance.

Then print out the r2 score and MSE.

### Create getScore(X,y) method for reuse
Set up a method to split the preprocessed data into training (80%) and testing (20%) set, and then run linear regression on the training set while printing out the R Square score and MSE using the testing set.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

def getManualScore(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    print("Score is %.4f" % reg.score(X_test,y_test))
    print("Training set MSE is %.4f" % mse(y_train, reg.predict(X_train)))
    print("Testing set MSE is %.4f" % mse(y_test, reg.predict(X_test)))
    return reg.score(X_test,y_test)

getManualScore(X_base, y_base)


Score is 0.5640
Training set MSE is 376.0007
Testing set MSE is 343.9928


0.5639999062278258

In [10]:
def getScore(X,y,model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    SSx = StandardScaler().fit(X_train)
    SSy = StandardScaler().fit(y_train.reshape(len(y_train),1))
    X_train = SSx.transform(X_train)
    y_train = SSy.transform(y_train.reshape(len(y_train),1))
    X_test = SSx.transform(X_test)
    y_test = SSy.transform(y_test.reshape(len(y_test),1))
    
    model.fit(X_train,y_train)
    print("R2 Score is %.4f" % model.score(X_test,y_test))
    print("Training set MSE is %.4f" % mse(y_train, model.predict(X_train)))
    print("Testing set MSE is %.4f" % mse(y_test, model.predict(X_test)))
    return model

getScore(X_base, y_base, LinearRegression())

R2 Score is 0.5640
Training set MSE is 0.4917
Testing set MSE is 0.4498


LinearRegression()

Here we use StandardScaler for training set only, and then transform test set using the same scaler to avoid data leakage.

We can see the R2 score is about the same, but the MSE is different due to the standardization.

In [11]:
from sklearn.model_selection import KFold 

def performCV(X,y,model):
    kf = KFold(n_splits=5, random_state=None) 
    r2_test = []
    mse_test = []
    mse_train = []

    for train_index , test_index in kf.split(X):
        X_train , X_test = X[train_index,:],X[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
        SSx = StandardScaler().fit(X_train)
        SSy = StandardScaler().fit(y_train.reshape(len(y_train),1))
        X_train = SSx.transform(X_train)
        y_train = SSy.transform(y_train.reshape(len(y_train),1))
        X_test = SSx.transform(X_test)
        y_test = SSy.transform(y_test.reshape(len(y_test),1))
        model.fit(X_train,y_train)
        r2_test.append(model.score(X_test,y_test))
        mse_test.append(mse(y_test, model.predict(X_test)))
        mse_train.append(mse(y_train, model.predict(X_train)))
    r2score=sum(r2_test)/5
    mse_test_score=sum(mse_test)/5
    mse_train_score=sum(mse_train)/5
    print("R2 Score is %.4f" % r2score)
    print("Training set MSE is %.4f" % mse_train_score)
    print("Testing set MSE is %.4f" % mse_test_score)

performCV(X_base, y_base, LinearRegression())

R2 Score is 0.4751
Training set MSE is 0.4759
Testing set MSE is 0.5164


In [12]:
from sklearn.model_selection import cross_validate

def getCVScore(X,y,mode):
    kf = KFold(n_splits=5)
    scores = cross_validate(mode, X, y, cv=kf,
             scoring=('r2', 'neg_mean_squared_error'),
             return_train_score=True)
    print("R2 score is %.4f" % scores['test_r2'].mean())
    print("Training set MSE is %.4f" % scores['train_neg_mean_squared_error'].mean())
    print("Testing set MSE is %.4f" % scores['test_neg_mean_squared_error'].mean())
    return scores['test_r2'].mean()

getCVScore(X_ss,y_ss,LinearRegression())


R2 score is -10253286208494751491031040.0000
Training set MSE is -0.4061
Testing set MSE is -10635994002028504679448576.0000


-1.0253286208494751e+25

In [13]:
performCV(X_all, y_all, LinearRegression())

R2 Score is -673763315015156907376640.0000
Training set MSE is 0.4068
Testing set MSE is 669451877048378086916096.0000


In [14]:
performCV(X_all[:,:-28], y_all, LinearRegression())

R2 Score is -8616755731612977594368.0000
Training set MSE is 0.4421
Testing set MSE is 9762714617283428220928.0000


The cross validation split the data into 5 different combination of training and test sets, and we use the mean of results from all combination as the final score of the performance of the model.

We can see the value is very off after cross validation with all features.

It is actually not right to standardize both training set and testing set together, because it causes the data leakage as testing set should be unseen data.

In [15]:
getScore(X_all, y_all, LinearRegression())

R2 Score is 0.6051
Training set MSE is 0.4177
Testing set MSE is 0.4075


LinearRegression()

In [16]:
getScore(X_all, y_all, LinearRegression(fit_intercept=False))

R2 Score is 0.6051
Training set MSE is 0.4177
Testing set MSE is 0.4074


LinearRegression(fit_intercept=False)

When we include one-hot encoded Geography feature and mapped binnednc, we can see the performance has improved for both training MSE and test MSE. However, the difference between training and test set is much bigger, which shows the model is overfitting.

**New Baseline** -
R2 Score is 0.6050
Training set MSE is 0.4177
Testing set MSE is 0.4075

## Use different regression models
In following sections, we will be experimenting different models and hyper parameter tuning in pursue a model with better performance and less overfitting that can be a candidate for Production.

### Random Forest Regression
We will be using random forest regression model in this section and see the performance.

In [17]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.5185
Training set MSE is 0.0628
Testing set MSE is 0.4744


We can see the training set MSE is very low, but the testing set MSE is quite high. This shows the model is very overfitting. We need to explore hyperparameter to bring up the consistency. 

We start with max_depth

In [18]:
model = RandomForestRegressor(max_depth=15)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.5200
Training set MSE is 0.0733
Testing set MSE is 0.4734


It is still too overfitting.

In [19]:
model = RandomForestRegressor(max_depth=5)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4647
Training set MSE is 0.3833
Testing set MSE is 0.5282


In [20]:
model = RandomForestRegressor(max_depth=2)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.3104
Training set MSE is 0.6379
Testing set MSE is 0.6818


Now the consistency is close, but the model is underfitting. We will use max_depth = 5 along with other hyperparameters

In [21]:
model = RandomForestRegressor(max_depth=5, min_samples_split=100)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4527
Training set MSE is 0.4316
Testing set MSE is 0.5402


In [22]:
model = RandomForestRegressor(max_depth=5, min_samples_split=200)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4321
Training set MSE is 0.4709
Testing set MSE is 0.5599


In [23]:
model = RandomForestRegressor(max_depth=5, min_samples_split=100, min_samples_leaf=100)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4127
Training set MSE is 0.5229
Testing set MSE is 0.5789


In [24]:
model = RandomForestRegressor(max_depth=5, min_samples_split=100, min_samples_leaf=50)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4512
Training set MSE is 0.4624
Testing set MSE is 0.5413


In [25]:
model = RandomForestRegressor(max_depth=5, min_samples_split=100, min_samples_leaf=30)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4543
Training set MSE is 0.4505
Testing set MSE is 0.5384


In [26]:
model = RandomForestRegressor(n_estimators=200 ,max_features='auto', max_depth=5, min_samples_split=100, min_samples_leaf=30)
performCV(X_all,y_all,model)

  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
  model.fit(X_train,y_train)
R2 Score is 0.4551
Training set MSE is 0.4482
Testing set MSE is 0.5376


### Support Vector Regression
Try SVR in this section

In [27]:
from sklearn.svm import SVR

model = SVR(kernel = 'rbf')
performCV(X_all,y_all,model)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.4680
Training set MSE is 0.3059
Testing set MSE is 0.5235


In [28]:
model = SVR(kernel = 'linear')
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.5019
Training set MSE is 0.4155
Testing set MSE is 0.4889


In [29]:
model = SVR(kernel = 'sigmoid')
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is -19.9812
Training set MSE is 20.2363
Testing set MSE is 21.9596


In [30]:
model = SVR(kernel = 'poly')
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.3085
Training set MSE is 0.2771
Testing set MSE is 0.6811


From above we can see that the linear kernel performs much better than others. We will continue explore other hyperparameters along with this kernel

In [31]:
model = SVR(kernel = 'linear', epsilon=0.5)
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.4967
Training set MSE is 0.4119
Testing set MSE is 0.4947


In [32]:
model = SVR(kernel = 'linear', epsilon=1)
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.4569
Training set MSE is 0.4345
Testing set MSE is 0.5319
  return f(*args, **kwargs)


In [33]:
model = SVR(kernel = 'linear', epsilon=0.5, C=2)
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.4954
Training set MSE is 0.4121
Testing set MSE is 0.4958


In [34]:
model = SVR(kernel = 'linear', epsilon=0.5, C=0.5)
performCV(X_all,y_all,model)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
R2 Score is 0.4966
Training set MSE is 0.4118
Testing set MSE is 0.4947


Looks like the default hyperparameters has better performance. Although it is still quite overfitting and will not have good performance in Production.