In [33]:
# Importing Libraries


import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder as ohc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
%matplotlib inline


In [34]:
# Read the default file as a data frame
default = pd.read_csv("default.csv")

The code below accomplishes the same thing as the onehot encoding for me it was easier to use and more intuitive, however, for the sake of the assignment I shall use the onehot ecoding to follow the instructions.
However, feel free to remove the hashes below to see the result of getting the dummy variables dropping the first of the binary set. 

In [35]:
#getting dummy variables and dropping first value for student
#df_encoded = pd.get_dummies(default["student"], drop_first=True, prefix="student")

# Concatenate the original DataFrame with the one-hot encoded DataFrame
#default_enc = pd.concat([default, df_encoded], axis=1)

# Drop the original "student" column
#default_enc = default_enc.drop(["student"], axis=1)
#default_enc

# 1. One hot encoding the Student Data and Drop first if Binary

In [36]:
# onehot encode student column
enc = ohc(drop='if_binary',sparse=False)
df_encoded=enc.fit_transform(default[["student"]])
df_encoded

array([[0.00],
       [1.00],
       [0.00],
       ...,
       [0.00],
       [0.00],
       [1.00]])

'sparse_output'or 'sparse':Will return sparse matrix if set True else will return an array. The default is set to true, however since I set it to false over here I got an array for df_encoded.

‘if_binary’ : drops the first category in each feature with two categories. Features with 1 or more than 2 categories are left intact. In this case it is binary so it drops the Student_No.



In [37]:
enclabels=enc.categories_

enclabels


[array(['No', 'Yes'], dtype=object)]

From here avove I can see that the labels are binary and the first is **No** which got dropped when we used the enc.fit_transform on the student column after defining enc in the previous line of code 

In [38]:
# Create a DataFrame with the one-hot encoded column
df_encoded = pd.DataFrame(df_encoded)
# we rename the column to S_Yes for the yes value under student column
df_encoded = df_encoded.rename(columns={0:'S_Yes'})
df_encoded

Unnamed: 0,S_Yes
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
9995,0.0
9996,0.0
9997,0.0
9998,0.0


In [39]:
# Create a DataFrame with the one-hot encoded columns
df_encoded = pd.concat([default, df_encoded], axis=1)
df_encoded

Unnamed: 0.1,Unnamed: 0,default,student,balance,income,S_Yes
0,1,No,No,729.526495,44361.625074,0.0
1,2,No,Yes,817.180407,12106.134700,1.0
2,3,No,No,1073.549164,31767.138947,0.0
3,4,No,No,529.250605,35704.493935,0.0
4,5,No,No,785.655883,38463.495879,0.0
...,...,...,...,...,...,...
9995,9996,No,No,711.555020,52992.378914,0.0
9996,9997,No,No,757.962918,19660.721768,0.0
9997,9998,No,No,845.411989,58636.156984,0.0
9998,9999,No,No,1569.009053,36669.112365,0.0


In [40]:
# Drop the original "student" column
df_encoded = df_encoded.drop(["student"], axis=1)
df_encoded

Unnamed: 0.1,Unnamed: 0,default,balance,income,S_Yes
0,1,No,729.526495,44361.625074,0.0
1,2,No,817.180407,12106.134700,1.0
2,3,No,1073.549164,31767.138947,0.0
3,4,No,529.250605,35704.493935,0.0
4,5,No,785.655883,38463.495879,0.0
...,...,...,...,...,...
9995,9996,No,711.555020,52992.378914,0.0
9996,9997,No,757.962918,19660.721768,0.0
9997,9998,No,845.411989,58636.156984,0.0
9998,9999,No,1569.009053,36669.112365,0.0


In [41]:
# Splitting the dataset into features and target variables 
x = df_encoded[['balance','income','S_Yes']]

#Below is default the target 
y = df_encoded['default']


In [42]:
# SPlit the training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=5)

# Initialize the scaler
sc = StandardScaler()

# Fit the scaler on the training data and transform it
X_train = sc.fit_transform(X_train)

# Use the same scaler to transform the test data
X_test = sc.transform(X_test)

- Mean and standard deviation is the default scaling parameter 

In [43]:
#y=pd.get_dummies(default["default"])

In [44]:
# KNN
model = KNeighborsClassifier()
#Fit models with k = 1, 2, 3, 10, 20, 500 neighbors. 
#Explain your results; particularly for k = 1 and large values of k.
#Why does the performance not decrease below a threshold?
k_values = [i for i in (1,2,3,10,20,500)]

# grid search
param_grid = {'n_neighbors':k_values}
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
best_model = grid.best_estimator_
predict_y = best_model.predict(X_test)
acc = accuracy_score(y_test, predict_y)
print("Accuracy: {:3.2f}".format(acc))
    

Grid Search: best parameters: {'n_neighbors': 20}
Accuracy: 0.97


One-Hot Encoding:

One-hot encoding is used for categorical variables, converting them into binary vectors. For example, if you have a categorical variable with three classes ('A', 'B', 'C'), one-hot encoding creates three binary columns where each column corresponds to one class.
Data leakage in one-hot encoding is generally not a concern. The reason is that each category is represented by its own binary column, and the encoding process is typically applied independently to each subset of data (training, validation, test). The transformation doesn't involve any computation across rows, so there's no risk of information from the test set influencing the encoding process.

Scaling (Standardization or Normalization):

Scaling involves transforming numerical features to a standard scale, often using the mean and standard deviation (for standardization) or minimum and maximum values (for normalization).
Data leakage in scaling can occur if the mean and standard deviation (or min and max) are computed using information from the test set. This would mean that the scaling parameters are influenced by the distribution of the test set, potentially leading to over-optimistic performance estimates.
To avoid data leakage, the mean and standard deviation (or other scaling parameters) should be computed solely on the training set and then applied to the test set.

In summary, while one-hot encoding is generally less prone to data leakage concerns, it's crucial to be mindful of how scaling is applied, ensuring that the scaling parameters are derived solely from the training set to maintain the integrity of the model evaluation on unseen data.







# 4. Repeating exercise 3 without the drop argument in the one hot encoding

In [45]:
# using sparse equals false turns it into an array 
enco = ohc(sparse=False)
df_enco=enco.fit_transform(default[["student"]])

In [46]:
# called this to see what encoding and sparse accomplised 
df_enco

array([[1.00, 0.00],
       [0.00, 1.00],
       [1.00, 0.00],
       ...,
       [1.00, 0.00],
       [1.00, 0.00],
       [0.00, 1.00]])

In [47]:
# confirm the labels of the ecoded variable
enlabels=enco.categories_

enlabels

[array(['No', 'Yes'], dtype=object)]

In [48]:
# Create a DataFrame with the one-hot encoded column
df_enco = pd.DataFrame(df_enco, columns=enlabels)
# Create a DataFrame with the one-hot encoded columns
df_enco = pd.concat([default, df_enco], axis=1)
# Drop the original "student" column
df_enco = df_enco.drop(["student"], axis=1)
df_enco.dtypes

Unnamed: 0      int64
default        object
balance       float64
income        float64
(No,)         float64
(Yes,)        float64
dtype: object

In [49]:
# Splitting the dataset into features and target variables 
x1 = df_enco.drop(["default"], axis=1)
# we are going to use the previous Y

In [50]:
# SPlit the training and testing sets 
X_traind, X_testd, y_train, y_test = train_test_split(x1, y, train_size=0.8, test_size=0.2, random_state=5)

# Initialize the scaler
sc = StandardScaler()

# Fit the scaler on the training data and transform it
X_traind = sc.fit_transform(X_traind)

# Use the same scaler to transform the test data
X_testd = sc.transform(X_testd)

In [51]:
#fitting the train data on the grid 
grid.fit(X_traind, y_train)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
best_model1 = grid.best_estimator_
predict_y1 = best_model1.predict(X_testd)
accd = accuracy_score(y_test, predict_y1)
print("Accuracy: {:3.2f}".format(accd))

Grid Search: best parameters: {'n_neighbors': 10}
Accuracy: 0.97


# Decision Trees and SVMs

## Forest Cover 

In [80]:
# Reading the forest cover type data set as a data frame 
covert = pd.read_csv("covtype.csv")

In [81]:
# It is a huge dataset and I would like to see as much as I can 
covert

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [85]:
#looking at the data types 
covert.dtypes

Elevation                             int64
Aspect                                int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
Wilderness_Area1                      int64
Wilderness_Area2                      int64
Wilderness_Area3                      int64
Wilderness_Area4                      int64
Soil_Type1                            int64
Soil_Type2                            int64
Soil_Type3                            int64
Soil_Type4                            int64
Soil_Type5                            int64
Soil_Type6                            int64
Soil_Type7                            int64
Soil_Type8                            int64
Soil_Type9                      

- I wanted to use this method below for sampling randomly but found a better way to do it

In [86]:
#sample_size = 20000

#cover_samp = covert.sample(n=sample_size, random_state = 42)

In [87]:
# Splitting the dataset into features and target variables 
x2 = covert.drop(["Cover_Type"], axis=1)
y1 = covert["Cover_Type"]
y1 = covert["Cover_Type"].astype("category") 


- This make classification method below is a better way of sampling and I can choose the number of columns as well as observations 

In [120]:
# Import the make classification library 
from sklearn.datasets import make_classification

# I am choosing 10000 observations and 25 columns 
x2,y1 = make_classification(n_samples=10000, n_features=25, random_state=42)
# SPlit the training and testing sets 
X_traint, X_testt, y_traint, y_testt = train_test_split(x2, y1, train_size=0.7, test_size=0.3, random_state=3)

# Initialize the scaler
sc = StandardScaler()

# Fit the scaler on the training data and transform it
X_traint = sc.fit_transform(X_traint)

# Use the same scaler to transform the test data
X_testt = sc.transform(X_testt)

In [121]:
# decision trees
model = DecisionTreeClassifier(random_state=40)

# grid search
param_grid = {'max_depth': list(range(1,21)), 'criterion': ['entropy','gini'] }
grid = GridSearchCV(model, param_grid, cv=5)
%time grid.fit(X_traint, y_traint)
print("Grid Search: best parameters: {}".format(grid.best_params_))



CPU times: user 35.4 s, sys: 8.06 ms, total: 35.4 s
Wall time: 35.4 s
Grid Search: best parameters: {'criterion': 'entropy', 'max_depth': 6}


In [122]:
# accuracy of best model
best_model = grid.best_estimator_
predictt_y = best_model.predict(X_testt)
acct = accuracy_score(y_testt, predictt_y)
print("Accuracy: {:3.2f}".format(acct))

Accuracy: 0.93


- I had an accuracy of 93% when I choose this combination of onservations and features and I played with this several times where it trained for 2 minutes and gave 76% accuracy and when I did the whole dataset it took about 16 minutes with 97% accuracy. So my current sampling and it's accuracy performed very well considering it took just 35 secs to complete training 

In [137]:
# I am choosing 8000 observations and 23 columns 

x2,y1 = make_classification(n_samples=8000, n_features=23, random_state=42)
#Split data into 70-30 training and testing 
X_trainl, X_testl, y_trainl, y_testl = train_test_split(x2, y1, train_size=0.7, test_size=0.3, random_state=3)

# Create a pipeline with a scaler and LinearSVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linearsvc', LinearSVC())])
    


In [138]:
#Using pipelines in grid searches 

param_grid = {'linearsvc__C': [1, 2, 4],    
              'linearsvc__tol': [1e-4, 1e-5, 1e-6],
             'linearsvc__max_iter': [10000, 20000]} 

In [139]:
# Perform GridSearchCV for hyperparameter tuning
grids = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring = 'accuracy')
# timing the completion of training
%time grids.fit(X_trainl, y_trainl)

# Print the best hyperparameters found by GridSearchCV
print("Grid Search: best parameters: {}".format(grids.best_params_))



CPU times: user 2min 22s, sys: 1min 20s, total: 3min 43s
Wall time: 1min 18s
Grid Search: best parameters: {'linearsvc__C': 1, 'linearsvc__max_iter': 10000, 'linearsvc__tol': 0.0001}


In [141]:
best_param = grids.best_estimator_

ypreds = best_param.predict(X_testl)
accs = accuracy_score(y_testl, ypreds)
print("Accuracy: {:3.2f}".format(accs))

Accuracy: 0.87


- This is a more complex model than the tree that is why it took a longer time to complete training even with fewer observations and features. However, I didn't want my sample size to be so small that I would get a good enough picture. 

# Housing Prices 

In [270]:
# Importing libraries 
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [271]:
# Reading the housing data as a data frame 
housing = pd.read_csv('ames.csv')

In [272]:
# Looking at the first 5 rows and all the columns 
housing.head()

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141,31770,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,,0,5,2010,WD,Normal,215000,-93.619754,42.054035
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80,11622,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,6,2010,WD,Normal,105000,-93.619756,42.053014
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81,14267,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Gar2,12500,6,2010,WD,Normal,172000,-93.619387,42.052659
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93,11160,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Fence,,0,4,2010,WD,Normal,244000,-93.61732,42.051245
4,Two_Story_1946_and_Newer,Residential_Low_Density,74,13830,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,3,2010,WD,Normal,189900,-93.638933,42.060899


In [273]:
# Selecting just the numerical data types to avoid encoding 
housing2 = housing.select_dtypes(include='number')
housing2

Unnamed: 0,Lot_Frontage,Lot_Area,Year_Built,Year_Remod_Add,Mas_Vnr_Area,BsmtFin_SF_1,BsmtFin_SF_2,Bsmt_Unf_SF,Total_Bsmt_SF,First_Flr_SF,...,Enclosed_Porch,Three_season_porch,Screen_Porch,Pool_Area,Misc_Val,Mo_Sold,Year_Sold,Sale_Price,Longitude,Latitude
0,141,31770,1960,1960,112,2,0,441,1080,1656,...,0,0,0,0,0,5,2010,215000,-93.619754,42.054035
1,80,11622,1961,1961,0,6,144,270,882,896,...,0,0,120,0,0,6,2010,105000,-93.619756,42.053014
2,81,14267,1958,1958,108,1,0,406,1329,1329,...,0,0,0,0,12500,6,2010,172000,-93.619387,42.052659
3,93,11160,1968,1968,0,1,0,1045,2110,2110,...,0,0,0,0,0,4,2010,244000,-93.617320,42.051245
4,74,13830,1997,1998,0,3,0,137,928,928,...,0,0,0,0,0,3,2010,189900,-93.638933,42.060899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,37,7937,1984,1984,0,3,0,184,1003,1003,...,0,0,0,0,0,3,2006,142500,-93.604776,41.988964
2926,0,8885,1983,1983,0,2,324,239,864,902,...,0,0,0,0,0,6,2006,131000,-93.602680,41.988314
2927,62,10441,1992,1992,0,3,0,575,912,970,...,0,0,0,0,700,7,2006,132000,-93.606847,41.986510
2928,77,10010,1974,1975,0,1,123,195,1389,1389,...,0,0,0,0,0,4,2006,170000,-93.600190,41.990921


In [274]:
# Splitting the data set into predictors and a response
feat = housing2.drop(['Sale_Price'], axis = 1)
targ = housing2['Sale_Price']

In [275]:
# making a subset of the data to avoid long run times so we random sample
#But here I used all the features since it is not as much as the forest data
feat,targ = make_regression(n_samples=2000, n_features=35, random_state=42)
#split into 70% training and 30% testing 
X_trainh, X_testh, y_trainh, y_testh = train_test_split(feat,targ, train_size=0.7, test_size=0.3, random_state=3)

# Initialize the scaler
sc = StandardScaler()

# Fit the scaler on the training data and transform it
X_trainh = sc.fit_transform(X_trainh)

# Use the same scaler to transform the test data
X_testh = sc.transform(X_testh)

# Decision Tree Regressor for Housing Prediction Data

In [276]:
# decision trees
regtree = DecisionTreeRegressor(random_state=40)

# grid search
param_grid = {'max_depth': list(range(1,21)), 'criterion': ['mse','mae'] }
gridtree = GridSearchCV(regtree, param_grid, cv=5)
#time the output of the best parameters and what they are
%time gridtree.fit(X_trainh, y_trainh)
print("Grid Search: best parameters: {}".format(grid.best_params_))

CPU times: user 30.4 s, sys: 12.6 ms, total: 30.4 s
Wall time: 30.4 s
Grid Search: best parameters: {'criterion': 'entropy', 'max_depth': 6}


In [277]:
# metrics of best model
best_modelr = gridtree.best_estimator_
predictr_y = best_modelr.predict(X_testh)
mse = mean_squared_error(y_testh, predictr_y)
mae = mean_absolute_error(y_testh, predictr_y)
r2 = r2_score(y_testh, predictr_y)

#printing the metrics of the best model 
print("tree mse: {:3.2f}".format(mse))
print("tree mae: {:3.2f}".format(mae))
print("tree r2: {:3.2f}".format(r2))

tree mse: 19773.97
tree mae: 112.54
tree r2: 0.52


This problem and the ones that follow are regression problems so I am going to have a different set of metrics for continous values in my target variable and I chose the following few. 


- MSE is the average of squared differences between observed and predicted values. It penalizes larger errors more heavily than MAE

- MAE represents the average absolute difference between the observed and predicted values. It is robust to outliers.

- R-squared measures the proportion of the variance in the dependent variable that is predictable from the independent variables. It ranges from 0 to 1, with higher values indicating better fit.

# SVR from SVM

In [278]:
# making a random sample of the housing data
feat1,targ1 = make_regression(n_samples=2000, n_features=35, random_state=42)
#splitting the samples into 70% training and 30% testing
X_trainhs, X_tesths, y_trainhs, y_tesths = train_test_split(feat1,targ1, train_size=0.7, test_size=0.3, random_state=42)

# Create a pipeline with a scaler and SVR
pipelinesvr = Pipeline([('scaler', StandardScaler()),
    ('svr', SVR())])

In [279]:

#Using pipelines in grid searches 

param_grid = {'svr__C': [1, 2, 4],    
              'svr__kernel': ["rbf", "poly", "linear"],
              'svr__gamma':[0.001,0.01,0.1]} 

In [280]:
# Perform GridSearchCV for hyperparameter tuning
gridsvr = GridSearchCV(pipelinesvr, param_grid=param_grid, cv=5)
#timing the completion of training
%time gridsvr.fit(X_trainhs, y_trainhs)

# Print the best hyperparameters found by GridSearchCV
print("Grid Search: best parameters: {}".format(gridsvr.best_params_))

CPU times: user 21.6 s, sys: 10.9 ms, total: 21.6 s
Wall time: 21.6 s
Grid Search: best parameters: {'svr__C': 4, 'svr__gamma': 0.001, 'svr__kernel': 'linear'}


In [281]:
# metrics of best model
best_modelsvr = gridsvr.best_estimator_
predictsvr_y = best_modelsvr.predict(X_tesths)
svrmse = mean_squared_error(y_tesths, predictsvr_y)
svrmae = mean_absolute_error(y_tesths, predictsvr_y)
svr2 = r2_score(y_tesths, predictsvr_y)

# printing metrics of best model

print("svr mse: {:3.2f}".format(svrmse))
print("svr mae: {:3.2f}".format(svrmae))
print("svr r2: {:3.2f}".format(svr2))

svr mse: 0.00
svr mae: 0.04
svr r2: 1.00


- In this SVR model we see that there are no differences between the true and predicted value and the linear Kernel performed better than the polynomial and Radial. we could not score a higher r2 value (coefficient of determination). 

# Linear SVR from SVM

In [282]:
# making a random sample of the housing data
feat,targ = make_regression(n_samples=2930, n_features=35, random_state=42)
#splitting the samples into 70% training and 30% testing
X_trainh, X_testh, y_trainh, y_testh = train_test_split(feat,targ, train_size=0.7, test_size=0.3, random_state=42)
# Create a pipeline with a scaler and LinearSVR
pipelinelsvr = Pipeline([('scaler', StandardScaler()),
    ('linearsvr', LinearSVR())])

#Using pipelines in grid searches 
param_grid = {'linearsvr__C': [0.01, 0.02, 0.1,1],    
              'linearsvr__tol': [1e-2,1e-3,1e-4, 1e-5, 1e-6],
             'linearsvr__max_iter': [1000, 2000]}

In [283]:
# Perform GridSearchCV for hyperparameter tuning
gridlsvr = GridSearchCV(pipelinelsvr, param_grid=param_grid, cv=5)
# timing the completion of training
%time gridlsvr.fit(X_trainh, y_trainh)

# Print the best hyperparameters found by GridSearchCV
print("Grid Search: best parameters: {}".format(gridlsvr.best_params_))


CPU times: user 12.3 s, sys: 13.8 s, total: 26.1 s
Wall time: 1.65 s
Grid Search: best parameters: {'linearsvr__C': 1, 'linearsvr__max_iter': 1000, 'linearsvr__tol': 0.01}


In [284]:
# metrics of best model
best_modellsvr = gridlsvr.best_estimator_
predictlsvr_y = best_modellsvr.predict(X_testh)
lsvrmse = mean_squared_error(y_testh, predictlsvr_y)
lsvrmae = mean_absolute_error(y_testh, predictlsvr_y)
lsvr2 = r2_score(y_testh, predictlsvr_y)

# printing metrics of best model

print("linearsvr mse: {:3.2f}".format(lsvrmse))
print("linearsvr mae: {:3.2f}".format(lsvrmae))
print("linearsvr r2: {:3.2f}".format(lsvr2))

linearsvr mse: 0.00
linearsvr mae: 0.00
linearsvr r2: 1.00


- The LinearSVR model performed the best and there are no differences between the true and predicted values and we could not score a higher r2(coefficient of determination) value than I accomplished . 