<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#preprocessing and model selection
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedKFold, train_test_split, cross_val_score

#regression model metrics
from sklearn.metrics import mean_squared_error, r2_score

#regression models
from sklearn.svm import LinearSVR


In [126]:
path = 'https://simplonline-v3-prod.s3.eu-west-3.amazonaws.com/media/file/csv/bdfc59ed-c3c7-48ac-a3d3-9e54663f6c1d.csv'
target = 'SalePrice'
df = pd.read_csv(path)
df = df[["OverallQual","GrLivArea", 'Neighborhood',"SalePrice"]]
df.head()


X = df.drop(columns = [target])
y = df[target]

In [127]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(
    steps=[('imputer',SimpleImputer(strategy='mean')),
           ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])


numeric_features = df.select_dtypes(include=['int64', 'float64']).drop(columns = [target]).columns
categorical_features = df.select_dtypes(include=['object']).columns

print(numeric_features)
print(categorical_features)

Index(['OverallQual', 'GrLivArea'], dtype='object')
Index(['Neighborhood'], dtype='object')


In [145]:
import numpy as np
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

#numeric_features = df.select_dtypes(include=['int64', 'float64'])#.drop(columns = [target]).columns

lsvr = Pipeline(steps=[('preprocessor', preprocessor),                       
                      ('regressor', LinearSVR())])


print(y.shape)
y1 = y
y_reshaped = y1.values.reshape(-1, 1)
y_scaler = StandardScaler()
y_scaler.fit(y_reshaped)



X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1)

y_train = y_scaler.transform(y_train.values.reshape(-1,1))
y_test = y_scaler.transform(y_test.values.reshape(-1,1))
print(y_train.shape)

(1460,)
(1095, 1)


In [146]:
paramsLinearSVR = \
{
 'regressor__C': [float(i)*0.1+0.1 for i in range(100)],
 #'dual': [True],
 #'epsilon': [0.0],
 #'fit_intercept': [True],
 #'intercept_scaling': [1.0],
 #'loss': ['epsilon_insensitive'],
 'regressor__max_iter': [ 1000*float(i+1) for i in range(10)],
 #'random_state': [None],
 'regressor__tol': [0.01],
 #'verbose': [0],

}



In [147]:
cv1 = RepeatedKFold(n_splits=10, n_repeats=4, random_state=1)

Grid = RandomizedSearchCV(lsvr, paramsLinearSVR, n_iter = 20, scoring='r2', cv=cv1)
Grid.fit(X, y)
best_model = Grid.best_estimator_
#show_classification_model_metrics(best_model, X_test, y_test)

In [148]:
print(f"R squarred :{best_model.score(X_test, y_test)}")
#print(f"\n OverallQual coef : {best_model.coef_[0]}\n GrLivArea coef : {best_model.coef_[1]}\n intercept : {best_model.intercept_}")


R squarred :-223768366.18200096


In [None]:
y_pred = y_scaler.inverse_transform(lsvr.predict(X_test))
