### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
sns.set_style('darkgrid')

%matplotlib inline

### Importing Data

In [2]:
# Load the 'cleaned1' training and test CSVs from the ../Data directory
# (paths are relative to the notebook's working directory).
df_train = pd.read_csv(filepath_or_buffer='../Data/cleaned1_train.csv')
df_test = pd.read_csv(filepath_or_buffer='../Data/cleaned1_test.csv')

In [3]:
# Remove the 'Unnamed: 0' column from the training frame
# (presumably the index written out by an earlier to_csv call - confirm).
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it once the column
# is already gone no longer raises KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Remove the 'Unnamed: 0' column from the test frame
# (presumably the index written out by an earlier to_csv call - confirm).
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it once the column
# is already gone no longer raises KeyError.
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

### Convert all column names to lowercase and replace spaces with underscores

In [5]:
# Normalise the training column labels: lower-case them first, then swap
# every space for an underscore.
df_train.columns = (
    df_train.columns
    .str.lower()
    .str.replace(' ', '_')
)

In [6]:
# Normalise the test column labels the same way: lower-case them first, then
# swap every space for an underscore.
df_test.columns = (
    df_test.columns
    .str.lower()
    .str.replace(' ', '_')
)

### Setting up X and y

In [7]:
# Feature matrix: the two predictor columns from the training frame.
X = df_train.loc[:, ['gr_liv_area', 'overall_qual']]
# Target vector: the sale price column.
y = df_train.loc[:, 'saleprice']

### Instantiating our PolynomialFeatures object

In [8]:
# Degree-2 expansion: the original columns, their squares and the pairwise
# interaction term; the constant (bias) column is left out.
poly = PolynomialFeatures(
    include_bias=False,
    interaction_only=False,
    degree=2,
)

### Fitting and Transforming the Data

In [9]:
# Fit the expander on X, then expand X itself into its polynomial terms.
X_overfit = poly.fit(X).transform(X)

In [13]:
# Show the names of the generated columns, built from X's column labels.
poly.get_feature_names_out(input_features=X.columns)

array(['gr_liv_area', 'overall_qual', 'gr_liv_area^2',
       'gr_liv_area overall_qual', 'overall_qual^2'], dtype=object)

### Creating Train Test Splits

In [11]:
# Split the polynomial features and the target into train/test partitions,
# seeded so the split is reproducible.
# NOTE(review): test_size=0.7 places 70% of the rows in the TEST partition,
# so the models below are fit on only 30% of the data. This looks deliberate
# (the matrix is named X_overfit) - confirm it was not meant to be 0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X_overfit, y, random_state=42, test_size=0.7
)


### Scaling Our Data

In [14]:
# Learn the scaling statistics from the training partition only, then apply
# that same scaling to both partitions.
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

### Linear Regression

In [15]:
# Ordinary least squares baseline; the intercept is fitted (the default).
ols = LinearRegression(fit_intercept=True)


In [16]:
# Fit the OLS model on the scaled training features.
ols.fit(X=Z_train, y=y_train)

LinearRegression()

### Checking How the Model Scores on the Training and Test Data

In [17]:
# OLS score on the training partition, then on the test partition.
tuple(
    ols.score(features, target)
    for features, target in ((Z_train, y_train), (Z_test, y_test))
)

(0.7911096492244158, 0.7645601585485928)

In [18]:
# alpha times the sum of the squared OLS coefficients, i.e. the size an L2
# penalty term would have at this alpha for the unregularised fit.
alpha = 0.001
alpha * np.sum(np.square(ols.coef_))

45258105.953449935

In [19]:
# Sum of the squared OLS coefficients on its own (no alpha multiplier).
np.sum(np.square(ols.coef_))

45258105953.449936

### Instantiating Ridge

In [20]:
# Ridge regression with a fixed regularisation strength of 100.
ridge = Ridge(alpha=100)

In [21]:
# Fit the ridge model on the scaled training features.
ridge.fit(X=Z_train, y=y_train)

Ridge(alpha=100)

In [22]:
# Ridge score on the training partition, then on the test partition.
tuple(
    ridge.score(features, target)
    for features, target in ((Z_train, y_train), (Z_test, y_test))
)

(0.7096273608279441, 0.7599302104497923)

In [23]:
# Re-display the OLS train/test scores for comparison with the ridge scores.
tuple(
    ols.score(features, target)
    for features, target in ((Z_train, y_train), (Z_test, y_test))
)

(0.7911096492244158, 0.7645601585485928)

In [24]:
from sklearn.linear_model import RidgeCV

### Setting Up a List of Ridge Alphas to Check

In [25]:
# Candidate ridge penalties: 100 values spaced evenly on a log scale from
# 10**0 up to 10**5.
alphas = np.logspace(start=0, stop=5, num=100)

In [26]:
# Ridge with built-in alpha selection: every candidate in `alphas` is
# evaluated with 5-fold cross-validation.
ridge_cv = RidgeCV(cv=5, alphas=alphas)

### Cross Validating Over Our List of Ridge Alphas

In [None]:
# Run the cross-validated search on the scaled training features; the
# trailing semicolon suppresses the estimator repr.
ridge_cv.fit(X=Z_train, y=y_train);

### The Cross-Validation Score

In [None]:
# Cross-validated score obtained with the selected alpha.
ridge_cv.best_score_