# Linear Regression and Multiple Linear Regression

- Simple linear regression has only one independent variable (x), while multiple linear regression has two or more independent variables

- The model learns the slope ($m$) and intercept ($b$) parameters from the training data

$$
y = mx + b
$$

- This model assumes a linear relationship between the independent and dependent variables
 

In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd

# Unfitted estimator for the single-feature model; it is trained further down
# with lm.fit(X, Y), where X holds LotArea and Y holds SalePrice.
lm = LinearRegression()


In [None]:
def download_data(url):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    url : str or file-like
        Location of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(url)
    return frame

# Load the training table, then carve out the single predictor and the target.
df = download_data("http://dataworkshop123.s3.amazonaws.com/train.csv")

# A list selector keeps each selection a one-column DataFrame (2-D), not a Series.
X = df.loc[:, ['LotArea']]
Y = df.loc[:, ['SalePrice']]

In [None]:
# fit() returns the estimator itself, so training and in-sample prediction chain.
# predict() must be given the same feature columns the model was fitted on.
yhat = lm.fit(X, Y).predict(X)

In [None]:
# Show the in-sample predictions returned by lm.predict(X) in the previous cell.
yhat

In [None]:
# Report the fitted parameters of y = mx + b: intercept_ is b, coef_ is m.
print(f"Intercept: {lm.intercept_}", f"Slope: {lm.coef_}", sep="\n")

### Multiple Linear Regression

$$
y = b + m_{1}x_{1} + m_{2}x_{2} + \dots + m_{n}x_{n}
$$

In [None]:
# Two predictors this time; the target column is the same as before.
X = df.loc[:, ['LotArea', 'OverallQual']]
Y = df.loc[:, ['SalePrice']]

In [None]:
# Train a fresh estimator on both features (fit() hands back the estimator),
# then predict on the same rows it was trained on.
lm_multi = LinearRegression().fit(X, Y)
yhat_multi = lm_multi.predict(X)

# Last expression of the cell: display the in-sample predictions.
yhat_multi

In [None]:
import seaborn as sns
# regplot() works on 1-D vectors, so pass a Series and a flattened prediction
# array explicitly instead of relying on (n, 1) inputs being squeezed implicitly.
ax = sns.regplot(x=X['OverallQual'], y=yhat_multi.ravel())

# Label both axes so the figure can be read on its own; ';' hides the repr.
ax.set(xlabel='OverallQual', ylabel='Predicted SalePrice');

In [None]:
# sns.residplot() fits its *own* regression of y on x and plots those residuals,
# so passing the model's predictions as y does not show lm_multi's errors.
# Plot the actual residuals (observed - predicted) of the fitted model instead.
residuals = (Y['SalePrice'] - yhat_multi.ravel()).rename('Residual (actual - predicted)')
ax = sns.scatterplot(x=X['OverallQual'], y=residuals)

# Zero line: points above it were under-predicted, points below over-predicted.
ax.axhline(0, color='k', linestyle='--', linewidth=1);

In [None]:
# Histogram of the observed sale prices in the training data.
sns.displot(df, x='SalePrice', kind='hist')


In [None]:
# Density of the observed prices. `fill=` replaces the deprecated `shade=` flag.
ax1 = sns.kdeplot(data=df, x='SalePrice', fill=True, label='Actual SalePrice')

# Overlay the density of the multi-feature predictions on the same axes.
# The (n, 1) prediction array is flattened so it is treated as one variable,
# and both curves are labelled so they can be told apart in the legend.
sns.kdeplot(x=yhat_multi.ravel(), ax=ax1, label='Predicted SalePrice')
ax1.legend();


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
sns.set(style = "darkgrid")

fig = plt.figure()
ax = fig.add_subplot(projection = '3d')

x = X[['LotArea']]
y = X[['OverallQual']]
z = yhat_multi

ax.set_xlabel("LotArea")
ax.set_ylabel("OverallQual")
ax.set_zlabel("Prediction")

# Flatten the single-column inputs into plain 1-D coordinate vectors.
xs, ys, zs = (np.asarray(values).ravel() for values in (x, y, z))
ax.scatter(xs, ys, zs)

# plot_surface() needs 2-D coordinate grids; (n, 1) columns contain no grid
# cells, so no surface would be drawn. Build a 2x2 grid spanning the data's
# x/y extent and place the floor at the current lower z-limit (read after the
# scatter so it reflects the plotted predictions).
floor_x, floor_y = np.meshgrid([xs.min(), xs.max()], [ys.min(), ys.max()])
floor_z = np.full(floor_x.shape, ax.get_zlim()[0])
ax.plot_surface(floor_x, floor_y, floor_z, alpha = 1)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the two-feature model on its own training data.
mean_squared_error(y_true=Y['SalePrice'], y_pred=yhat_multi)

In [None]:
# Same metric for the single-feature (LotArea only) predictions, for comparison.
mean_squared_error(y_true=Y['SalePrice'], y_pred=yhat)

In [None]:
## R^2 - How much of the variance can be explained by the model
# score(X, y) predicts from X and compares against y, so y must be the observed
# target. Passing the model's own predictions would always yield exactly 1.0.
lm_multi.score(df[['LotArea', 'OverallQual']], df[['SalePrice']])

How can we make sure this model is the best one for this data?

- See if predictions make sense
- Numerical Evaluations
- Visualization
- Try different models

## Pipelines

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Steps run in order: standardise the features, expand them into degree-2
# polynomial terms, then fit a linear regression on the expanded features.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps=Input)

# fit() returns the pipeline, so in-sample prediction can be chained onto it.
yhat_pipe = pipe.fit(X, Y).predict(X)


In [None]:
# Show the in-sample predictions produced by pipe.predict(X) in the previous cell.
yhat_pipe

In [None]:
# Fetch test.csv from the same host as train.csv, via the download_data helper.
test_df = download_data("http://dataworkshop123.s3.amazonaws.com/test.csv")
# Last expression of the cell, so the notebook renders the loaded frame.
test_df

In [None]:
# Select the same two feature columns the pipeline was fitted on, then predict.
test_features = test_df[['LotArea', 'OverallQual']]
yhat_test = pipe.predict(test_features)

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation of the two-feature model; yields one score per fold.
scores = cross_val_score(estimator=lm_multi, X=X, y=Y, cv=3)


In [None]:
# Average of the per-fold scores returned by cross_val_score.
scores.mean()

### Hyperparameter Optimization (Grid Search)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to try for Ridge's `alpha`.
parameters1=[{'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]

ridge=Ridge()

# Evaluate every alpha with 3-fold cross-validation on the current X / Y.
grid = GridSearchCV(ridge, parameters1, cv=3)
grid.fit(X, Y)

# Only a cell's last expression is echoed, so a bare `grid.best_estimator_`
# in the middle of the cell is silently discarded; print it explicitly.
print(f"Best estimator: {grid.best_estimator_}")

# Full cross-validation results; the cell displays the mean test score per alpha.
scores=grid.cv_results_
scores['mean_test_score']

In [None]:
# cv_results_ is a dict of parallel arrays (one entry per candidate alpha);
# rendering it as a DataFrame gives one readable row per candidate.
pd.DataFrame(scores)
