# Module 7 - Linear Regression

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn

%matplotlib inline

In [None]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): load_boston is reported as deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the installed version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()

## Available Datasets
load_boston([return_X_y])	Load and return the boston house-prices dataset (regression). 

load_iris([return_X_y])	Load and return the iris dataset (classification). 

load_diabetes([return_X_y])	Load and return the diabetes dataset (regression). 

load_digits([n_class, return_X_y])	Load and return the digits dataset (classification). 

load_linnerud([return_X_y])	Load and return the linnerud dataset (multivariate regression). 

load_wine([return_X_y])	Load and return the wine dataset (classification). 

load_breast_cancer([return_X_y])	Load and return the breast cancer wisconsin dataset (classification).

In [None]:
# Top-level entries of the loaded dataset object:
#   data          = dataset
#   target        = dependent variable
#   feature_names = column headers
#   DESCR         = data dictionary
boston.keys()

In [None]:
# Dimensions of the feature matrix as (rows, columns): 506 rows, 13 columns.
boston.data.shape

In [None]:
# Show the names of the feature columns.
print(boston.feature_names)

In [None]:
# Show the dataset's descriptive text (its data dictionary).
print(boston.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame and preview its first five rows;
# the columns still carry default integer labels at this point.
bos = pd.DataFrame(data=boston.data)
bos.head(n=5)

In [None]:
# Label the columns with the dataset's feature names, then preview the result.
bos.columns = boston.feature_names
bos.head()

In [None]:
# Verify the first 5 values of the dependent variable (target) array.
boston.target[:5]

In [None]:
# Add the dependent variable to the DataFrame as a column named "PRICE".
bos['PRICE'] = boston.target

In [None]:
# Preview the first rows of the DataFrame again.
bos.head()

In [None]:
from sklearn.linear_model import LinearRegression

# Predictors: every column of the DataFrame except the PRICE target.
X = bos.drop(columns='PRICE')

# Create an (as yet unfitted) linear regression estimator and display it.
lm = LinearRegression()
lm

#### Important functions to keep in mind while fitting a linear regression model are: 
    
- lm.fit() -> fits a linear model 

- lm.predict() -> Predict Y using the linear model with estimated coefficients 

- lm.score() -> Returns the coefficient of determination (R^2). A measure of how well observed outcomes are replicated by the model, as the proportion of total variation of outcomes explained by the model. 

In [None]:
# Fit the model on every row: X holds the predictors, PRICE is the response.
lm.fit(X, bos['PRICE'])

In [None]:
# Report the intercept term of the fitted model.
print('Estimated intercept coefficient:', lm.intercept_)

In [None]:
# Report how many coefficients the fitted model holds.
print('Number of coefficients:', len(lm.coef_))

In [None]:
# Pair each predictor name with its estimated coefficient in a two-column table.
pd.DataFrame({'features': X.columns, 'estimatedCoefficients': lm.coef_})

In [None]:
# Scatter plot of house price against the average number of rooms (RM).
plt.scatter(x=bos['RM'], y=bos['PRICE'])
plt.title("Relationship between RM and Price")
plt.xlabel("Average number of rooms per dwelling (RM)")
plt.ylabel("Housing Price")
plt.show()

In [None]:
# First five predictions from the fitted model.
lm.predict(X)[:5]

In [None]:
# Scatter plot of the model's predictions against the observed prices.
plt.scatter(x=bos['PRICE'], y=lm.predict(X))
plt.title("Price vs Predicted Price")
plt.xlabel("Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# Mean squared error of the model over the full dataset:
# the average of the squared differences between observed and predicted price.
squared_errors = (bos['PRICE'] - lm.predict(X)) ** 2
mseFull = np.mean(squared_errors)
print(mseFull)

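## Do not split your train and test data this way

Taking the last 50 rows as the test set is not a random sample: if the rows are ordered in any way, the test set will not be representative of the data the model was trained on.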

In [None]:
# Naive positional split: hold out the final 50 rows as the test set and
# train on everything before them, then report the shape of each piece.
X_train, X_test = X.iloc[:-50], X.iloc[-50:]
Y_train, Y_test = bos['PRICE'].iloc[:-50], bos['PRICE'].iloc[-50:]
for subset in (X_train, X_test, Y_train, Y_test):
    print(subset.shape)

# Randomize train and test data

In [None]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.model_selection` submodule has been loaded, so attribute
# access through the package only works if something else imported it first.
from sklearn.model_selection import train_test_split

# Randomly assign rows to the train and test sets, holding out 33% for testing
# (test_size default = 0.25). The fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, bos.PRICE, test_size=0.33, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

In [None]:
# Refit a fresh model on the training rows only, then predict for both subsets.
lm = LinearRegression().fit(X_train, Y_train)
pred_train, pred_test = lm.predict(X_train), lm.predict(X_test)

In [None]:
# Mean squared error on the training rows and on the held-out test rows.
# Reuses pred_train / pred_test, which were computed right after fitting,
# instead of calling lm.predict() a second time on the same inputs.
print('Fit a model X_train, and calculate MSE with Y_train:', np.mean((Y_train - pred_train) ** 2))
print('Fit a model X_train, and calculate MSE with X_test, Y_test:', np.mean((Y_test - pred_test) ** 2))