### Linear Regression Model
This notebook trains a linear regression model on the training data and evaluates it on the validation data. I will first use the model's default parameters and then use the optimal parameters found by a random search.

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
#loading the training and validation splits (features and targets)
#note: Path(PARENT).parent evaluates to ".", so every path below is resolved
#relative to the current working directory, i.e. "../Data/<file name>"
PARENT = "Predicting_House_Prices"
data_dir = Path(PARENT).parent / "../Data"

#training features
path = data_dir / "X_train_model2.csv"
X_train_model2 = pd.read_csv(path)

#validation features
path2 = data_dir / "X_valid_model2.csv"
X_valid_model2 = pd.read_csv(path2)

#training targets
path3 = data_dir / "y_train_model2.csv"
y_train_model2 = pd.read_csv(path3)

#validation targets
path4 = data_dir / "y_valid_model2.csv"
y_valid_model2 = pd.read_csv(path4)

In [3]:
#dropping the "Unnamed: 0" column from both feature frames
#errors="ignore" keeps this cell idempotent: running it again after the column
#has already been removed is a no-op instead of raising a KeyError
X_train_model2 = X_train_model2.drop(columns="Unnamed: 0", errors="ignore")
X_valid_model2 = X_valid_model2.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
#fitting a baseline linear regression that uses the estimator's default settings
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#build the estimator first, then fit it in place on the training split
lin_reg1 = LinearRegression()
lin_reg1.fit(X_train_model2, y_train_model2)

In [5]:
#validation metrics for the default model: mean squared error and R^2

#predictions for the validation features
preds = lin_reg1.predict(X_valid_model2)

#compute both metrics up front, then report them
mse_default = mean_squared_error(y_valid_model2, preds)
r2_default = lin_reg1.score(X_valid_model2, y_valid_model2)

print("Mean Squared Error:", format(mse_default))
print("R Squared:", format(r2_default))

Mean Squared Error: 1.132393868503543e+31
R Squared: -3.907535024590304e+18


In [6]:
# random search for parameters

from sklearn.model_selection import RandomizedSearchCV
import random

#one seed, used for Python's RNG and (more importantly) for the search itself:
#RandomizedSearchCV does not read the state set by random.seed, it only becomes
#reproducible through its own random_state argument
SEARCH_SEED = 3467
random.seed(SEARCH_SEED)

# base estimator for the randomized search
#`normalize` is no longer passed: it was deprecated in scikit-learn 1.0 and
#removed in 1.2 (passing it raises a TypeError there); False was its default
#the search works on clones of this estimator, so the fit below does not
#influence the search - it only leaves lin_reg2 itself fitted, as before
lin_reg2 = LinearRegression(
    n_jobs=None
).fit(X_train_model2, y_train_model2)

#initializing possible parameters
params = {
    "fit_intercept": [True, False],
    "copy_X": [True, False],
    "positive": [True, False]
}
#number of iterations
#every parameter is given as a list, so candidates are drawn without
#replacement and there are only 2 * 2 * 2 = 8 distinct combinations; asking
#for more than that only produces a warning, so the request is capped
n_combinations = 1
for options in params.values():
    n_combinations *= len(options)
n_iter_search = min(400, n_combinations)
random_search = RandomizedSearchCV(
    lin_reg2, param_distributions=params,
    n_iter=n_iter_search,
    cv=3, n_jobs=-1,
    random_state=SEARCH_SEED
)
#fitting the random search
random_search.fit(X_train_model2, y_train_model2)
#printing the results
rand_opt = random_search.best_params_
print(rand_opt)



{'positive': True, 'fit_intercept': False, 'copy_X': True}


In [7]:
#fitting the model with the parameters selected by the random search
#the settings come straight from rand_opt instead of being re-typed from the
#printed output, so this model always matches the search that was just run
lin_reg3 = LinearRegression(**rand_opt, n_jobs=None).fit(X_train_model2, y_train_model2)

In [8]:
#validation metrics for the tuned model: mean squared error and R^2

#predictions for the validation features
preds2 = lin_reg3.predict(X_valid_model2)

#compute both metrics up front, then report them
mse_tuned = mean_squared_error(y_valid_model2, preds2)
r2_tuned = lin_reg3.score(X_valid_model2, y_valid_model2)

print("Mean Squared Error:", format(mse_tuned))
print("R Squared:", format(r2_tuned))

Mean Squared Error: 1527797430748.0488
R Squared: 0.565542998603146


In [9]:
#listing every feature column of the training frame, one name per line
feature_columns = X_train_model2.columns
for column_name in feature_columns:
    print(column_name)

yearBuilt
livingArea
bathrooms
bedrooms
parking
garageSpaces
hasGarage
pool
spa
isNewConstruction
hasPetsAllowed
state_CA
state_GA
city_"ONeals"
city_Abbeville
city_Acampo
city_Acton
city_Acworth
city_Adairsville
city_Adel
city_Adelanto
city_Adin
city_Adrian
city_Agoura
city_Agoura Hills
city_Agua Dulce
city_Ahwahnee
city_Ailey
city_Alameda
city_Alamo
city_Alapaha
city_Albany
city_Albion
city_Alderpoint
city_Alhambra
city_Aliso Viejo
city_Allenhurst
city_Alma
city_Alpaugh
city_Alpharetta
city_Alpine
city_Alta
city_Alta Loma
city_Altadena
city_Alto
city_Alturas
city_Alviso
city_Amador City
city_American Canyon
city_Americus
city_Anaheim
city_Anaheim Hills
city_Anderson
city_Angels Camp
city_Angelus Oaks
city_Angwin
city_Annapolis
city_Antelope
city_Antioch
city_Apple Valley
city_Applegate
city_Appling
city_Aptos
city_Arabi
city_Aragon
city_Arcadia
city_Arcata
city_Armuchee
city_Arnold
city_Arnoldsville
city_Arrowbear Lake
city_Arrowhead
city_Arroyo Grande
city_Artesia
city_Arvin
city_As

In [10]:
# subsetting for only the necessary variables
#the feature list is written once so the training and validation frames are
#guaranteed to end up with the same columns in the same order
selected_features = [
    "livingArea", "bathrooms", "bedrooms", "garageSpaces",
    "pool", "isNewConstruction", "state_CA", "state_GA",
]
#selecting by label raises a KeyError for a missing or misspelled column,
#whereas DataFrame.filter would silently leave that column out
X_train_model3 = X_train_model2[selected_features]
X_valid_model3 = X_valid_model2[selected_features]


In [11]:
#fitting the model with limited features
#reuses the settings chosen by the random search (rand_opt) rather than a
#hand-typed copy of them, so this model stays in sync with the search result
lin_reg4 = LinearRegression(**rand_opt, n_jobs=None).fit(X_train_model3, y_train_model2)

In [12]:
#validation metrics for the reduced-feature model: mean squared error and R^2

#predictions for the reduced validation features
preds4 = lin_reg4.predict(X_valid_model3)

#compute both metrics up front, then report them
mse_reduced = mean_squared_error(y_valid_model2, preds4)
r2_reduced = lin_reg4.score(X_valid_model3, y_valid_model2)

print("Mean Squared Error:", format(mse_reduced))
print("R Squared:", format(r2_reduced))

Mean Squared Error: 1870310820674.5044
R Squared: 0.42071167689635236
