# Model Selection using Linear Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold

In [2]:
dataset = pd.read_csv ("house_prices_data_training_data.csv")

In [3]:
dataset.dropna(inplace=True)
X = dataset.iloc[:, 3:].values
y = dataset.iloc[:, 2].values

In [4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [5]:

regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [6]:
# polynomial regression model
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X)
  
X_poly     # prints X_poly
 
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly,y)

LinearRegression()

In [7]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[335131.67 269950.  ]
 [711345.23 785000.  ]
 [608984.67 830000.  ]
 ...
 [399265.27 342450.  ]
 [262470.9  340000.  ]
 [466063.41 415000.  ]]


In [8]:

r2_score(y_test, y_pred)

0.6955771369859194

# Regularized Linear regression

In [9]:

clf = Ridge(alpha=50.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
r2_score(y_test, y_pred)

[[347864.96 269950.  ]
 [706676.2  785000.  ]
 [599795.22 830000.  ]
 ...
 [379594.77 342450.  ]
 [267115.67 340000.  ]
 [442710.71 415000.  ]]


0.6906240324758514

In [10]:


k = 5
kf = KFold(n_splits=k, random_state=None)
model = LinearRegression()

acc_score = []

for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_test, y_pred)
    acc_score.append(acc)

    
avg_acc_score = sum(acc_score)/k

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))



accuracy of each fold - [0.7075834963594038, 0.6920232525272791, 0.6778073557636506, 0.704162514491026, 0.7106388481439538]
Avg accuracy : 0.6984430934570627


In [11]:
#Implementing cross validation


k = 5
kf = KFold(n_splits=k, random_state=None)
model = Ridge(alpha=1000.0)
model.fit(X_train, y_train)
acc_score = []

for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_test, y_pred)
    acc_score.append(acc)

    
avg_acc_score = sum(acc_score)/k

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.6521671283975006, 0.6457737006598172, 0.6410029785953085, 0.6648216476001577, 0.6739620537402322]
Avg accuracy : 0.6555455017986033


In [12]:

accuracies = cross_val_score(estimator=clf,X = X_train,y=y_train,cv=10)
accuracies.mean()
accuracies.std()

0.018981248760250563

In [13]:

model = LinearRegression()
model.fit(X_train, y_train)
acc_score = []

for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_test, y_pred)
    acc_score.append(acc)

    
avg_acc_score = sum(acc_score)/k

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.7075834963594038, 0.6920232525272791, 0.6778073557636506, 0.704162514491026, 0.7106388481439538]
Avg accuracy : 0.6984430934570627


In [14]:
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
acc_score = []
model = LinearRegression()
    
print(skf)
StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_test, y_pred)
    acc_score.append(acc)

    
avg_acc_score = sum(acc_score)/k
print('accuracy of each fold - {}'.format(acc_score))
print(" Average accuracy Score ", avg_acc_score)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
accuracy of each fold - [0.7088679556374595, 0.681167249164758]
 Average accuracy Score  0.2780070409604435




In [15]:
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
acc_score = []
model = Ridge(alpha=50.0)
model.fit(X_train, y_train)    
print(skf)
StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = r2_score(y_test, y_pred)
    acc_score.append(acc)

    
avg_acc_score = sum(acc_score)/k
print('accuracy of each fold - {}'.format(acc_score))
print(" Average accuracy Score ", avg_acc_score)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)




accuracy of each fold - [0.6987624574032811, 0.681023806182988]
 Average accuracy Score  0.2759572527172538
