In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
import os  # NOTE(review): `os` is not used in any cell visible here — confirm before removing

# Read the Ames housing data set; the path is resolved relative to the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="AmesHousing.csv")

In [3]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

(2930, 82)

In [5]:
# One-hot encode every object/categorical column of the raw frame; numeric
# columns pass through unchanged.
# The dtype is pinned explicitly: pandas >= 2.0 emits boolean dummy columns by
# default, which would turn the 0/1 indicators shown below into True/False and
# make `.values` on the mixed frame fall back to an object array. uint8 keeps
# the indicators numeric on every pandas version.
df1 = pd.get_dummies(df, dtype=np.uint8)
df1.head()

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,1,526301100,20,141.0,31770,6,5,1960,1960,112.0,...,0,0,0,1,0,0,0,0,1,0
1,2,526350040,20,80.0,11622,5,6,1961,1961,0.0,...,0,0,0,1,0,0,0,0,1,0
2,3,526351010,20,81.0,14267,6,6,1958,1958,108.0,...,0,0,0,1,0,0,0,0,1,0
3,4,526353030,20,93.0,11160,7,5,1968,1968,0.0,...,0,0,0,1,0,0,0,0,1,0
4,5,527105010,60,74.0,13830,5,5,1997,1998,0.0,...,0,0,0,1,0,0,0,0,1,0


In [6]:
# Names of the columns that contain at least one NaN.
# Only the last expression of a cell is displayed, so this list is computed
# but not shown in the cell output.
nan_mask = df1.isna().any()
df1.columns[nan_mask].tolist()

# Per-column count of NaN values (this is the value the cell displays)
df1.isna().sum()

Order                       0
PID                         0
MS SubClass                 0
Lot Frontage              490
Lot Area                    0
                         ... 
Sale Condition_AdjLand      0
Sale Condition_Alloca       0
Sale Condition_Family       0
Sale Condition_Normal       0
Sale Condition_Partial      0
Length: 307, dtype: int64

In [7]:
def impute_median(series):
    """Return ``series`` with its missing values replaced by the series median.

    The median is computed by ``series.median()`` and the gaps are filled with
    ``fillna``, which returns a new Series; the input is not modified.
    """
    median_value = series.median()
    return series.fillna(value=median_value)
# Numeric columns whose missing values are replaced by the column median.
median_imputed_columns = [
    'Lot Frontage',
    'Mas Vnr Area',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Bsmt Full Bath',
    'Bsmt Half Bath',
    'Garage Cars',
    'Garage Area',
]

# NOTE(review): the medians are computed over the full data set, i.e. before
# the train/test split performed in a later cell, so test rows influence the
# imputed values. Consider fitting the imputation on the training rows only.
for column in median_imputed_columns:
    df1[column] = df1[column].transform(impute_median)

#Check remaining columns with NaN values
df1.columns[df1.isna().any()].tolist()

['Garage Yr Blt']

In [8]:
# 'Garage Yr Blt' is the only column still containing NaN after the median
# imputation, so it is removed rather than imputed.
df2 = df1.drop(columns=['Garage Yr Blt'])


In [9]:
target_column = 'SalePrice'

# Feature matrix X: every column except the target, as a NumPy array.
# NOTE(review): 'Order' and 'PID' are still part of X; they look like row and
# parcel identifiers rather than predictive features — confirm and consider
# dropping them.
X = df2.drop(columns=[target_column]).values

# Target array y: the sale prices, as a NumPy array.
y = df2[target_column].values

In [10]:
# (rows, feature columns) of the feature matrix
X.shape

(2930, 305)

In [11]:
# Shape of the target array before it is reshaped into a column
y.shape

(2930,)

In [12]:
# Turn the target into a single-column 2-D array; -1 lets NumPy infer the
# number of rows.
y = np.reshape(y, (-1, 1))
y.shape

(2930, 1)

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [15]:
# Build the linear regression estimator and fit it on the training split
# (fit returns the fitted estimator itself).
LR = LinearRegression().fit(X_train, y_train)

# Report the learned parameters. The first message is Vietnamese for
# "The linear regression model has been trained, with parameters:".
print("Mô hình hồi quy tuyến tính đã được huấn luyện, có các tham số:")
print("Intercept =", LR.intercept_)
print("Coefficients:", LR.coef_)

Mô hình hồi quy tuyến tính đã được huấn luyện, có các tham số:
Intercept = [-2991353.05285091]
Coefficients: [[ 2.91525289e+00  5.09552521e-06 -9.58085749e+01  5.58719525e+01
   6.87156440e-01  6.29529927e+03  6.03226927e+03  3.69597140e+02
   3.10492569e+01  2.05862641e+01  3.90754730e+02  3.83869658e+02
   3.70544828e+02 -3.47313693e+02  1.77627550e+01  3.15233332e+01
  -1.60771510e+01  3.32089163e+01  5.19265397e+02 -2.50368977e+02
   2.35546845e+03  1.48057693e+03 -3.46583919e+03 -1.32271765e+04
   4.72073460e+01  5.54546021e+03  5.57766490e+03  7.29141674e+00
   1.43182197e+01  7.31505862e-01  4.17224255e+00  1.98061668e+01
   4.79586128e+01 -9.15212109e+01 -8.95695064e-01  8.43802293e+01
   1.02514117e+03  1.45276675e+04 -2.41747544e+04 -3.88227782e+03
   3.81612162e+04 -2.73078347e+03 -7.21031420e+03 -1.46907538e+04
  -1.15437447e+04  1.15437447e+04 -8.79446809e+02 -1.50055627e+03
   1.54886438e+02  4.91066294e+03 -5.15884909e+03  9.32997060e+01
  -8.33247815e+02  5.90117868e+03

In [16]:
# Predict on the held-out test features with the fitted regressor; the
# result is compared against y_test in the evaluation cell.
y_prediction = LR.predict(X_test)

In [18]:
# Evaluate the test-set predictions: R2, mean squared error, and its square
# root (RMSE). The MSE is computed once and reused for the RMSE.
score = r2_score(y_test, y_prediction)
mse = mean_squared_error(y_test, y_prediction)
rmse = np.sqrt(mse)

print('R2-score is ', score)
print('Mean_sqrd_error is==', mse)
print('Root_mean_squared error of is==', rmse)

R2-score is  0.8955304226952503
Mean_sqrd_error is== 734361363.6168584
Root_mean_squared error of is== 27099.102634900264
