In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

In [2]:
houseprice_df=pd.read_csv("house_price_data.csv")
df_copy=houseprice_df.copy()
y=houseprice_df['SalePrice']
houseprice_df.drop(columns='SalePrice',inplace=True)
#houseprice_df


In [3]:
na_df=pd.DataFrame({'column_name':houseprice_df.columns,'na_count':houseprice_df.isnull().sum(),
                   'na_pc':houseprice_df.isnull().sum()/houseprice_df.shape[0]*100})
col_grtr50na=list(na_df[na_df['na_pc']>50]['column_name'].values)
col_to_drop=[]
for col in houseprice_df.columns:
    if len(houseprice_df[col].value_counts())==1 or len(houseprice_df[col].value_counts())==houseprice_df.shape[0]:
        col_to_drop.append(col)

col_to_drop=col_to_drop+col_grtr50na
houseprice_df.drop(columns=col_to_drop,inplace=True)


In [4]:
#check for the correlation of each input variable with the output variable
correlated_cols=[]
#PCC check
for col in houseprice_df.columns:
    if houseprice_df[col].dtype=='int64' or houseprice_df[col].dtype=='float64':
        pcc=np.corrcoef(y,houseprice_df[col])[0][1]
        if abs(pcc)>0.5:
            correlated_cols.append(col)
#ANOVA test
for col in houseprice_df.columns:
    if houseprice_df[col].dtype=='object':
        catogories_list=list(houseprice_df[col].value_counts().index)
        result=f_oneway(*(df_copy[df_copy[col]==category]['SalePrice'] for category in catogories_list))
        if result[1]<0.05:
            correlated_cols.append(col)

corr_df=houseprice_df[correlated_cols]
#corr_df.shape

In [5]:
x_train,x_test,y_train,y_test=train_test_split(corr_df,y,test_size=0.2,random_state=21)
#x_train

In [6]:
#filling missing values(done after train-test-split)
for col in x_train.columns:
    if x_train[col].dtype=='int64' or x_train[col].dtype=='float64':
        x_train[col].fillna(x_train[col].mean(),inplace=True)
        x_test[col].fillna(x_train[col].mean(),inplace=True)
    if x_train[col].dtype=='object':
        x_train[col].fillna(x_train[col].mode()[0],inplace=True)
        x_test[col].fillna(x_train[col].mode()[0],inplace=True)        
        
#x_train.isnull().sum()        

In [7]:
#scaling the continuous variables
scaler=StandardScaler() #creating object
con_cols=[col for col in x_train.columns if x_train[col].dtype=='int64' or x_train[col].dtype=='float64']
for col in con_cols:
    x_train[col]=scaler.fit_transform(np.array(x_train[col]).reshape(-1,1))
    x_test[col]=scaler.transform(np.array(x_test[col]).reshape(-1,1))

#x_train[con_cols]


In [8]:
#one-hot encoding the categorical values
oe_train=pd.get_dummies(x_train[[col for col in x_train.columns if x_train[col].dtype=='object']])
oe_test=pd.get_dummies(x_test[[col for col in x_test.columns if x_test[col].dtype=='object']])
oe_train_final,oe_test_final=oe_train.align(oe_test,join='inner',axis=1)

In [9]:
x_train_final=pd.concat([x_train[con_cols],oe_train_final],axis=1)
x_test_final=pd.concat([x_test[con_cols],oe_test_final],axis=1)

In [10]:
linreg=LinearRegression()
linreg.fit(x_train_final,y_train)
linreg_pred=linreg.predict(x_test_final)

In [11]:
#to measure performance we have 2 approaches 
#using linearregression class
linreg.score(x_train_final,y_train)

0.9063092561412962

In [12]:
linreg.score(x_test_final,y_test)

0.8115567971244987

In [13]:
#using metrics
mean_squared_error(y_test,linreg_pred)

1277138376.9794521

In [14]:
r2_score(y_test,linreg_pred)

0.8115567971244987

In [15]:
#we do align in one-hot encoding because there is every probability of getting new category in test data and which is not present in train data
#Which in turn leads to extra columns in test data and gets error when model is performed on test data even the model is good on train data
#So inorder to eliminate those extra columns we align both train and test data which returns 2 dataframes having same column names
#below is example
train=['Hyderabad','Hyderabad','Chennai','Chennai','Blore','Pune','Pune']

pd.get_dummies(train)

Unnamed: 0,Blore,Chennai,Hyderabad,Pune
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
5,0,0,0,1
6,0,0,0,1


In [16]:
test=['Ahmedabad','Hyderabad','Chennai','Pune','Mumbai','Hyderabad','Blore','Blore','Ahmedabad']

pd.get_dummies(test)

Unnamed: 0,Ahmedabad,Blore,Chennai,Hyderabad,Mumbai,Pune
0,1,0,0,0,0,0
1,0,0,0,1,0,0
2,0,0,1,0,0,0
3,0,0,0,0,0,1
4,0,0,0,0,1,0
5,0,0,0,1,0,0
6,0,1,0,0,0,0
7,0,1,0,0,0,0
8,1,0,0,0,0,0
