In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_csv = "../input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "../input/house-prices-advanced-regression-techniques/test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Explore the data.
# info() prints dtypes and non-null counts (useful for spotting null values) and
# returns None, so it goes first; describe() is left as the last expression so the
# notebook actually displays the summary statistics instead of discarding them.
train.info()
train.describe()

The training set contains 1,460 rows and 81 columns: 38 numerical columns and 43 object (categorical) columns. Some of the columns contain missing (null) values.

In [None]:
# step 2
# visualize data
# import the visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# A value of 0 turns off matplotlib's "too many open figures" warning; the
# per-column plotting loops later in the notebook open one figure per column.
plt.rcParams.update({'figure.max_open_warning': 0})
# Render figures inline in the notebook output.
%matplotlib inline

Before visualizing, it is helpful to separate the columns into categorical and numerical groups, so that each group can be plotted and analysed more easily.

In [None]:
# Partition the training columns by dtype: int64/float64 columns are treated as
# numerical, every other dtype as categorical.
nc = [name for name in train.columns
      if train[name].dtype in ("int64", 'float64')]        # numerical column names
cc = [name for name in train.columns
      if train[name].dtype not in ("int64", 'float64')]    # categorical column names

ncd = train[nc]   # numerical columns dataset
ccd = train[cc]   # categorical columns dataset

1. Numerical data visualizations

In [None]:
# Disabled experiment: scatter plot of every numerical column against SalePrice.
# The loop is wrapped in a string literal, so it is not executed.
#nc.remove("SalePrice")
"""for i in nc:
    sns.relplot(data=train, x=i,y='SalePrice')"""

In [None]:
# Correlation of the numerical variables with each other and with SalePrice.
# The figure is deliberately large so the annotated cells stay readable.
plt.figure(figsize=(60,50))
ax = sns.heatmap(ncd.corr(), annot=True, fmt=".2f", cmap='cool')
ax.set_title("Correlation matrix of numerical variables")
# Show the figure explicitly rather than printing the Axes object's repr.
plt.show()

2. Categorical visualization

In [None]:
# One count plot per categorical column; tick labels are rotated by 30 degrees.
for column in cc:
    grid = sns.catplot(data=train, x=column, kind='count', height=5, aspect=1.5)
    grid.set_xticklabels(rotation=30)


In [None]:
# Box plot of SalePrice for each level of every categorical (object) column.
for column in cc:
    sns.catplot(data=train, x=column, y='SalePrice', kind='box')

Data Cleaning: this step deals with missing (null) values.



In [None]:
# Count null values per column, largest first, and print "column , count" pairs.
obj = train.isnull().sum().sort_values(ascending=False)
# Series.iteritems() was removed in pandas 2.0; items() is the supported equivalent.
for key, value in obj.items():
    print(key, ",", value)

In [None]:
# Drop the columns that have more than 1,000 null values.
sparse_columns = ['PoolQC', 'MiscFeature', 'Fence', 'Alley']
train = train.drop(columns=sparse_columns)


In [None]:
# Fill missing numerical values with the column mean and missing categorical
# values with the column mode (first mode when there are ties).
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on
# `train[col]` is chained assignment, which warns on recent pandas and leaves the
# frame unchanged when copy-on-write is enabled.
for col in train:
    if not train[col].isnull().any():
        continue
    if col in nc:
        train[col] = train[col].fillna(train[col].mean())
    elif col in cc:
        train[col] = train[col].fillna(train[col].mode().iloc[0])

Feature Engineering

In [None]:
# Remove categorical variables in which a single level accounts for more than 95%
# of the non-null values (shares are measured on the raw categorical snapshot `ccd`).
# `cc` still lists columns that were already dropped from `train`, so only columns
# that are actually present are selected; otherwise drop() would raise a KeyError.
dominated_cols = [
    col for col in cc
    if col in train.columns
    and (ccd[col].value_counts().max() / ccd[col].count()) > 0.95
]
train = train.drop(columns=dominated_cols)

In [None]:
# Ordinal-encode every column of `train` whose dtype is not int64/float64.
# The encoder is refitted on each column separately.
from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder()
non_numeric_cols = [
    name for name in train.columns
    if train[name].dtype not in ("int64", 'float64')
]
for name in non_numeric_cols:
    column_2d = train[name].values.reshape(-1, 1)
    train[name] = ordinal.fit_transform(column_2d)

In [None]:
# Drop numerical variables whose correlation with SalePrice lies strictly
# between -0.1 and 0.1.
weak_cols = [
    name for name in nc
    if abs(train[name].corr(train['SalePrice'])) < 0.1
]
train.drop(columns=weak_cols, inplace=True)

In [None]:
# Remove SalePrice outliers with the 1.5 * IQR rule.
# Quartiles use midpoint interpolation. Series.quantile is used instead of
# np.percentile(..., interpolation=...) because that NumPy keyword is deprecated.
sale_price = train['SalePrice']
Q1 = sale_price.quantile(0.25, interpolation='midpoint')
Q3 = sale_price.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR
# Filter with a boolean mask aligned on the index. The previous approach passed
# positional indices from np.where to the label-based drop(), which is only correct
# while the index is an untouched 0..n-1 range, and errors='ignore' hid any mismatch.
# Rows at or beyond either limit are removed, as before.
inlier_mask = (sale_price > lower_limit) & (sale_price < upper_limit)
train = train.loc[inlier_mask].copy()

Modelling

In [None]:
#modelling
#import modules, we will use Linear Regression and Random forest regression model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


In [None]:
# Separate the target from the features, then hold out 30% of the rows for evaluation.
Y = train["SalePrice"]
X = train.drop(columns="SalePrice")
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, test_size=0.3
)

# Accumulates one [model name, train score, test score, gap] entry per model.
result = []

In [None]:
# Fit a linear regression and record its score on the train and held-out splits,
# together with the absolute gap between the two.
linearmodel = LinearRegression().fit(X_train, Y_train)
y_pred = linearmodel.predict(X_test)
lin_train_score = linearmodel.score(X_train, Y_train)
lin_test_score = linearmodel.score(X_test, Y_test)
score = [
    "Linear Regression",
    lin_train_score,
    lin_test_score,
    abs(lin_train_score - lin_test_score),
]
result.append(score)
print(result)

In [None]:
# Random Forest Regression Model
# random_state is pinned so that the fitted forest, its scores, its feature
# importances and the final predictions are reproducible from run to run.
rmodel = RandomForestRegressor(random_state=0)
rmodel.fit(X_train, Y_train)
y_pred = rmodel.predict(X_test)
# Score each split once and reuse the values for the train/test gap.
rf_train_score = rmodel.score(X_train, Y_train)
rf_test_score = rmodel.score(X_test, Y_test)
score1 = [
    "Random Regression",
    rf_train_score,
    rf_test_score,
    abs(rf_train_score - rf_test_score),
]
result.append(score1)
print(result)

In [None]:
# Clean the test data so it matches what the models were trained on.
from sklearn.preprocessing import OrdinalEncoder

# Keep the ids aside for the submission file. The guard makes the cell safe to
# re-run after 'Id' has already been removed from `test`.
if 'Id' in test.columns:
    index = pd.DataFrame(test['Id'])

# Keep exactly the features that are still present in the training frame, in the
# training column order, so the columns line up with the fitted models.
feature_cols = [col for col in train.columns if col != "SalePrice"]
test = test[feature_cols].copy()

# Local lists are used here so the notebook-level nc/cc/ncd/ccd (which describe the
# training data, and `ccd` is needed below) are not overwritten.
test_num_cols = [col for col in test.columns
                 if test[col].dtype in ("int64", 'float64')]
test_cat_cols = [col for col in test.columns if col not in test_num_cols]

# Fill in missing values: mean for numerical columns, mode for categorical ones.
# Results are assigned back because chained fillna(inplace=True) is unreliable.
for col in test_num_cols:
    if test[col].isnull().any():
        test[col] = test[col].fillna(test[col].mean())
for col in test_cat_cols:
    if test[col].isnull().any():
        test[col] = test[col].fillna(test[col].mode().iloc[0])

# Encode categorical variables with the categories seen in the TRAINING data.
# Fitting a fresh encoder on the test column would assign different integer codes
# whenever the two sets do not contain the same levels, so the model would read
# mislabelled categories. `ccd` is the raw categorical snapshot of the training set
# taken before encoding; its nulls are filled with the mode, as was done for `train`.
# Levels that never occurred in training are encoded as -1 instead of raising.
for col in test_cat_cols:
    train_levels = ccd[col].fillna(ccd[col].mode().iloc[0])
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    encoder.fit(train_levels.to_numpy().reshape(-1, 1))
    test[col] = encoder.transform(test[col].to_numpy().reshape(-1, 1)).ravel()

In [None]:
# The random forest is used from here on since it performed better than the linear model.
# Rank the features by the forest's importance scores (rounded to 3 decimals) and
# collect those whose rounded importance is below 0.01.
rounded_scores = np.round(rmodel.feature_importances_, 3)
importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rounded_scores})
    .sort_values('importance', ascending=False)
)
icl = importance[importance['importance'] < 0.01]
listc = icl['feature'].tolist()


In [None]:
"""# run the regression model with features that have importance >0.01
for col in listc:
    if col in train.columns:
        train.drop(col,axis=1,inplace=True)
        test.drop(col,axis=1,inplace=True)
ramodel=RandomForestRegressor()
ramodel.fit(X_train,Y_train)
y_pred=ramodel.predict(X_test)
score2=["Random Regression", ramodel.score(X_train,Y_train),ramodel.score(X_test,Y_test),abs(ramodel.score(X_train,Y_train)-ramodel.score(X_test,Y_test))]
result.append(score2)"""


In [None]:
# Predict on the prepared test data, attach the predictions to the saved ids and
# write the submission file.
predicted_prices = rmodel.predict(test).astype(float)
price_frame = pd.DataFrame({'SalePrice': predicted_prices})
prediction = index.join(price_frame)
prediction.to_csv("submission.csv", index=False)