# HOUSE PRICE PREDICTION

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import boxcox, boxcox_normmax
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.metrics import mean_absolute_error , r2_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression , Ridge , Lasso, RidgeCV
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.preprocessing import OneHotEncoder ,RobustScaler
from xgboost import XGBRegressor
import warnings


warnings.filterwarnings("ignore")

In [2]:
# read the data 
train_df=pd.read_csv("./archive/train.csv")
test_df=pd.read_csv("./archive/test.csv")

In [3]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# DATA CLEANING

In [5]:
# see th duplicated data 
train_df.duplicated().sum()

0

In [6]:
# calculate the number of the nan value 
train_df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [7]:
columns_to_drop = []
isnull_columns = []

def remove_high_null_ratio_columns(df: pd.DataFrame, split='train',  threshold = 0.01):
    global columns_to_drop
    if split == 'train':
        total = train_df.isnull().sum().sort_values(ascending=False)
        percent = (train_df.isnull().sum() / train_df.isnull().count()).sort_values(ascending=False)
        missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
        columns_to_drop = missing_data[missing_data['Percent'] > threshold].index
        
    df = df.drop(columns=columns_to_drop)
    return df

def add_isnull_columns(df: pd.DataFrame, split='train'):
    global isnull_columns
    if split == 'train':
        isnull_columns.clear()
        
        for col in df.columns:
            if df[col].isnull().sum() > 0:
                isnull_columns.append(col)
    for col in isnull_columns:
        df[f'{col}_isnull'] = df[col].isnull().astype(int)
    return df

def fillna(df: pd.DataFrame, split='train'):
    numerical_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
    df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))
    return df


In [8]:
clean_df = remove_high_null_ratio_columns(train_df)
clean_df = add_isnull_columns(clean_df)
clean_df = fillna(clean_df)


clean_test_df = remove_high_null_ratio_columns(test_df, 'test')
clean_test_df = add_isnull_columns(clean_test_df, 'test')
clean_test_df = fillna(clean_test_df, 'test')

In [9]:
# print the shape of data
train_df.shape

(1460, 81)

In [10]:
clean_df = clean_df.apply(LabelEncoder().fit_transform)

# FEATURE ENGINEERING

In [11]:
x = clean_df.drop(columns=['Id', 'SalePrice'])
y = clean_df['SalePrice']



In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# MODELLING

## KNN

In [13]:
# knn object with the hyper parameter n_neighbors=5 , weights="uniform" & metric="manhattan" 
knn = KNN(n_neighbors=5,weights="uniform",metric="manhattan")

# fit the model with x_train & y_train
knn.fit(x_train,y_train)

# train score 
knn.score(x_train,y_train)

0.8634787919694806

## LINEAR REGRESSION

In [14]:
lr=LinearRegression()
lr.fit(x_train,y_train)
lr.score(x_train,y_train)

0.9130004863137138

In [15]:
# calculate the mean score error to the linear regression model 
scores = cross_val_score(lr,x , y, cv=6, scoring='r2')

print("Cross-Validation Scores:", scores)
print("Mean R^2 Score:", scores.mean())

Cross-Validation Scores: [0.92187447 0.92761728 0.86426145 0.91508888 0.92003969 0.87081779]
Mean R^2 Score: 0.903283259407213


## XGBOOST

In [16]:
#  xgboost object
xgb = XGBRegressor(n_estimators= 2000 , max_depth= 7 , learning_rate = 0.01)
# fit the model
xgb.fit(x_train , y_train);
# print the accuracy of model
print ("train accuracy",xgb.score(x_train , y_train))
print ("test accuracy",xgb.score(x_test , y_test))

train accuracy 0.9997277910323369
test accuracy 0.9108398770333933


# SUBMISSION

In [17]:
clean_test_df = clean_test_df.apply(LabelEncoder().fit_transform)

X_test = clean_test_df.drop(columns=['Id'])

y_pred = xgb.predict(X_test)

In [18]:
submission_df = test_df[['Id']].copy()
submission_df['SalePrice'] = y_pred

submission_df.to_csv("./archive/submission.csv", index=False)