In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition's train.csv into a DataFrame.
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
# Display the training frame.
df_train

In [None]:
# Per-column dtype and non-null count for the training frame.
df_train.info()

In [None]:
# Load the competition's test.csv into a DataFrame.
df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Display the test frame.
df_test

In [None]:
# Per-column dtype and non-null count for the test frame.
df_test.info()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Summarise the training columns by dtype: text (object), integer and float.
# `object_cols` is reused by the plotting cells further down.
col_dtypes = df_train.dtypes

object_cols = list(col_dtypes[col_dtypes == 'object'].index)
print("Categorical variables:")
print(object_cols)
print('-'*50)

int_cols = list(col_dtypes[col_dtypes == 'int'].index)
print("Integer variables:")
print(int_cols)
print('-'*50)

float_cols = list(col_dtypes[col_dtypes == 'float'].index)
print("Float variables:")
print(float_cols)  # previously only the heading was printed, never the list
print('-'*50)

# Backward-compatible alias: this cell used to leave `num_cols` bound to the
# float columns (the integer list was overwritten), so keep that binding.
num_cols = float_cols

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Column labels of the test frame.
df_test.columns

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Summary statistics of the test frame.
df_test.describe()

In [None]:
# Number of missing values in each training column.
df_train.isnull().sum()

In [None]:
# Number of missing values in each test column.
df_test.isnull().sum()

In [None]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index.
df = [df_train, df_test]
df_merged = pd.concat(df, ignore_index=True)

In [None]:
# Display the combined train + test frame.
df_merged 

In [None]:
# Number of missing values in each column of the combined frame.
df_merged.isnull().sum()

In [None]:
# Bar chart of how many distinct entries (as returned by Series.unique) each
# categorical column of the combined frame holds.
unique_values = [df_merged[name].unique().size for name in object_cols]

plt.figure(figsize=(18,6))
plt.title('Number Unique Values Of Categorical Features')
plt.xticks(rotation=90)
sns.barplot(x=object_cols,y=unique_values)

In [None]:
# Grid of bar charts: one panel per categorical column showing its value counts.
#
# The heading is attached to the figure itself (suptitle). Calling plt.title /
# plt.xticks before any subplot exists would target an implicit full-figure
# axes instead of the grid.
n_cols = 4
# Ceiling division so the grid always has room for every column
# (gives the former hard-coded 11 rows when there are 41-44 columns).
n_rows = -(-len(object_cols) // n_cols)

fig = plt.figure(figsize=(18,36))
fig.suptitle('Distribution Categorical Features')
for index, col in enumerate(object_cols, start=1):
    counts = df_merged[col].value_counts()
    plt.subplot(n_rows, n_cols, index)
    ax = sns.barplot(x=list(counts.index), y=counts)
    # Rotate after plotting so the setting applies to the category tick labels.
    ax.tick_params(axis='x', labelrotation=90)
# Keep the panels from overlapping while leaving a strip for the figure title.
fig.tight_layout(rect=(0, 0, 1, 0.98))

In [None]:
# Remove the Id column from the merged frame.
df_merged = df_merged.drop(columns='Id')
print('Drop Id \n')

# Impute MSZoning with its most frequent value and LotFrontage with its mean.
zoning_mode = df_merged['MSZoning'].mode()[0]
df_merged['MSZoning'] = df_merged['MSZoning'].fillna(zoning_mode)
frontage_mean = df_merged['LotFrontage'].mean()
df_merged['LotFrontage'] = df_merged['LotFrontage'].fillna(frontage_mean)

df_merged = df_merged.drop(columns='Alley')
print('Drop Alley \n')

# Show how the Utilities values are distributed, then drop the column
# (imputing it with the mode was the alternative considered here).
print(df_merged['Utilities'].value_counts())
df_merged = df_merged.drop(columns='Utilities')
print('Drop Utilities \n')

# Categorical columns whose gaps are filled with the column's most frequent value.
for column in ('Exterior1st', 'Exterior2nd', 'MasVnrType',
               'Electrical', 'KitchenQual', 'Functional'):
    most_common = df_merged[column].mode()[0]
    df_merged[column] = df_merged[column].fillna(most_common)

# Numeric veneer area: fill gaps with the column mean.
veneer_mean = df_merged['MasVnrArea'].mean()
df_merged['MasVnrArea'] = df_merged['MasVnrArea'].fillna(veneer_mean)

# Missing FireplaceQu is encoded as the literal category 'NA'.
df_merged['FireplaceQu'] = df_merged['FireplaceQu'].fillna('NA')
print('FirePlaceQu: Fill NA values for missing values \n')

# PoolQC: rows that report a pool (PoolArea > 0) but have no quality get the
# most frequent quality; every other missing PoolQC becomes the category 'NA'.
#
# The comparison needs its own parentheses: `&` binds tighter than `>`, so
# `isnull() & PoolArea > 0` is evaluated as `(isnull() & PoolArea) > 0`.
# Selecting by mask also replaces the hard-coded row label 2599, so every
# affected row is handled.
pool_without_quality = df_merged['PoolQC'].isnull() & (df_merged['PoolArea'] > 0)
df_merged.loc[pool_without_quality, 'PoolQC'] = df_merged['PoolQC'].mode()[0]; print('PoolQC: Use mode for missing value with non-zero PoolArea \n')
df_merged['PoolQC'] = df_merged['PoolQC'].fillna('NA'); print('PoolQC: Use NA for remaining missing values \n')

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form acts on an intermediate object and is not
# guaranteed to update df_merged. This also matches every other fill in this cell.
df_merged['SaleType'] = df_merged['SaleType'].fillna(df_merged['SaleType'].mode()[0])
df_merged.drop(columns=['Fence','MiscFeature','SalePrice'], inplace=True); print('Drop Fence, MiscFeature and SalePrice\n')

# Basement features: missing categorical values become the category 'NA',
# missing numeric values become 0.
print('Fill missing values of Basement features with NA or 0 \n')

for column in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    df_merged[column] = df_merged[column].fillna('NA')

for column in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
               'BsmtFullBath', 'BsmtHalfBath'):
    df_merged[column] = df_merged[column].fillna(0)

# Garage features: missing categorical values become the category 'NA',
# missing numeric values (including GarageYrBlt) become 0.
print('Fill missing values of Garage features with NA or 0 \n')

for column in ('GarageType', 'GarageFinish', 'GarageCond', 'GarageQual'):
    df_merged[column] = df_merged[column].fillna('NA')

for column in ('GarageCars', 'GarageArea', 'GarageYrBlt'):
    df_merged[column] = df_merged[column].fillna(0)

In [None]:
# Display the combined frame after the cleaning steps.
df_merged

In [None]:
# Total number of missing values left in the combined frame.
df_merged.isnull().sum().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Recompute the list of text (object-dtype) columns, this time on the cleaned
# combined frame, and report how many there are.
is_object = df_merged.dtypes == 'object'
object_cols = df_merged.columns[is_object].tolist()

print("Categorical variables:")
print(object_cols)
print('No. of. categorical features: ',len(object_cols))

In [None]:
# One-hot encode every object column and join the indicator columns to the
# remaining (non-object) columns.
#
# Request a dense array from the encoder. Current scikit-learn spells the
# keyword `sparse_output`; older releases only accept `sparse`, so fall back
# when the new name is rejected.
try:
    OH_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(sparse=False)

OH_column = pd.DataFrame(
    OH_encoder.fit_transform(df_merged[object_cols]),
    index=df_merged.index,                       # keep rows aligned with df_merged
    columns=OH_encoder.get_feature_names_out(),  # names generated by the encoder
)

df_final = pd.concat([df_merged.drop(object_cols, axis=1), OH_column], axis=1)
df_final.head()

In [None]:
print('Shape Of Final Dataset:', df_final.shape)
print('Shape Of Train Dataset:', df_train.shape)
print('Shape Of Test Dataset:',  df_test.shape)

# The combined frame was built as [df_train, df_test] with a fresh index, so
# the first len(df_train) rows are the training rows and the rest are the test
# rows. Deriving the boundary from df_train avoids a hard-coded row count.
n_train = len(df_train)
X_Train = df_final.iloc[:n_train].copy()
X_Test  = df_final.iloc[n_train:].copy()
Y_Train = df_train['SalePrice']

# Fail loudly if the feature matrices and the source frames ever disagree.
assert len(X_Train) == len(Y_Train), "X_Train and Y_Train must have the same number of rows"
assert len(X_Test) == len(df_test), "X_Test must have one row per test record"

print('\nCheck that the datasets are consistent:\n')
print('Shape Of X_Train Dataset', X_Train.shape)
print('Shape Of Y_Train Dataset:', Y_Train.shape)
print('Shape Of X_Test Dataset:',  X_Test.shape)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_Train,
    Y_Train,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline: random forest with default hyper-parameters, scored on the
# validation split. random_state is pinned (same seed as the train/validation
# split) so the reported metrics are repeatable across runs.
RFR = RandomForestRegressor(random_state=0)
RFR.fit(X_train, Y_train)
Y_pred = RFR.predict(X_valid)
print('\nRMSE: ', np.sqrt(mean_squared_error(Y_valid, Y_pred)))
print('\nMAE: ', mean_absolute_error(Y_valid, Y_pred))
print('\nR2: ', r2_score(Y_valid, Y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Baseline: gradient boosting with default hyper-parameters, scored on the
# validation split. random_state is pinned (same seed as the train/validation
# split) so the reported metrics are repeatable across runs.
GBR = GradientBoostingRegressor(random_state=0)
GBR.fit(X_train, Y_train)
Y_pred = GBR.predict(X_valid)
print('\nRMSE: ', np.sqrt(mean_squared_error(Y_valid, Y_pred)))
print('\nMAE: ', mean_absolute_error(Y_valid, Y_pred))
print('\nR2: ', r2_score(Y_valid, Y_pred))

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor with hand-picked settings, scored on the validation split.
XGBR = XGBRegressor(
    learning_rate=0.03,
    n_estimators=200,
    objective='reg:squarederror',
)
XGBR.fit(X_train, Y_train)
Y_pred = XGBR.predict(X_valid)

rmse = np.sqrt(mean_squared_error(Y_valid, Y_pred))
mae = mean_absolute_error(Y_valid, Y_pred)
r2 = r2_score(Y_valid, Y_pred)
print('\nRMSE: ', rmse)
print('\nMAE: ', mae)
print('\nR2: ', r2)

In [None]:
from sklearn.model_selection import GridSearchCV
# Untuned XGBoost regressor; passed to GridSearchCV as the base estimator.
model = XGBRegressor()

In [None]:
# Candidate values for each tuned XGBoost hyper-parameter.
n_estimators, learning_rates, objectives = (
    [100, 200, 500],
    [0.03,0.1,0.3],
    ['reg:squarederror'],
)

# Grid of hyper-parameter combinations to evaluate.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rates,
    objective=objectives,
)

# Search scored by (negated) mean absolute error; training scores are kept too.
grid_cv = GridSearchCV(
    model,
    hyperparameter_grid,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
)

# Fitted on the full labelled set (X_Train / Y_Train), not on the 80% split.
grid_cv.fit(X_Train,Y_Train)

In [None]:
# Best mean cross-validated score; with 'neg_mean_absolute_error' scoring this is the negated MAE.
grid_cv.best_score_

In [None]:
# Estimator configured with the best-scoring hyper-parameter combination.
grid_cv.best_estimator_

In [None]:
from sklearn.base import clone

# Model used for the submission: the best estimator found by the grid search,
# which was fitted on the full labelled set (X_Train / Y_Train).
regressor = grid_cv.best_estimator_

# X_valid is a subset of X_Train, so scoring `regressor` on it would report a
# training error. For an honest hold-out figure, fit an unfitted copy with the
# same hyper-parameters on the 80% split only and score that copy on X_valid.
# NOTE(review): the hyper-parameters themselves were still selected using
# cross-validation folds that include the X_valid rows.
holdout_model = clone(regressor)
holdout_model.fit(X_train, Y_train)
Y_pred = holdout_model.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
# Submission: predict on the test feature rows with the tuned regressor.
Y_Pred = regressor.predict(X_Test)
Y_Pred 

In [None]:
# Shape of the prediction array.
Y_Pred.shape

In [None]:
# Two-column submission table: the Id column of the test file next to its predicted SalePrice.
new = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': Y_Pred,
})

In [None]:
# Display the submission table.
new

In [None]:
# `iloc[:, 0:]` selects every row and every column, so new1 has the same content as `new`.
new1 = new.iloc[:, 0:]
new1

In [None]:
# Write the submission to Submission.csv without the index column.
new1.to_csv('Submission.csv',index=False)