In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files: training set, test set and the sample submission.
train_data = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv'
)
test_data = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/test.csv'
)
submission = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv'
)

In [None]:
# Number of columns in the training frame, then in the test frame.
for frame in (train_data, test_data):
    print(frame.shape[1])

In [None]:
# Preview the first five training rows.
train_data.head(n=5)


In [None]:
# Missing-value count per training column, largest first.
missing_per_column = train_data.isna().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Remove four columns from the training frame (presumably chosen from the
# missing-value counts printed above -- confirm), then summarise what is left.
dropped_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
train_data = train_data.drop(dropped_columns, axis=1)
train_data.describe()

In [None]:
# Fill gaps in every object-typed column with that column's most frequent value.
categorical_data = train_data.select_dtypes(include=['object']).columns
most_frequent = train_data[categorical_data].mode().iloc[0]
train_data[categorical_data] = train_data[categorical_data].fillna(most_frequent)
# Show the per-column modes of the filled frame.
train_data[categorical_data].mode()

In [None]:
# Re-check missing-value counts after the categorical fill, largest first.
remaining_missing = train_data.isna().sum()
print(remaining_missing.sort_values(ascending=False))

In [None]:
# Names of the float64/int64 columns, with the 'Id' column left out.
trai = train_data.drop(columns='Id')
numerical_data = trai.select_dtypes(include=['float64', 'int64']).columns


In [None]:
from sklearn.impute import KNNImputer

# Impute missing numeric values from the 15 nearest rows.
#
# The imputer has to see all numeric features at once: neighbour distances are
# computed from the *other* columns of a row. Fitting it on a single column at
# a time leaves nothing to measure distance with, so every gap would just be
# filled with that column's mean.
#
# 'SalePrice' is the prediction target, so it is kept out of the matrix to
# avoid leaking it into the imputed feature values.
imputer = KNNImputer(n_neighbors=15)
feature_columns = [col for col in numerical_data if col != 'SalePrice']
train_data[feature_columns] = imputer.fit_transform(train_data[feature_columns])

In [None]:
# One 20-bin histogram per numeric column, drawn on a 20x20 inch figure.
train_data.hist(bins=20, figsize=(20, 20))
plt.show()

In [None]:
# Replace every object-typed column with its integer category codes.
category_columns = train_data.select_dtypes(include=['object']).columns
print(category_columns)
as_category = train_data[category_columns].astype('category')
train_data[category_columns] = as_category.apply(lambda col: col.cat.codes)

# List the float columns, then coerce a handful of columns to numeric
# (values that cannot be parsed become NaN).
float_columns = train_data.select_dtypes(include=['float64']).columns
print(float_columns)
for column in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice'):
    train_data[column] = pd.to_numeric(train_data[column], errors='coerce')

# Finish with a uniform float64 frame.
train_data = train_data.astype('float64')

In [None]:
# Distribution plot of the SalePrice column.
sale_price = train_data['SalePrice']
sns.displot(sale_price)

In [None]:
# Pairwise correlations; display each column's correlation with SalePrice,
# strongest positive first.
correlation_matrix = train_data.corr()
correlation_matrix.loc[:, 'SalePrice'].sort_values(ascending=False)

In [None]:
# Heat map of the 30 columns with the largest SalePrice correlation.
correlation_num = 30
top_by_saleprice = correlation_matrix.nlargest(correlation_num, 'SalePrice')
correlation_cols = top_by_saleprice['SalePrice'].index
# Correlation coefficients recomputed on just those columns.
correlation_mat_sales = np.corrcoef(train_data[correlation_cols].values.T)

sns.set(font_scale=1.25)
f, ax = plt.subplots(figsize=(12, 9))
tick_labels = correlation_cols.values
hm = sns.heatmap(
    correlation_mat_sales,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 7},
    cbar=True,
    square=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.show()

In [None]:
# Target is SalePrice; features are every other column except 'Id'.
y = train_data['SalePrice']
x = train_data.drop(['SalePrice', 'Id'], axis=1)
print(x.shape[1])

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 split with a fixed seed; print the size of each part.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=60,
)
for part in (X_train, X_test):
    print(len(part))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import sklearn.metrics as sm

# Fit a 150-tree random forest on the training split and predict the held-out split.
forest_model = RandomForestRegressor(random_state=42, n_estimators=150)
forest_model.fit(X_train, Y_train)
price_predict = forest_model.predict(X_test)

# Error metrics on the held-out split.
forest_mse = mean_squared_error(Y_test, price_predict)
forest_SqMse = np.sqrt(forest_mse)
print("Mean Squared Error : " , forest_mse)
print("Root Mean Squared error: ", forest_SqMse)

# Goodness-of-fit scores, rounded to two decimals.
explained_variance = round(sm.explained_variance_score(Y_test, price_predict), 2)
r2 = round(sm.r2_score(Y_test, price_predict), 2)
print("Explain variance score for Random Forest Regression =", explained_variance)
print("R2 score for Random Forest Regression =", r2)

In [None]:
# Remove the same four columns from the test frame that were removed from the
# training frame, then summarise what is left.
dropped_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
test_data = test_data.drop(dropped_columns, axis=1)
test_data.describe()

In [None]:
# Fill gaps in every object-typed test column with that column's most frequent value.
categorical_data = test_data.select_dtypes(include=['object']).columns
most_frequent = test_data[categorical_data].mode().iloc[0]
test_data[categorical_data] = test_data[categorical_data].fillna(most_frequent)
# Show the per-column modes of the filled frame.
test_data[categorical_data].mode()

In [None]:
# Missing-value count per test column, largest first.
missing_per_column = test_data.isna().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Names of the float64/int64 test columns, with the 'Id' column left out.
trais = test_data.drop(columns='Id')
numerical_datas = trais.select_dtypes(include=['float64', 'int64']).columns


In [None]:
from sklearn.impute import KNNImputer

# Impute missing numeric values in the test frame from the 15 nearest rows.
#
# All numeric columns are passed to the imputer together: neighbour distances
# are computed from the other columns of a row, so fitting one column at a
# time leaves nothing to measure distance with and every gap would just be
# filled with that column's mean.
#
# NOTE(review): the imputer is fitted on the test rows themselves, as before.
# Reusing an imputer fitted on the training features would keep the two
# pipelines consistent -- consider that as a follow-up.
imputer = KNNImputer(n_neighbors=15)
numeric_columns = list(numerical_datas)
test_data[numeric_columns] = imputer.fit_transform(test_data[numeric_columns])

In [None]:
category_columns = test_data.select_dtypes(['object']).columns
print(category_columns)

# Encode the test categoricals with the TRAINING category levels.
#
# The model was fitted on integer codes derived from the training set's
# levels. Deriving codes from the test set's own levels would shift the
# integers whenever a level is absent from (or extra in) the test data, so the
# same label would mean different things at fit and predict time. The training
# frame has already been converted to codes by this point, so the levels are
# re-read from the raw training CSV; `astype('category')` yields the same
# sorted, non-null levels the training encoding used. A label never seen in
# training is encoded as -1.
# Assumes every object-typed test column is also a text column in train.csv.
train_levels_source = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv',
    usecols=list(category_columns),
)
for col in category_columns:
    train_levels = train_levels_source[col].astype('category').cat.categories
    test_data[col] = pd.Categorical(test_data[col], categories=train_levels).codes

float_columns = test_data.select_dtypes(['float64']).columns
print(float_columns)
# Coerce to numeric; values that cannot be parsed become NaN.
for col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

# Finish with a uniform float64 frame, matching the training frame.
test_data = test_data.astype('float64')

In [None]:
# List the float columns, then coerce the columns below to numeric
# (values that cannot be parsed become NaN) and cast the frame to float64.
float_columns = test_data.select_dtypes(include=['float64']).columns
print(float_columns)
for column in (
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'GarageCars',
    'GarageArea',
):
    test_data[column] = pd.to_numeric(test_data[column], errors='coerce')

test_data = test_data.astype('float64')

In [None]:
# Keep the ids aside; the model input is every other test column.
test_id = test_data['Id']
test_x = test_data.drop('Id', axis=1)
print(test_x.shape[1])

In [None]:
# Predict with the fitted random forest on the prepared test features.
# NOTE(review): assumes test_x has the same columns, in the same order, as the
# frame the model was fitted on -- confirm.
y_prdict = forest_model.predict(test_x)

In [None]:
# Write the predictions into the sample-submission frame and preview 20 rows.
# NOTE(review): relies on the sample submission rows being in the same order
# as the test rows -- confirm.
submission = submission.assign(SalePrice=y_prdict)
submission.head(n=20)

In [None]:
# Save the submission to the working directory without the index column.
submission.to_csv(path_or_buf='submission1.csv', index=False)