# 1- Import the necessary libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 2- Load train.csv from the house-price-prediction-challenge dataset

In [None]:
# Read the raw training table from the Kaggle input directory and display it.
dataframe_train = pd.read_csv(
    filepath_or_buffer="../input/house-price-prediction-challenge/train.csv",
)
dataframe_train

# 3- Load test.csv from the house-price-prediction-challenge dataset

In [None]:
# Read the raw test table from the Kaggle input directory and display it.
dataframe_test = pd.read_csv(
    filepath_or_buffer="../input/house-price-prediction-challenge/test.csv",
)
dataframe_test

# 4- Drop the unrelated columns, which don't affect the house price, from train.csv of the house-price-prediction-challenge dataset

In [None]:
# Build a reduced copy of the training table without the columns that are
# treated as irrelevant for the price; the original frame is left untouched.
df_train = dataframe_train.drop(
    columns=[
        "UNDER_CONSTRUCTION",
        "RERA",
        "BHK_NO.",
        "POSTED_BY",
        "READY_TO_MOVE",
        "RESALE",
        "BHK_OR_RK",
        "ADDRESS",
    ],
)
df_train

# 5- Inspect scatter plots of the dataset to judge which approach is likely to give the highest r2_score and the lowest Root Mean Squared Error (RMSE)

# As the scatter plots show, the relationship between the price and the other variables is nonlinear. For this reason, linear regression is expected to give a high RMSE and a low r2_score, whereas Decision Tree and Random Forest are expected to yield a lower RMSE and a higher r2_score.

In [None]:
# Scatter every candidate feature against the target price, one panel per
# feature, side by side in a single wide figure.
plt.figure(figsize=(30, 6))
target_column = 'TARGET(PRICE_IN_LACS)'
for position, feature in enumerate(('SQUARE_FT', 'LONGITUDE', 'LATITUDE'), start=1):
    # subplot(1, 3, k) is the explicit form of the 131 / 132 / 133 shorthand.
    plt.subplot(1, 3, position)
    plt.scatter(df_train[feature], df_train[target_column])
    plt.xlabel(feature)
    plt.ylabel(target_column)
# Render explicitly so the cell output is the figure itself rather than the
# repr of the object returned by the last pyplot call.
plt.show()

# 6- Drop the unrelated columns, which don't affect the house price, from test.csv of the house-price-prediction-challenge dataset

In [None]:
# Build a reduced copy of the test table without the columns that are treated
# as irrelevant for the price; the original frame is left untouched.
df_test = dataframe_test.drop(
    columns=[
        "UNDER_CONSTRUCTION",
        "RERA",
        "BHK_NO.",
        "POSTED_BY",
        "READY_TO_MOVE",
        "RESALE",
        "BHK_OR_RK",
        "ADDRESS",
    ],
)
df_test

# 7- Separate the independent variables into X_train and the dependent variable (price) into y_train

In [None]:
# Convert the reduced training frame to a NumPy array, then split it by
# position: the first three columns are the features, the fourth is the target.
df_train_values = df_train.to_numpy()
X_train, y_train = df_train_values[:, :3], df_train_values[:, 3]

# 8- check the shape of X_train and y_train

In [None]:
# Display the (rows, columns) shape of the feature matrix and of the target.
(X_train.shape, y_train.shape)

# 9- reshape the y_train array

In [None]:
# Turn the target into a single-column 2-D array (one row per sample) and
# display both shapes again to confirm the change.
y_train = np.reshape(y_train, (-1, 1))
(X_train.shape, y_train.shape)

# 10- Extract the test features into X_test

In [None]:
# Convert the reduced test frame to a NumPy array and keep its first three
# columns as the feature matrix, then display its shape.
df_test_values = df_test.to_numpy()
X_test = df_test_values[:, :3]
X_test.shape

# 11- create the LinearRegression() model for training

In [None]:
# Instantiate an ordinary least-squares model and fit it on the training data.
lin_reg_model = LinearRegression()
lin_reg_model.fit(X=X_train, y=y_train)

# 12- Make predictions on X_train with the trained LinearRegression() model so that the true price can be compared with the predicted price

In [None]:
# Predict prices for the same rows the linear model was trained on and
# display the resulting array.
y_prediction = lin_reg_model.predict(X=X_train)
y_prediction

# 13- Check the Root Mean Squared Error (RMSE)

In [None]:
# RMSE of the linear model on the training data: square root of the mean
# squared difference between true and predicted prices.
rmse = np.sqrt(
    mean_squared_error(y_true=y_train, y_pred=y_prediction)
)
rmse

# 14- check r2_score

In [None]:
# Coefficient of determination of the linear model on the training data.
r2_score(y_true=y_train, y_pred=y_prediction)

# 15- Add the linear regression predictions as a new column at the end of the train dataframe and compare the true price with the predicted price

In [None]:
# Store the linear-regression predictions for the training rows as a new
# column of the full (un-reduced) training frame, then display the frame so
# the predictions can be read next to the other columns.
dataframe_train['predicted_price_linear_regression']=y_prediction
dataframe_train

# 16- make a prediction using X_test with trained model

In [None]:
# Predict prices for the test features with the fitted linear model and
# display the resulting array.
y_prediction_test = lin_reg_model.predict(X=X_test)
y_prediction_test

# 17- Add the linear regression predictions as a new column at the end of the test dataframe

In [None]:
# Store the linear-regression predictions for the test rows as a new column
# of the full (un-reduced) test frame, then display the frame.
dataframe_test['predicted_price_linear_regression']=y_prediction_test
dataframe_test

# 18- Create a DecisionTreeRegressor(), train it with X_train and y_train, make predictions on X_train and X_test with the trained decision_tree_model, and finally check the RMSE and r2_score

In [None]:
# Fit a decision tree on the training data. The seed is fixed (same value as
# the random forest below in this notebook uses) so that repeated runs build
# the same tree and report the same scores.
decision_tree_model = DecisionTreeRegressor(random_state=7)
decision_tree_model.fit(X_train, y_train)

# Predictions on the training rows (used for the scores below) and on the
# test rows (kept for later use).
y_prediction_tree = decision_tree_model.predict(X_train)
y_prediction_tree_test = decision_tree_model.predict(X_test)

# NOTE: both scores are computed on the data the tree was trained on, so they
# measure how well the tree fits the training set, not how it generalises.
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_prediction_tree)))
print("R_square:", r2_score(y_train, y_prediction_tree))

# 19- Add the decision tree predictions as a new column at the end of the train dataframe and compare the true price, the linear regression predicted price and the decision tree predicted price

In [None]:
# Store the decision-tree predictions for the training rows as a new column
# of the full training frame, then display the frame.
dataframe_train['predicted_price_decision_tree_regression']=y_prediction_tree
dataframe_train

# 20- Add the decision tree predictions as a new column at the end of the test dataframe

In [None]:
# Store the decision-tree predictions for the test rows as a new column of the
# full test frame, then display the frame. The column is named exactly like
# its counterpart in the training frame so both tables share one naming scheme.
dataframe_test['predicted_price_decision_tree_regression'] = y_prediction_tree_test
dataframe_test

# 21- Create a RandomForestRegressor(), train it with X_train and y_train, make predictions on X_train and X_test with the trained random_forest_model, and finally check the RMSE and r2_score

In [None]:
# Fit a seeded 200-tree random forest on the training data.
random_forest_model = RandomForestRegressor(n_estimators=200, random_state=7)
# The forest expects a 1-D target; y_train was reshaped to a column vector
# earlier in the notebook, so flatten it here to avoid scikit-learn's
# DataConversionWarning. ravel() is a no-op if y_train is already 1-D.
random_forest_model.fit(X_train, y_train.ravel())

# Predictions on the training rows (used for the scores below) and on the
# test rows (kept for later use).
y_prediction_random_forest = random_forest_model.predict(X_train)
y_prediction_random_forest_test = random_forest_model.predict(X_test)

# NOTE: both scores are computed on the data the forest was trained on, so
# they measure fit to the training set, not generalisation.
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_prediction_random_forest)))
print("R_square:", r2_score(y_train, y_prediction_random_forest))

# 22- Add the random forest predictions as a new column at the end of the train dataframe and compare the true price, the linear regression predicted price, the decision tree predicted price and the random forest predicted price

In [None]:
# Store the random-forest predictions for the training rows as a new column of
# the full training frame, then display the frame. The column is named exactly
# like its counterpart in the test frame (no stray "decision" in the name).
dataframe_train['predicted_price_random_forest_regression'] = y_prediction_random_forest
dataframe_train

# 23- Add the random forest predictions as a new column at the end of the test dataframe

In [None]:
# Store the random-forest predictions for the test rows as a new column of the
# full test frame, then display the frame.
dataframe_test['predicted_price_random_forest_regression']=y_prediction_random_forest_test
dataframe_test