In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
sc = StandardScaler()
mx = MinMaxScaler()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files, then preview the first training rows.
train_csv_path = "../input/house-price-prediction-challenge/train.csv"
test_csv_path = "../input/house-price-prediction-challenge/test.csv"
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
train_data.head()

In [None]:
train_data.describe().T

In [None]:
train_data.info()

In [None]:
# Names of the training columns whose dtype is 'object'.
obj_cols = [
    column_name
    for column_name, column_dtype in train_data.dtypes.items()
    if column_dtype == 'object'
]
obj_cols

# Preprocessing the train and test datasets

## 1. Plotting graphs of continuous variables and getting rid of outliers
## 2. Getting the city name from address
## 3. Converting BHK or RK into categorical variables
## 4. Cleaning the dataset by imputing the missing values and fixing obj type datatypes into float or int

In [None]:
import plotly.express as px

# Histogram of the target column on the unfiltered training data.
raw_prices = train_data['TARGET(PRICE_IN_LACS)']
fig = px.histogram(raw_prices, x='TARGET(PRICE_IN_LACS)')
fig.show()

In [None]:
# Histogram of SQUARE_FT on the unfiltered training data.
raw_square_ft = train_data['SQUARE_FT']
fig1 = px.histogram(raw_square_ft, x='SQUARE_FT')
fig1.show()

## The histograms show a lot of outliers that distort the plots (e.g. houses priced above 5k lacs, i.e. 50 crores, and extreme square-foot values), so we remove them

In [None]:
# The below function cleans the dataset and label encodes the categorical columns
def data_cleaning(data, max_price=99, max_square_ft=2900):
    """Return a cleaned, label-encoded copy of a raw train or test DataFrame.

    The input frame is not modified, so the calling cell can be re-run safely.

    Processing steps, in order:
      1. ``ADDRESS`` is reduced to the text after its last comma.
      2. Every column named in the notebook-level ``obj_cols`` list is
         label-encoded (values are cast to ``str`` first).
      3. ``BHK_OR_RK``, ``POSTED_BY``, ``UNDER_CONSTRUCTION``, ``LONGITUDE``
         and ``LATITUDE`` are dropped.
      4. Only when ``TARGET(PRICE_IN_LACS)`` is present: rows whose price
         exceeds ``max_price`` or whose ``SQUARE_FT`` exceeds
         ``max_square_ft`` are removed.
      5. ``SQUARE_FT`` is cast to ``int``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns named above.
    max_price : float, default 99
        Largest ``TARGET(PRICE_IN_LACS)`` value that is kept.
    max_square_ft : float, default 2900
        Largest ``SQUARE_FT`` value that is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame. Filtered rows keep their original index labels
        (the index is not reset).
    """
    target_col = 'TARGET(PRICE_IN_LACS)'

    # Work on a copy: the original version dropped columns in place, which
    # mutated the caller's frame and made a second run fail with KeyError.
    cleaned = data.copy()

    # Vectorised replacement for the per-row chained assignment
    # (data['ADDRESS'][i] = ...), which depends on a 0..n-1 index and does not
    # write back at all under pandas copy-on-write.
    cleaned['ADDRESS'] = cleaned['ADDRESS'].str.split(",").str[-1]

    # NOTE(review): the encoder is re-fitted on every call, so the integer codes
    # produced for the train frame and the test frame are independent of each
    # other; the same ADDRESS value may receive different codes in each.
    encoder = LabelEncoder()
    for col in obj_cols:
        cleaned[col] = encoder.fit_transform(cleaned[col].astype(str))

    cleaned = cleaned.drop(
        columns=["BHK_OR_RK", "POSTED_BY", "UNDER_CONSTRUCTION", "LONGITUDE", "LATITUDE"]
    )

    # Outlier rows are only removed when the target column is available.
    if target_col in cleaned.columns:
        within_limits = (cleaned[target_col] <= max_price) & (cleaned['SQUARE_FT'] <= max_square_ft)
        cleaned = cleaned[within_limits]

    # astype returns a new frame, so nothing is assigned onto a filtered slice.
    return cleaned.astype({"SQUARE_FT": int})
# Clean the training frame first, then the test frame, and preview the cleaned test rows.
data_train, data_test = (data_cleaning(raw_frame) for raw_frame in (train_data, test_data))
data_test.head(10)

In [None]:

# Histogram of SQUARE_FT on the cleaned training data.
clean_square_ft = data_train['SQUARE_FT']
fig1 = px.histogram(clean_square_ft, x='SQUARE_FT')
fig1.show()

In [None]:

# Histogram of the target column on the cleaned training data.
clean_prices = data_train['TARGET(PRICE_IN_LACS)']
fig1 = px.histogram(clean_prices, x='TARGET(PRICE_IN_LACS)')
fig1.show()

## *Now from the charts you can see that the dataset is more evenly distributed, with considerably fewer outliers that negatively affect performance*

# **EDA on the cleaned dataset**

In [None]:
# Split the cleaned frame by position: the last column becomes the target,
# every column before it becomes a model input.
last_col = data_train.shape[1] - 1
input_data = data_train.iloc[:, :last_col]
target_data = data_train.iloc[:, last_col]

In [None]:
input_data.head()

In [None]:
sns.pairplot(data_train)

In [None]:
# Fit the standard scaler on the training inputs and scale them in one step.
scaled_inputs = sc.fit_transform(input_data)

In [None]:
#mx.fit(input_data)
#scaled_inputs = mx.transform(input_data)  #StandardScaler gives better results


### We'll try different types of learning:-
1. Linear regression
2. ElasticNet regression
3. XGBoost
4. Neural Networks

## *1.Multiple linear regression*

In [None]:
# Hold out part of the scaled training data so the models below can be compared.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2
# (passing it raises TypeError). The inputs are already standardised by `sc`,
# so the default estimator is used.
lr = LinearRegression()

# Fixed seed: without it each run scores the models on a different random split,
# so the reported errors cannot be reproduced or compared between runs.
train_inputs, test_inputs, train_targets, test_targets = train_test_split(
    scaled_inputs, target_data, random_state=42
)


In [None]:
test_targets.head()

In [None]:
# Fit on the training split and predict the held-out split (fit returns the estimator).
preds = lr.fit(train_inputs, train_targets).predict(test_inputs)

In [None]:
# Table of linear-regression predictions, actual prices and absolute percentage error.
lr_actual = np.array(test_targets)
d = pd.DataFrame({
    'preds': preds,
    'real_price': lr_actual,
    '%_error': np.absolute((preds - lr_actual) / lr_actual) * 100,
})
d

In [None]:
d.describe().T

In [None]:
from sklearn.metrics import mean_squared_error

# Root of the mean squared error between held-out prices and linear-regression predictions.
lr_mse = mean_squared_error(test_targets, preds)
rmse = np.sqrt(lr_mse)
print("RMSE: %f" % rmse)

## *2. ElasticNet regression*

In [None]:
from sklearn.linear_model import ElasticNet

# l1_ratio of 0.85 gave the best results.
# `normalize` was removed from ElasticNet in scikit-learn 1.2 (it was False here,
# so dropping it does not change the fit). `selection='random'` updates a randomly
# chosen coefficient each iteration, so a seed is set to make the fit reproducible.
elr = ElasticNet(l1_ratio=0.85, selection='random', random_state=42)
elr.fit(train_inputs, train_targets)
predse = elr.predict(test_inputs)

In [None]:
# Table of ElasticNet predictions, actual prices and absolute percentage error.
en_actual = np.array(test_targets)
es = pd.DataFrame({
    'preds': predse,
    'real_price': en_actual,
    '%_error': np.absolute((predse - en_actual) / en_actual) * 100,
})
es

In [None]:

es.describe().T

In [None]:
# Root mean squared error of the ElasticNet predictions on the held-out split.
# The variable keeps its original name; only the printed label is corrected
# from "RSME" to "RMSE" so it matches the other model cells.
rsme = np.sqrt(mean_squared_error(test_targets, predse))
print("RMSE: {}".format(rsme))

### As you can see the linear regression model is underperforming so we'll try other methods

## *3. Using the xgboost regressor*

In [None]:
# 3. XGBoost regressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Hyper-parameters for the XGBoost regressor, collected in one place before construction.
xgb_settings = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg = xgb.XGBRegressor(**xgb_settings)

In [None]:
# Fit the booster on the training split and predict the held-out split (fit returns the estimator).
preds2 = xg_reg.fit(train_inputs, train_targets).predict(test_inputs)

In [None]:

# Table of XGBoost predictions, actual prices and absolute percentage error.
xgb_actual = np.array(test_targets)
e = pd.DataFrame({
    'preds2': preds2,
    'real_price': xgb_actual,
    '%_error': np.absolute((preds2 - xgb_actual) / xgb_actual) * 100,
})
e

In [None]:
e.describe().T

In [None]:
# Root of the mean squared error between held-out prices and XGBoost predictions.
xgb_mse = mean_squared_error(test_targets, preds2)
rmse = np.sqrt(xgb_mse)
print("RMSE: %f" % rmse)

### *4. Using Deep neural networks*

In [None]:
import tensorflow as tf
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np

In [None]:
# Scale the test features with the scaler that was already fitted on the training inputs.
# Re-fitting `sc` on the test set would standardise it with different means and variances
# than the data the models were trained on, and would also overwrite the fitted training
# statistics held by the shared `sc` object.
# transform() returns a new array, so `data_test` itself is left unchanged.
data_test1 = sc.transform(data_test)

In [None]:
# Write the train, validation and test arrays to .npz archives in the working directory;
# the next cell reloads them for the neural network.
npz_payloads = {
    'data_train': dict(inputs=train_inputs, targets=train_targets),
    'data_validation': dict(inputs=test_inputs, targets=test_targets),
    'data_test': dict(inputs=data_test1),
}
for archive_name, arrays in npz_payloads.items():
    np.savez(archive_name, **arrays)

In [None]:
# Reload the saved arrays and cast them to floating point for the network.
# `np.float` was removed in NumPy 1.24; `np.float64` is the dtype it used to alias.
with np.load('data_train.npz') as npz:
    train_inputs_tf = npz['inputs'].astype(np.float64)
    # The targets are prices for a regression model trained with an MSE loss, so they
    # stay floating point. Casting them to int8 would truncate the fractional part
    # (and wrap around for any value above 127).
    train_targets_tf = npz['targets'].astype(np.float64)

# Validation split: same casts as the training split.
with np.load('data_validation.npz') as npz:
    validation_inputs_tf = npz['inputs'].astype(np.float64)
    validation_targets_tf = npz['targets'].astype(np.float64)

# The test archive only contains inputs.
with np.load('data_test.npz') as npz:
    test_inputs_tf = npz['inputs'].astype(np.float64)


In [None]:
# Build the network layer by layer: two 32-unit ReLU hidden layers and a single linear output.
model = tf.keras.Sequential()
for hidden_units in (32, 32):
    model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))
model.add(tf.keras.layers.Dense(1))

# Mean squared error is used both as the training loss and as the reported metric.
model.compile(optimizer='RMSprop', loss='mse', metrics=['mse'])

In [None]:
# Stop training once the monitored validation quantity has not improved for 2 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

fit_options = dict(
    epochs=15,
    callbacks=[early_stopping],
    validation_data=(validation_inputs_tf, validation_targets_tf),
    verbose=1,
)
model.fit(train_inputs_tf, train_targets_tf, **fit_options)

In [None]:
preds_tf = model.predict(validation_inputs_tf)

In [None]:
preds_tf = preds_tf.reshape(validation_inputs_tf.shape[0],)

In [None]:
preds_tf.shape

In [None]:
# Table of neural-network predictions, actual prices and absolute percentage error.
nn_actual = np.array(test_targets)
tft = pd.DataFrame({
    'preds_tf': preds_tf,
    'real_price': nn_actual,
    '%_error': np.absolute((preds_tf - nn_actual) / nn_actual) * 100,
})
tft

In [None]:
tft.sort_values(by=['%_error'], inplace=True,ascending = False)


In [None]:
tft.head(15)

In [None]:
tft.describe().T

In [None]:
# Root of the mean squared error between held-out prices and neural-network predictions.
nn_mse = mean_squared_error(test_targets, preds_tf)
rmse = np.sqrt(nn_mse)
print("RMSE: %f" % rmse)

# **Since the TensorFlow model had the lowest RMSE and mean error, I chose the neural network model for the test predictions**

In [None]:
test_preds = model.predict(test_inputs_tf)

In [None]:
test_preds

In [None]:
# Flatten the (n_rows, 1) prediction array to 1-D. Using -1 lets NumPy infer the length
# instead of hard-coding 68720, which breaks as soon as the test set has another size.
answers = test_preds.reshape(-1)

In [None]:
answers_r = np.round(answers,1) 
answers_r


In [None]:
sample_submission= pd.read_csv('../input/house-price-prediction-challenge/sample_submission.csv')

In [None]:
submission = sample_submission.copy()

In [None]:
submission['TARGET(PRICE_IN_LACS)'] = answers_r

In [None]:
submission.head(10)

In [None]:
#submission.to_csv('submission.csv')