# Real Estate Price Prediction using scikit-learn

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walked top-down), then print the paths one per line.
kaggle_input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load the required data file

In [None]:
# Read the real-estate CSV from the Kaggle input directory into a DataFrame
# and preview its first five rows (the default for `head`).
DATA_PATH = '../input/real-estate-price-prediction/Real estate.csv'
data = pd.read_csv(DATA_PATH)
data.head()

1. These are the first 5 rows of the data.
2. To see the last 5 rows, use the `tail()` function.
3. To see a few random rows, use the `sample()` function, e.g. `data.sample(5)`.

In [None]:
# Print a summary of the frame: column dtypes, row/column counts,
# non-null counts per column and memory usage.
data.info()

`info()` shows a summary of the data: data types, number of rows and columns, non-null counts and memory usage.

In [None]:
# Display summary statistics for the columns of the frame.
data.describe()

`describe()` gives you statistical information about the data.

In [None]:
# Count the missing (null) entries in each column.
data.isna().sum()

Total number of null values (missing data) in each column.

## Data Preprocessing

In [None]:
# Remove the 'No' column. `columns=` targets columns explicitly (equivalent to
# axis=1; the default axis=0 would target row labels instead).
# The result is re-bound to `data` rather than dropped in place, and
# errors='ignore' makes the cell idempotent: re-running it after 'No' has
# already been removed no longer raises a KeyError.
data = data.drop(columns=['No'], errors='ignore')

In [None]:
# Map the dataset's original column headers to shorter, more readable names.
COLUMN_RENAMES = {
    'X1 transaction date': 'Date',
    'X2 house age': 'Age',
    'X3 distance to the nearest MRT station': 'Nearest_Station_Distance',
    'X4 number of convenience stores': 'Num_Stores',
    'X5 latitude': 'latitude',
    'X6 longitude': 'longitude',
    'Y house price of unit area': 'Price_Unit_Area',
}
data = data.rename(columns=COLUMN_RENAMES)

# Preview the first five rows to confirm the new headers.
data.head()

## Creating Features (Inputs) and Labels (Targets)

In [None]:
# Name of the column the model is trained to predict.
TARGET_COLUMN = 'Price_Unit_Area'

# Features: every column except the target. A later cell writes the model's
# output back into `data` as 'Predicted_Price_Unit_Area'; excluding that column
# here (errors='ignore' because it does not exist on a fresh run) keeps the
# predictions from leaking into the features if this cell is re-run afterwards.
X = data.drop(columns=[TARGET_COLUMN, 'Predicted_Price_Unit_Area'], errors='ignore')

# Label: the target column on its own.
Y = data[TARGET_COLUMN]

In [None]:
from sklearn.model_selection import train_test_split

Splitting features and labels into train data and test data.
1. Train data to train model
2. Test data to measure accuracy

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

## Train the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression estimator and train it on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(x_train, y_train)

# Show the fitted estimator as the cell output.
model

Calculate the model's score (R² for a regressor) on the test data

In [None]:
# Score the fitted model on the held-out test split
# (scikit-learn documents `score` for regressors as R^2).
model.score(X=x_test, y=y_test)

## Test the Model
Predict values for the held-out test data.

In [None]:
# Predict the target for every row of the held-out test split.
y_pred = model.predict(x_test)

# Put the actual values and the predictions side by side. `assign` attaches
# the prediction array positionally, in the same row order as `y_test`.
y_test_pred = y_test.to_frame().assign(Predicted_Price_Unit_Area=y_pred)

# Show up to 10 random rows. The fixed random_state makes the displayed sample
# identical on every run, and capping `n` avoids a ValueError when the test
# split holds fewer than 10 rows.
y_test_pred.sample(n=min(10, len(y_test_pred)), random_state=123)

Measure how well the model fits the test data using the R² (coefficient of determination) score

In [None]:
from sklearn.metrics import r2_score
# R^2 between the actual test labels and the model's predictions for them.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Run the fitted model over every row of the feature matrix and store the
# result alongside the original columns.
all_predictions = model.predict(X)
data['Predicted_Price_Unit_Area'] = all_predictions

# Preview the first five rows with the new prediction column.
data.head()

1. Here we implemented a simple linear regression model.
2. In the next version we will also find outliers and eliminate them; after that,
    we will implement polynomial regression for better accuracy.