[Kaggle Website](https://www.kaggle.com/anmolkumar/house-price-prediction-challenge/tasks?taskId=2304)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
# Training data; it includes the TARGET(PRICE_IN_LACS) column that is used as the label later on.
data = pd.read_csv("../input/house-price-prediction-challenge/train.csv")
# Test features; the non-numeric and coordinate columns are dropped further down before prediction.
x_test = pd.read_csv("../input/house-price-prediction-challenge/test.csv")
# NOTE(review): this is the sample-submission file. It is later passed to
# mean_squared_error as the reference values -- confirm it really holds true prices.
y_test = pd.read_csv("../input/house-price-prediction-challenge/sample_submission.csv")
# Display the loaded training frame.
data

# Exploring the Data

In [1]:
# Summary of the training frame: column names, dtypes, non-null counts and memory usage.
data.info()

In [1]:
# Frequency of each distinct value in the BHK_OR_RK column.
data["BHK_OR_RK"].value_counts()

In [1]:
# Frequency of each distinct raw ADDRESS string (before the column is reduced to its last component).
data["ADDRESS"].value_counts()

# Cleaning the Data

In [1]:
# Reduce each address to the text after its last comma (the trailing component),
# trimmed of surrounding whitespace so "X, Pune" and "X,Pune" give the same label.
# The vectorised .str accessor propagates missing values instead of raising a
# TypeError the way indexing a NaN inside a Python lambda does. Re-running this
# cell is harmless: a value without commas is returned unchanged.
data['ADDRESS'] = data['ADDRESS'].str.rsplit(',', n=1).str[-1].str.strip()
data.head()

In [1]:
# City names grouped by tier. Entries are stored without surrounding whitespace
# and without duplicates so that exact-match lookups behave predictably.
tier_1 = ["Ahmedabad", "Bengaluru", "Chennai", "Delhi", "Hyderabad", "Kolkata", "Mumbai", "Pune"]
tier_2 = [
    "Agra", "Ajmer", "Aligarh", "Amravati", "Amritsar", "Asansol", "Aurangabad",
    "Bareilly", "Belgaum", "Bhavnagar", "Bhiwandi", "Bhopal", "Bhubaneswar",
    "Bikaner", "Bilaspur", "Bokaro Steel City", "Chandigarh", "Coimbatore",
    "Cuttack", "Dehradun", "Dhanbad", "Bhilai", "Durgapur", "Erode", "Faridabad",
    "Firozabad", "Ghaziabad", "Gorakhpur", "Gulbarga", "Guntur", "Gwalior",
    "Gurugram", "Guwahati", "Hamirpur", "Hubli–Dharwad", "Indore", "Jabalpur",
    "Jaipur", "Jalandhar", "Jalgaon", "Jammu", "Jamnagar", "Jamshedpur", "Jhansi",
    "Jodhpur", "Navi Mumbai", "Kakinada", "Kannur", "Kanpur", "Karnal", "Kochi",
    "Kolhapur", "Kollam", "Kozhikode", "Kurnool", "Ludhiana", "Lucknow", "Madurai",
    "Malappuram", "Mathura", "Mangalore", "Meerut", "Moradabad", "Mysore",
    "Nagpur", "Nanded", "Nashik", "Nellore", "Noida", "Patna", "Puducherry",
    "Purulia", "Prayagraj", "Raipur", "Rajkot", "Rajamahendravaram", "Ranchi",
    "Rourkela", "Ratlam", "Salem", "Sangli", "Shimla", "Siliguri", "Solapur",
    "Srinagar", "Surat", "Thanjavur", "Thiruvananthapuram", "Thrissur",
    "Tiruchirappalli", "Tirunelveli", "Tiruvannamalai", "Ujjain", "Vijayapura",
    "Vadodara", "Varanasi", "Vasai-Virar City", "Vijayawada", "Visakhapatnam",
    "Vellore", "Warangal",
]

# Set views of the lists above: constant-time membership tests for the per-row lookup.
_TIER_1_NAMES = frozenset(tier_1)
_TIER_2_NAMES = frozenset(tier_2)


def mapping_city(city):
    """Return the tier code for a city name.

    Parameters
    ----------
    city : str or any
        City label. Leading and trailing whitespace is ignored. Anything that
        is not a string (for example a missing value) is treated as unknown.

    Returns
    -------
    int
        0 if the name is listed in ``tier_1``, 1 if it is listed in ``tier_2``,
        and 2 otherwise.
    """
    # Missing or non-text values cannot match any listed name.
    if not isinstance(city, str):
        return 2

    name = city.strip()
    if name in _TIER_1_NAMES:
        return 0
    if name in _TIER_2_NAMES:
        return 1
    return 2

# Tag every row with the tier code that mapping_city assigns to its ADDRESS value.
data['City_Tier'] = data['ADDRESS'].map(mapping_city)
data

# Analysing the Data

In [1]:
# Scatter of the two coordinate columns, with each point coloured by its price.
data.plot.scatter(
    x='LONGITUDE',
    y='LATITUDE',
    c="TARGET(PRICE_IN_LACS)",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
)

In [1]:
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

In [1]:
# Base map. It is bound to `listing_map` rather than `map` so the builtin
# map() stays usable in every later cell.
listing_map = folium.Map(location=[22.00, 78.00], tiles='cartodbpositron', zoom_start=6)

# One circle per listing. Iterating the two columns together avoids building a
# whole row Series twice per iteration, which is what data.iloc[i][...] does.
# NOTE(review): folium locations are [latitude, longitude]; the values are passed
# in LONGITUDE, LATITUDE column order exactly as before -- presumably the
# dataset's two column labels are swapped; verify against the rendered map.
for first_coord, second_coord in zip(data['LONGITUDE'], data['LATITUDE']):
    Circle(
        location=[first_coord, second_coord],
        radius=100,
        color='blue',
    ).add_to(listing_map)

# Display the map
listing_map

# Finding Relationships Between Attributes in the Data

In [1]:
# Pairwise correlations between the numeric columns only. The frame still holds
# text columns at this point, and DataFrame.corr() raises on those in pandas >= 2.0.
# select_dtypes keeps the result the same across pandas versions.
corr_data = data.select_dtypes(include="number").corr()
# Correlation of every numeric column with the price, strongest positive first.
corr_data["TARGET(PRICE_IN_LACS)"].sort_values(ascending=False)

In [1]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots for the price and the two size-related columns.
attributes = [
    "TARGET(PRICE_IN_LACS)",
    "SQUARE_FT",
    "BHK_NO.",
]
scatter_matrix(data.loc[:, attributes], figsize=(15, 8))

# Columns that contribute most to high house prices

### 1) Area
### 2) BHK
### 3) UNDER_CONSTRUCTION
### 4) RERA

# Preparing and Training the Model

In [1]:
# Feature matrix: everything except the text columns, the coordinates, the
# derived City_Tier column and the label itself.
x_train = data.drop(
    columns=[
        'POSTED_BY',
        'BHK_OR_RK',
        'ADDRESS',
        'LATITUDE',
        'LONGITUDE',
        'TARGET(PRICE_IN_LACS)',
        "City_Tier",
    ]
)
# Label vector: the listed price.
y_train = data['TARGET(PRICE_IN_LACS)']

In [1]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings; it is fitted in the next cell.
lm = LinearRegression()

In [1]:
# Fit the linear model on the training features and prices.
lm.fit(x_train, y_train)

In [1]:
# Intercept term of the fitted linear model.
print(lm.intercept_)

In [1]:
# One fitted coefficient per feature, labelled with the feature's column name.
# (The former bare `lm.coef_` line was never displayed: only the last expression
# of a cell is rendered.)
pd.DataFrame(lm.coef_, index=x_train.columns, columns=['Coeff'])

In [1]:
# Keep exactly the columns the model was fitted on, in the same order.
# Unlike dropping a fixed list of names from x_test itself, this selection is
# idempotent: the cell can be re-run without a KeyError for already-removed
# columns, and it still fails loudly if a training feature is missing.
x_test = x_test[x_train.columns]

In [1]:
# Predicted prices for the test features from the fitted linear model.
predictions = lm.predict(x_test)

In [1]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the reference values in y_test and the predictions.
# NOTE(review): y_test was read from sample_submission.csv; if that file holds
# placeholder values rather than true prices, this number is not a real test score -- confirm.
print(mean_squared_error(y_test,predictions))


In [1]:
# Write the predictions using the sample submission's own column header and no
# index column, so the file has the same single-column layout as the sample.
# NOTE(review): assumes y_test (the sample submission) has exactly one column,
# as the mean_squared_error call against 1-D predictions implies -- confirm.
pd.DataFrame(predictions, columns=y_test.columns).to_csv('submission.csv', index=False)