In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw listings, then keep a handle on every categorical column exactly
# as it appears in the file, before any cleaning or encoding touches `data`.
data = pd.read_csv("hyd_v2.csv")
(
    locality,
    facing,
    furnishingDesc,
    gym,
    isMaintenance,
    lift,
    loanAvailable,
    parking,
    reactivationSource,
    sharedAccomodation,
    swimmingPool,
    waterSupply,
) = (
    data[column_name]
    for column_name in [
        'locality',
        'facing',
        'furnishingDesc',
        'gym',
        'isMaintenance',
        'lift',
        'loanAvailable',
        'parking',
        'reactivationSource',
        'sharedAccomodation',
        'swimmingPool',
        'waterSupply',
    ]
)

In [None]:
# Show the (rows, columns) size of the freshly loaded frame.
data.shape

In [None]:
# Preview the first rows to decide which columns are worth keeping.
data.head()
# id, localityId, ownerName, propertyType, shortUrl, weight, completeStreetName, facingDesc and parkingDesc can be dropped
# because they are not needed or are already covered by other columns.


In [None]:
# Remove identifier, free-text and duplicated-information columns.
unused_columns = [
    'id',
    'localityId',
    'ownerName',
    'propertyType',
    'shortUrl',
    'weight',
    'completeStreetName',
    'active',
    'facingDesc',
    'parkingDesc',
    'combineDescription',
]
data = data.drop(columns=unused_columns)


In [None]:
# Confirm the column count went down after the drop.
data.shape


In [None]:
# List the columns that remain.
data.columns

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Row count before rows with missing values are removed.
data.shape

In [None]:
# Discard every row that has at least one missing value (dropna's defaults: rows, how='any').
data = data.dropna()

In [None]:
# Re-count missing values per column after the dropna above.
data.isnull().sum()
# Rows with nulls were dropped because there are very few of them, so removing them makes little difference.

In [None]:
# Drop the two remaining free-text columns; rebinding keeps the cell free of in-place mutation.
data = data.drop(columns=['amenities', 'propertyTitle'])

In [None]:
# Inspect the first remaining row, one field per line.
data.iloc[0]

In [None]:
# Number of listings per locality, most frequent first.
data['locality'].value_counts()

In [None]:
# Fold every locality that occurs 10 times or fewer into a single 'other'
# bucket so the encoder has fewer distinct categories to deal with.
locations = data['locality'].value_counts()
less_than_10 = locations[locations <= 10]
is_rare = data['locality'].isin(less_than_10.index)
data['locality'] = data['locality'].where(~is_rare, 'other')


In [None]:
# Listings per locality after rare ones were folded into 'other'.
data['locality'].value_counts()

In [None]:
# Preview the first rows of the cleaned frame.
data.head()

In [None]:
# Inspect the first row again, one field per line.
data.iloc[0]

In [None]:
def _bhk_digit(label):
    """Return the 4th character of a 4-character type_bhk label, else None."""
    if len(label) == 4:
        return label[3]
    return None


# Reduce each type_bhk label to its trailing character; labels of any other length become None.
data['type_bhk'] = data['type_bhk'].apply(_bhk_digit)

In [None]:
# Distribution of the parsed type_bhk values (nulls are excluded by value_counts).
data['type_bhk'].value_counts()

In [None]:
# Count the nulls introduced by the type_bhk parsing above.
data.isnull().sum()

In [None]:
# Drop the rows whose type_bhk could not be parsed (now null); there are only a few of them.
data = data.dropna()

In [None]:
# Verify no missing values remain.
data.isnull().sum()

In [None]:
# Inspect the first row before encoding the categorical columns.
data.iloc[0]

In [None]:
# Single LabelEncoder instance, refit on each categorical column in turn below.
labelencoder = LabelEncoder()

In [None]:
# Locality distribution just before it is label-encoded.
data['locality'].value_counts()

In [None]:
def _encode(column_name):
    """Refit the shared encoder on one column of `data` and return its integer codes."""
    return labelencoder.fit_transform(data[column_name])


# Replace each categorical column with integer codes and keep the code array
# under a `<column>_label` name. The encoder is refit for every column, so after
# this cell `labelencoder` only remembers the classes of the last one.
locality_label = data['locality'] = _encode('locality')
facing_label = data['facing'] = _encode('facing')
furnishingDesc_label = data['furnishingDesc'] = _encode('furnishingDesc')
gym_label = data['gym'] = _encode('gym')
isMaintenance_label = data['isMaintenance'] = _encode('isMaintenance')
lift_label = data['lift'] = _encode('lift')
loanAvailable_label = data['loanAvailable'] = _encode('loanAvailable')
parking_label = data['parking'] = _encode('parking')
reactivationSource_label = data['reactivationSource'] = _encode('reactivationSource')
sharedAccomodation_label = data['sharedAccomodation'] = _encode('sharedAccomodation')
swimmingPool_label = data['swimmingPool'] = _encode('swimmingPool')
waterSupply_label = data['waterSupply'] = _encode('waterSupply')

In [None]:
def _label_mapping(raw_values, encoded_labels):
    """Return {raw category value: integer code} for the rows still present in `data`.

    `raw_values` is a column captured right after the CSV was read, so it still
    has one entry per original row. `encoded_labels` has one entry per row that
    survived the dropna steps. Zipping the two directly would pair values from
    different rows, so the raw values are first aligned to `data.index`
    (row labels are never reset during cleaning).
    """
    return dict(zip(raw_values.loc[data.index], encoded_labels))


# Lookup tables from the original category strings to their encoded integers.
# For locality, names that were folded into 'other' all map to the code of 'other'.
locality_dict = _label_mapping(locality, locality_label)
facing_dict = _label_mapping(facing, facing_label)
furnishingDesc_dict = _label_mapping(furnishingDesc, furnishingDesc_label)
gym_dict = _label_mapping(gym, gym_label)
isMaintenance_dict = _label_mapping(isMaintenance, isMaintenance_label)
lift_dict = _label_mapping(lift, lift_label)
loanAvailable_dict = _label_mapping(loanAvailable, loanAvailable_label)
parking_dict = _label_mapping(parking, parking_label)
reactivationSource_dict = _label_mapping(reactivationSource, reactivationSource_label)
sharedAccomodation_dict = _label_mapping(sharedAccomodation, sharedAccomodation_label)
swimmingPool_dict = _label_mapping(swimmingPool, swimmingPool_label)
waterSupply_dict = _label_mapping(waterSupply, waterSupply_label)

In [None]:
# Locality distribution after encoding (index values are now integer codes).
data['locality'].value_counts()

In [None]:
# Same encoded-locality distribution as the previous cell.
data['locality'].value_counts()

In [None]:
# Inspect the first row after encoding.
data.iloc[0]

In [None]:
# Split the "x,y" location string into two columns.
# n=1 performs a single split, so the result always has at most two columns and
# matches the two target columns (n=2 would allow a third part and make the
# assignment fail on any value containing a second comma).
data[['location-x', 'location-y']] = data['location'].str.split(',', n=1, expand=True)


In [None]:
# Inspect the first remaining row. Positional access is used because rows were
# dropped earlier, so a row labelled 0 is not guaranteed to exist any more.
data.iloc[0]

In [None]:
# The combined location string is redundant now that it has been split into location-x / location-y.
del data['location']

In [None]:
# Print the Python type of each field in the first row to spot columns that still need conversion.
for value in data.iloc[0]:
    print(type(value))

In [None]:
# Coerce both columns to numbers, substituting 0 only where a value is not numeric.
# An exact `type(x) == int` test keeps Python ints only, so floats and numeric
# strings would all be overwritten with 0; pd.to_numeric keeps them.
for _column in ('balconies', 'maintenanceAmount'):
    data[_column] = pd.to_numeric(data[_column], errors='coerce').fillna(0)

In [None]:
# Cast the columns that should be numeric to float in one pass.
data = data.astype({
    'balconies': float,
    'location-x': float,
    'location-y': float,
    'type_bhk': float,
    'maintenanceAmount': float,
})

OUTLIERS

In [None]:
# Summary statistics for every numeric column, used to look for outliers.
data.describe()

In [None]:
# Distribution of property_size divided by type_bhk.
(data['property_size']/data['type_bhk']).describe()
# There are outliers.
# Let's take 250 as the minimum.

In [None]:
# Keep only listings whose property_size / type_bhk ratio is at least 250.
data = data.loc[data['property_size'].div(data['type_bhk']).ge(250)]

In [None]:
# Re-check the property_size / type_bhk distribution after the filter.
(data['property_size']/data['type_bhk']).describe()

In [None]:
# Keep only listings with property_size strictly below 5000.
data = data.loc[data['property_size'].lt(5000)]

In [None]:
# Summary statistics for the bathroom column.
data['bathroom'].describe()

In [None]:
# Summary statistics for the target column, rent_amount.
data['rent_amount'].describe()

In [None]:
# Distribution of rent_amount divided by type_bhk.
(data['rent_amount']/data['type_bhk']).describe()
# Rent cannot fall below a certain amount, so very low values are outliers.
# Let's take 2500 as the minimum and 10000 as the maximum (the bounds the filters below apply).

In [None]:
# Row count before the rent-per-BHK filters.
data.shape

In [None]:
# Keep only listings whose rent_amount / type_bhk ratio is at least 2500.
data = data.loc[data['rent_amount'].div(data['type_bhk']).ge(2500)]

In [None]:
# Row count after the lower rent-per-BHK bound.
data.shape

In [None]:
# Keep only listings whose rent_amount / type_bhk ratio is at most 10000.
data = data.loc[data['rent_amount'].div(data['type_bhk']).le(10000)]

In [None]:
# Standard deviation of every column after the rent filters.
data.std()

In [None]:
# Summary statistics for the deposit column.
data['deposit'].describe()

In [None]:
# Row count before the deposit filters.
data.shape

In [None]:
# Distribution of deposit divided by type_bhk.
(data['deposit']/data['type_bhk']).describe()
# Deposits above 50000 per bedroom are unreasonable and are treated as outliers;
# the same goes for deposits below 2500.

In [None]:
# Keep only listings whose deposit / type_bhk ratio is at most 50000.
data = data.loc[data['deposit'].div(data['type_bhk']).le(50000)]

In [None]:
# Row count after the upper deposit-per-BHK bound.
data.shape

In [None]:
# Keep only listings whose deposit / type_bhk ratio is at least 2500.
data = data.loc[data['deposit'].div(data['type_bhk']).ge(2500)]

In [None]:
# Row count after the lower deposit-per-BHK bound.
data.shape

In [None]:
# Preview the first rows after the ratio-based filters.
data.head()

In [None]:
# Summary statistics for bathroom after filtering.
data['bathroom'].describe()

In [None]:
# Scatter rent_amount (y) against deposit (x) to spot outliers visually.
x, y = data['deposit'], data['rent_amount']
fig2 = plt.figure()
ax = fig2.add_axes([0, 0, 1, 1])  # axes span the entire figure canvas
ax.scatter(x, y, color="g")
plt.show()

In [None]:
# Keep only listings with a deposit strictly below 200000.
data = data.loc[data['deposit'].lt(200000)]

In [None]:
# Scatter rent_amount (y) against type_bhk (x); the author judged there are no outliers here.
x, y = data['type_bhk'], data['rent_amount']
fig2 = plt.figure()
ax = fig2.add_axes([0, 0, 1, 1])  # axes span the entire figure canvas
ax.scatter(x, y, color="g")
plt.show()

In [None]:
# Scatter rent_amount against itself.
# NOTE(review): both axes use the same column, so this only draws a diagonal;
# one of them was presumably meant to be a different feature - confirm.
x = y = data['rent_amount']
fig2 = plt.figure()
ax = fig2.add_axes([0, 0, 1, 1])  # axes span the entire figure canvas
ax.scatter(x, y, color="g")
plt.show()

In [None]:
# Scatter deposit (y) against type_bhk (x).
x, y = data['type_bhk'], data['deposit']
fig2 = plt.figure()
ax = fig2.add_axes([0, 0, 1, 1])  # axes span the entire figure canvas
ax.scatter(x, y, color="g")
plt.show()

In [None]:
# Scatter property_size (y) against rent_amount (x).
x, y = data['rent_amount'], data['property_size']
fig2 = plt.figure()
ax = fig2.add_axes([0, 0, 1, 1])  # axes span the entire figure canvas
ax.scatter(x, y, color="g")
plt.show()

In [None]:
# Fit a StandardScaler on the whole frame and display the standardised array.
# NOTE(review): the transformed array is only displayed, never assigned, so
# `data` (and therefore the model inputs below) remains unscaled.
scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)

In [None]:
# Standard deviation of every column in `data`.
data.std()

In [None]:
# Separate the target (rent_amount) from the feature matrix (every other column).
target_column = 'rent_amount'
x = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Ordinary least-squares linear regression with default settings.
model = LinearRegression()

In [None]:
# Fit the regression on the training split.
model.fit(x_train,y_train)

In [None]:
# Print the fitted coefficients.
print(model.coef_)
# coef_ holds one learned weight per input feature column, in the column order of x_train.

In [None]:
# Predict rent for the held-out rows.
y_pred = model.predict(x_test)


In [None]:
# Print the predicted rents for the test split.
print(y_pred)

In [None]:
# Print the actual rents for the test split, for comparison with the predictions.
print(y_test)

In [None]:
# Mean squared error of the predictions on the held-out rows.
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
# 4996735 (presumably the value printed by an earlier run)

In [None]:
# Show the first test row; its printed fields list the feature names the model expects.
x_test.iloc[0]

In [None]:
# Predict the rent for one hand-entered listing.
# Categorical fields (facing, furnishingDesc, locality, parking, ...) carry the
# integer codes produced by the label-encoding step, not the raw strings.
# The values live in a dict instead of module-level variables so that the raw
# Series captured at load time (locality, facing, gym, ...) are not overwritten.
sample_listing = {
    'balconies': 0.0,
    'bathroom': 2.0,
    'deposit': 20000.0,
    'facing': 5.0,
    'floor': 1.0,
    'furnishingDesc': 2.0,
    'gym': 0.0,
    'isMaintenance': 1.0,
    'lift': 0.0,
    'loanAvailable': 0.0,
    'locality': 114.0,
    'maintenanceAmount': 0.0,
    'parking': 3.0,
    'property_age': 5.0,
    'property_size': 1000.0,
    'reactivationSource': 5.0,
    'sharedAccomodation': 0.0,
    'swimmingPool': 0.0,
    'totalFloor': 2.0,
    'type_bhk': 2.0,
    'waterSupply': 3.0,
    'location-x': 17.446338,
    'location-y': 78.535506,
}
# Build a one-row frame and select its columns in exactly the order used for
# training. This removes the reliance on a hand-maintained positional order and
# raises a KeyError naming the column if a feature is missing from the sample.
input_data = pd.DataFrame([sample_listing])[list(x_train.columns)]
sol = model.predict(input_data)
print(sol)