In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import sklearn

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the monthly and yearly tables.
mth = pd.read_csv("../input/housing-in-london/housing_in_london_monthly_variables.csv")
yr = pd.read_csv("../input/housing-in-london/housing_in_london_yearly_variables.csv")

# Index both frames by the parsed `date` column and drop the plain-text copy of
# it, so `date` exists only as the DatetimeIndex.
yr = yr.set_index(pd.to_datetime(yr['date'])).drop(columns='date')
mth = mth.set_index(pd.to_datetime(mth['date'])).drop(columns='date')

# Extract London data from both dataframes. All London boroughs are marked with
# borough_flag = 1; the flag is constant after filtering, so it is dropped.
# The explicit .copy() makes each result an independent frame rather than a
# slice of `yr` / `mth`: later cells assign new columns into these frames, which
# is the pattern pandas flags with SettingWithCopyWarning when done on a slice.
ldn_yr = yr[yr['borough_flag'] == 1].drop(columns='borough_flag').copy()
ldn_mth = mth[mth['borough_flag'] == 1].drop(columns='borough_flag').copy()

In [None]:
# Preview the first rows of the monthly London-borough frame.

ldn_mth.head()

In [None]:
# Preview the first rows of the yearly London-borough frame.
ldn_yr.head()

**Checking for null values.** We first count how many null values each column of the two datasets contains.


In [None]:
# Per-column count of missing values: yearly frame first, then monthly frame,
# separated by blank lines.
yearly_nulls = ldn_yr.isna().sum()
monthly_nulls = ldn_mth.isna().sum()

print(yearly_nulls)
print('\n')
print(monthly_nulls)

Next we group the data by area and replace each missing value with the mean of that column for the same area. This removes the null values from the dataset, which the model we fit later cannot handle.

In [None]:
# Show each column's dtype and non-null count for the yearly frame.
ldn_yr.info()

We can see that there are object columns in our dataset, which we need to convert into float/int values.

In [None]:
# recycling_pct is stored as text: parse it as numbers, turning anything that
# cannot be parsed into NaN.
ldn_yr['recycling_pct'] = pd.to_numeric(ldn_yr['recycling_pct'], errors='coerce')

# `code` holds identifiers such as 'E09000001'. Removing every 'E' leaves only
# digits, which lets the column be stored as floats instead of objects.
ldn_yr['code'] = ldn_yr['code'].map(lambda raw_code: float(raw_code.replace('E', '')))

# mean_salary holds numbers stored as strings plus some '#' placeholders.
# errors='coerce' turns the placeholders into NaN; the NaNs are imputed in a
# later cell.
ldn_yr['mean_salary'] = pd.to_numeric(ldn_yr['mean_salary'], errors='coerce')

In [None]:
# First pass: within each area, replace missing life_satisfaction values with
# that area's own mean.
filled_by_area = ldn_yr.groupby('area')['life_satisfaction'].transform(
    lambda scores: scores.fillna(scores.mean())
)
ldn_yr.loc[:, 'life_satisfaction'] = filled_by_area

# Second pass: an area with no life_satisfaction data at all (the original
# author notes City of London is one) is still NaN after the first pass, so
# fall back to the overall mean.
overall_mean = ldn_yr.loc[:, 'life_satisfaction'].mean()
ldn_yr.loc[:, 'life_satisfaction'] = ldn_yr.loc[:, 'life_satisfaction'].fillna(overall_mean)


In [None]:
# Apply the same per-area mean imputation to the remaining numeric columns.
area_mean_columns = [
    'area_size',
    'number_of_jobs',
    'no_of_houses',
    'population_size',
    'median_salary',
    'recycling_pct',
]
for column in area_mean_columns:
    ldn_yr[column] = ldn_yr.groupby('area')[column].transform(
        lambda values: values.fillna(values.mean())
    )

# mean_salary is the exception: it is filled with the overall column mean, not
# a per-area mean.
overall_mean_salary = ldn_yr['mean_salary'].mean()
ldn_yr['mean_salary'] = ldn_yr['mean_salary'].fillna(overall_mean_salary)

ldn_yr.info()

In [None]:
# Encode every area name as a number: factorize hands out 0, 1, 2, ... in order
# of first appearance, and the codes are stored as floats.
area_codes, _area_names = pd.factorize(ldn_yr.area)
ldn_yr.loc[:, 'area'] = area_codes.astype(float)

In [None]:
# Re-check dtypes and non-null counts after the conversions and imputation.
ldn_yr.info()

## Incorporating monthly data

In [None]:
# Fold the monthly table into yearly resolution by averaging each borough's
# monthly figures over every calendar year.
ldn_mth = mth[mth['borough_flag'] == 1].drop(columns='borough_flag')

# Only numeric measurements can be averaged, so drop every non-numeric column
# except the grouping key `area` before resampling. Older pandas discarded such
# columns silently; newer pandas (2.0+) raises a TypeError on them instead.
text_cols = ldn_mth.select_dtypes(exclude='number').columns.difference(['area'])

# NOTE(review): [4:-1] is a positional slice over the whole stacked result, so
# it removes only the first four rows and the very last row -- not rows per
# area. Kept as in the original; confirm what it was meant to trim.
cols = ldn_mth.drop(columns=text_cols).groupby('area').resample('Y').mean()[4:-1]

cols = cols.reset_index()

# Encode `area` with exactly the integers the yearly frame received.
# pd.factorize numbers values by order of first appearance, so running it a
# second time on this frame (whose rows groupby has sorted by area) matches the
# yearly encoding only if the yearly rows happen to list the areas in that same
# order. Instead, rebuild the yearly name -> code mapping from the untouched
# `yr` frame (same rows, same order as the frame that was factorized earlier)
# and apply it here. Areas absent from the yearly data become NaN.
yearly_areas = yr.loc[yr['borough_flag'] == 1, 'area']
area_names = pd.factorize(yearly_areas)[1]
area_codes = {name: float(code) for code, name in enumerate(area_names)}
cols.loc[:, 'area'] = cols['area'].map(area_codes)

In [None]:
import datetime

# Key the yearly frame by (date, area) so it can be merged on those two levels.
ldn_yr = ldn_yr.reset_index().set_index(['date', 'area'])

# Move every aggregated date back by 30 days before using it as a key.
# NOTE(review): presumably this aligns the year-end labels produced by
# resample('Y') with the dates used in the yearly table -- confirm.
cols['date'] = cols['date'] - datetime.timedelta(days=30)

# Same (date, area) key on the monthly aggregates, sorted for a tidy index.
cols = cols.set_index(['date', 'area']).sort_index()

Now merge the dataframes.

In [None]:
# Left join: keep every yearly row and attach the monthly aggregates that share
# its (date, area) key.
merge_keys = ['date', 'area']
ldn_yr_plus = ldn_yr.merge(cols, how='left', on=merge_keys)

In [None]:
# Impute missing no_of_crimes values with the mean of the same area.
# The column must be selected *before* transform: without it, transform returns
# a whole DataFrame (one filled column per input column), which cannot be
# assigned to the single `no_of_crimes` column.
ldn_yr_plus['no_of_crimes'] = (
    ldn_yr_plus.groupby('area')['no_of_crimes']
    .transform(lambda crimes: crimes.fillna(crimes.mean()))
)

In [None]:
# Show dtypes and non-null counts of the merged frame after imputation.
ldn_yr_plus.info()

## House price prediction

We collect the feature columns into `X` and the target column (`average_price`) into `y`.

In [None]:
# Feature columns fed to the model; `area` lives in the index, so the index is
# reset before selecting.
feature_columns = [
    'area',
    'median_salary',
    'life_satisfaction',
    'population_size',
    'mean_salary',
    'number_of_jobs',
    'no_of_houses',
    'area_size',
    'no_of_crimes',
    'houses_sold',
]
X = ldn_yr_plus.reset_index()[feature_columns]

# Target: the average house price, kept as a one-column frame.
y = ldn_yr_plus[['average_price']]


Splitting the dataset into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.15, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor: a node is split only if it holds at least 3 samples;
# the fixed seed makes the fitted tree reproducible.
model = DecisionTreeRegressor(min_samples_split=3, random_state=0)
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out rows and truncate them to whole numbers.
raw_prediction = model.predict(X_test)
prediction = raw_prediction.astype(int)
print("predictions:", prediction)

In [None]:
from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred) in that order. R^2 is not symmetric in its
# arguments, so the ground truth must come first for the score to measure how
# well the predictions explain the observed prices.
r2_score(y_test, prediction)
