Introduction

This notebook aims to introduce a standard way of:

 1. Loading the data into a Python notebook
 2. Visualizing the data and identifying issues using scatter plots between the independent variables (features) and the target

In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [None]:
# Load ../input/train.csv into the training_data DataFrame
train_csv_path = "../input/train.csv"
training_data = pd.read_csv(train_csv_path)

In [None]:
# Quick look at the data: column names, (rows, columns), the first rows,
# and summary statistics of the target column price_doc.
exploration_views = (
    training_data.columns,
    training_data.shape,
    training_data.head(),
    training_data["price_doc"].describe(),
)
for view in exploration_views:
    print(view)

In [None]:
#Comparing the size of total area in square meters with house price
fig, ax = plt.subplots()
ax.scatter(training_data["full_sq"], training_data["price_doc"])
# Label the axes so the figure can be read without the surrounding text.
ax.set(
    xlabel="full_sq (total area, square meters)",
    ylabel="price_doc (sale price)",
    title="Sale price vs. total area",
)
plt.show()

X-axis -> Total Area in square meters 
Y-axis -> Sale Price
As can be seen in the plot above, the sale price does not follow a simple linear relationship with the area of the house.
Hmm, my initial assumption of a linear relation between area and price is flawed.
Let me bring other variables into the mix and look at their pairwise relationships.

In [None]:
# Pairwise scatter plots between the sale price and a few candidate predictors.
sns.set()
cols = ["price_doc", "full_sq", "floor", "build_year", "state"]
# `height` is the current name of the per-facet size argument; the former `size`
# keyword was renamed in seaborn 0.9 and is rejected by later releases.
# Rows with a missing value in any of the selected columns are dropped first.
sns.pairplot(training_data[cols].dropna(), height=2)
plt.show()

Now, as we are aware, location also plays an important role in determining house prices. However, location (sub_area) cannot be used directly in the plots above, because it is a categorical variable. To handle this, I first need to find out the unique locations. I will do the same for the reason for purchase (product_type).

In [None]:
# Distinct values found in the product_type column
product_type_values = training_data["product_type"].unique()
print(product_type_values)

In [None]:
# Distinct sub areas in which the houses are located (duplicates removed by .unique())
sub_area_values = training_data["sub_area"].unique()
print(sub_area_values)

Now, in the training data set, there are two product types: Investment and OwnerOccupier.
Let's replace these values with 0 (Investment) and 1 (OwnerOccupier).

In [None]:
#Replacing the values for product_type to 0 and 1
# Build the encoded column and assign it back in a single step. The chained form
# frame["col"][mask] = value writes through an intermediate object and is not
# guaranteed to modify the DataFrame (it never does under pandas Copy-on-Write).
product_type_codes = {"Investment": 0, "OwnerOccupier": 1}
# Values that are not in the lookup (missing values, or codes produced by an
# earlier run of this cell) pass through unchanged, so the cell can be re-run.
training_data["product_type"] = training_data["product_type"].map(
    lambda label: product_type_codes.get(label, label)
)

In [None]:
#Comparing the Product type with house price
fig, ax = plt.subplots()
ax.scatter(training_data["product_type"], training_data["price_doc"])
# Label the axes so the figure can be read without the surrounding text.
ax.set(
    xlabel="product_type (0 = Investment, 1 = OwnerOccupier)",
    ylabel="price_doc (sale price)",
    title="Sale price by product type",
)
plt.show()

In [None]:
#Give the 146 unique sub locations a categorical value:
# The position of each name in this list is its integer code (0-145).
# Keep the order unchanged: reordering or inserting names changes the codes.
sub_area_names = [
    # 0-9
    "Bibirevo", "Nagatinskij Zaton", "Tekstil'shhiki", "Mitino", "Basmannoe",
    "Nizhegorodskoe", "Sokol'niki", "Koptevo", "Kuncevo", "Kosino-Uhtomskoe",
    # 10-19
    "Zapadnoe Degunino", "Presnenskoe", "Lefortovo", "Mar'ino", "Kuz'minki",
    "Nagornoe", "Gol'janovo", "Vnukovo", "Juzhnoe Tushino", "Severnoe Tushino",
    # 20-29
    "Chertanovo Central'noe", "Fili Davydkovo", "Otradnoe", "Novo-Peredelkino", "Bogorodskoe",
    "Jaroslavskoe", "Strogino", "Hovrino", "Moskvorech'e-Saburovo", "Staroe Krjukovo",
    # 30-39
    "Ljublino", "Caricyno", "Veshnjaki", "Danilovskoe", "Preobrazhenskoe",
    "Kon'kovo", "Brateevo", "Vostochnoe Izmajlovo", "Vyhino-Zhulebino", "Donskoe",
    # 40-49
    "Novogireevo", "Juzhnoe Butovo", "Sokol", "Kurkino", "Izmajlovo",
    "Severnoe Medvedkovo", "Rostokino", "Orehovo-Borisovo Severnoe", "Ochakovo-Matveevskoe", "Taganskoe",
    # 50-59
    "Dmitrovskoe", "Orehovo-Borisovo Juzhnoe", "Teplyj Stan", "Babushkinskoe", "Pokrovskoe Streshnevo",
    "Obruchevskoe", "Filevskij Park", "Troparevo-Nikulino", "Severnoe Butovo", "Hamovniki",
    # 60-69
    "Solncevo", "Dorogomilovo", "Timirjazevskoe", "Lianozovo", "Pechatniki",
    "Krjukovo", "Jasenevo", "Chertanovo Severnoe", "Rjazanskij", "Silino",
    # 70-79
    "Ivanovskoe", "Golovinskoe", "Novokosino", "Nagatino-Sadovniki", "Birjulevo Vostochnoe",
    "Severnoe Izmajlovo", "Sokolinaja Gora", "Vostochnoe Degunino", "Prospekt Vernadskogo", "Savelki",
    # 80-89
    "Ajeroport", "Vojkovskoe", "Beskudnikovskoe", "Krylatskoe", "Juzhnoportovoe",
    "Perovo", "Akademicheskoe", "Horoshevo-Mnevniki", "Shhukino", "Kapotnja",
    # 90-99
    "Horoshevskoe", "Marfino", "Chertanovo Juzhnoe", "Savelovskoe", "Birjulevo Zapadnoe",
    "Nekrasovka", "Cheremushki", "Sviblovo", "Alekseevskoe", "Krasnosel'skoe",
    # 100-109
    "Kotlovka", "Zjuzino", "Ostankinskoe", "Tverskoe", "Losinoostrovskoe",
    "Butyrskoe", "Matushkino", "Metrogorodok", "Juzhnoe Medvedkovo", "Lomonosovskoe",
    # 110-119
    "Jakimanka", "Mozhajskoe", "Levoberezhnoe", "Mar'ina Roshha", "Gagarinskoe",
    "Zamoskvorech'e", "Altuf'evskoe", "Ramenki", "Zjablikovo", "Meshhanskoe",
    # 120-129
    "Severnoe", "Begovoe", "Arbat", "Poselenie Sosenskoe", "Poselenie Moskovskij",
    "Poselenie Pervomajskoe", "Poselenie Desjonovskoe", "Poselenie Voskresenskoe", "Poselenie Mosrentgen", "Troickij okrug",
    # 130-139
    "Poselenie Shherbinka", "Poselenie Filimonkovskoe", "Poselenie Vnukovskoe", "Poselenie Marushkinskoe", "Poselenie Shhapovskoe",
    "Poselenie Rjazanovskoe", "Poselenie Kokoshkino", "Vostochnoe", "Poselenie Krasnopahorskoe", "Poselenie Novofedorovskoe",
    # 140-145
    "Poselenie Voronovskoe", "Poselenie Klenovskoe", "Poselenie Rogovskoe", "Poselenie Kievskij", "Molzhaninovskoe",
    "Poselenie Mihajlovo-Jarcevskoe",
]
sub_area_codes = {name: code for code, name in enumerate(sub_area_names)}

# Assign the encoded column back in one step instead of 146 chained assignments
# (frame["col"][mask] = value), which are not guaranteed to modify the DataFrame.
# Values that are not in the lookup (missing values, or codes produced by an
# earlier run of this cell) pass through unchanged, so the cell can be re-run.
training_data["sub_area"] = training_data["sub_area"].map(
    lambda name: sub_area_codes.get(name, name)
)

In [None]:
#Check the House cost per location
fig, ax = plt.subplots()
ax.scatter(training_data["sub_area"], training_data["price_doc"])
# Label the axes so the figure can be read without the surrounding text.
ax.set(
    xlabel="sub_area (location code)",
    ylabel="price_doc (sale price)",
    title="Sale price by sub area",
)
plt.show()

Let's draw a heatmap of the variables most strongly correlated with price_doc.

In [None]:
#saleprice correlation matrix
# Restrict the frame to numeric/bool columns explicitly: calling DataFrame.corr()
# on a frame that still holds text columns raises in pandas >= 2.0.
corrmat = training_data.select_dtypes(include=["number", "bool"]).corr()
k = 10 #number of variables for heatmap
# The k columns with the largest correlation to price_doc (price_doc itself included).
cols = corrmat.nlargest(k, 'price_doc')['price_doc'].index
# Reuse the coefficients already in corrmat. np.corrcoef on the raw values
# returns NaN for every column that contains a missing value, whereas
# DataFrame.corr() excludes missing values pair by pair.
cm = corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

Let me identify the independent variables (features) that have missing (null) values.

In [None]:
# Summarise missing values per column, most affected columns first.
# The null mask is computed once; "Percentage" is derived from the already
# sorted counts, so both columns share the same index order.
total = training_data.isnull().sum().sort_values(ascending = False)
# Despite the column name this is a fraction of rows in the range 0-1, not 0-100.
percent = total / len(training_data)
missing_data = pd.concat([total,percent],axis=1,keys = ["Total","Percentage"])
print(missing_data)

In [None]:
# Drop every column that has more than one missing value
# (columns with exactly one missing value are kept).
sparse_columns = missing_data[missing_data["Total"] > 1].index
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts. errors="ignore" lets the cell be re-run after the columns are gone.
training_data = training_data.drop(columns=sparse_columns, errors="ignore")
# Show the resulting size and a preview instead of printing the whole frame.
print(training_data.shape)
print(training_data.head())

In [None]:
# Correlation heatmap again, now on the frame without the sparse columns.
# Restrict the frame to numeric/bool columns explicitly: calling DataFrame.corr()
# on a frame that still holds text columns raises in pandas >= 2.0.
corrmat = training_data.select_dtypes(include=["number", "bool"]).corr()
k = 10 #number of variables for heatmap
# The k columns with the largest correlation to price_doc (price_doc itself included).
cols = corrmat.nlargest(k, 'price_doc')['price_doc'].index
# Reuse the coefficients already in corrmat. np.corrcoef on the raw values
# returns NaN for any column that still contains a missing value.
cm = corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

Based on the above, does this mean that, along with full_sq, the proximity of offices and sports centers also impacts house prices? More analysis is required.