In [None]:
# Make the notebook wider by injecting a CSS rule for the page container.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data files are expected in the current working directory (the loop below walks './').
# Running this cell (by clicking run or pressing Shift+Enter) lists every file found there.

import os
# Walk the working directory tree and print the path of every file found,
# so the presence of the dataset can be checked before it is loaded.
for folder, _subdirs, names in os.walk('./'):
    for name in names:
        print(os.path.join(folder, name))

# Load the dataset from the working directory.
df = pd.read_csv("immo_data.csv")

print("df shape:", df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df.info()
display(df.head())

./.gitignore
./README.md
./immo_data.csv.zip
./immo_data.csv
./data_analysis.ipynb
./.ipynb_checkpoints/data_analysis-checkpoint.ipynb


In [None]:

import matplotlib.pyplot as plt

# pd.set_option('max_columns', None)

def plot_barh(dict, title=""):
    """Draw a horizontal bar chart from a mapping and label each bar with its value.

    Args:
        dict: Mapping whose keys become the bar labels and whose values become
            the bar lengths. (The name shadows the builtin; it is kept because
            it is part of the function's signature.)
        title: Text shown above the chart.

    The figure is 10 wide and its height is one third of the number of
    entries. The chart is shown with ``plt.show()``; nothing is returned.
    """
    labels = list(dict.keys())
    values = list(dict.values())

    plt.figure(figsize=(10, len(labels) / 3))
    axes = plt.gca()
    # Hide the right and top borders of the plot area.
    for side in ('right', 'top'):
        axes.spines[side].set_color('none')
    axes.set_title(title)

    bars = axes.barh(labels, values, align='center', alpha=0.5)
    for bar in bars:
        bar_length = bar.get_width()
        # Anchor the label at the end of the bar (at the bar's lower edge) and
        # nudge it 3 points right / 6 points up.
        axes.annotate(
            f'{bar_length}',
            xy=(bar_length, bar.get_y()),
            xytext=(3, 6),
            textcoords="offset points",
            ha='left',
            va='center',
        )
    plt.show()

In [None]:
# Count the rows per 'regio1' value, order the counts ascending, and chart them.
region_sizes = df.groupby(['regio1']).size()
record_count_by_region = region_sizes.sort_values(ascending=True).to_dict()
plot_barh(record_count_by_region, title="Records count by region")


In [None]:

# Keep only the rows whose 'regio1' is 'Berlin', then count the rows per
# 'regio3' value (ascending) and chart the counts.
is_berlin = df['regio1'] == 'Berlin'
berlinDf = df[is_berlin]
neighbourhood_sizes = berlinDf.groupby(['regio3']).size()
records_by_neighbourhood_in_berlin = neighbourhood_sizes.sort_values(ascending=True).to_dict()
plot_barh(records_by_neighbourhood_in_berlin, title="Records by neighbourhood in Berlin")


In [None]:
# Columns carried forward from the Berlin subset for further inspection.
useful_columns = [
    'regio1',
    'regio3',
    'heatingType',
    'newlyConst',
    'balcony',
    'picturecount',
    'totalRent',
    'yearConstructed',
    'noParkSpaces',
    'firingTypes',
    'hasKitchen',
    'geo_bln',
    'cellar',
    'baseRent',
    'livingSpace',
    'condition',
    'interiorQual',
    'petsAllowed',
    'lift',
    'typeOfFlat',
    'noRooms',
    'floor',
    'numberOfFloors',
    'garden',
    'heatingCosts',
    'energyEfficiencyClass',
    'lastRefurbish',
]
berlinDf_select = berlinDf[useful_columns]

# Show how many missing values each selected column has.
print("NA records by feature:")
berlinDf_select.isna().sum()


In [None]:

# Columns removed because they have too many NA values and the features are not
# that important. (The original note named only the first eight; 'petsAllowed'
# and 'energyEfficiencyClass' are dropped in the same step.)
sparse_columns = [
    'yearConstructed', 'noParkSpaces', 'floor', 'numberOfFloors', 'typeOfFlat',
    'heatingCosts', 'lastRefurbish', 'interiorQual', 'petsAllowed', 'energyEfficiencyClass',
]

# Columns that are not values we want to predict on.
excluded_columns = ['regio1', 'totalRent', 'picturecount', 'condition']

# Rebuild the selection from berlinDf instead of dropping from berlinDf_select
# itself, so re-running this cell does not raise KeyError on columns that were
# already removed by a previous run.
berlinDf_select = berlinDf[useful_columns].drop(columns=sparse_columns + excluded_columns)

# We keep ['heatingType', 'firingTypes'] because they still look relevant. Their
# NA values will be filled with NO_INFORMATION and we will experiment with the model.
berlinDf_select.isna().sum()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

# Correlate only the numeric and boolean columns. berlinDf_select still holds
# text columns, and DataFrame.corr() no longer skips those silently on recent
# pandas versions (it raises instead), so they are filtered out explicitly.
correlation = berlinDf_select.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(18, 18))
sn.heatmap(correlation, annot=True, cbar=False, linewidths=.5, cmap="YlGnBu", fmt='.2f', annot_kws={'size': 15})
# Show ticks and tick labels on all four sides of the heatmap.
plt.tick_params(axis='both', which='major', labelsize=23, labelbottom=True, bottom=True, top=True, labeltop=True, right=True, labelright=True)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

plt.show()

# Checking the correlation of baseRent with possible features:
# The data shows that livingSpace and noRooms have the highest correlation, ~0.8.
# Construction year is important too, ~0.5.
# Surprisingly, heatingCosts also seems to be directly correlated with baseRent. I was expecting an inverse correlation.
#
# NOTE(review): yearConstructed and heatingCosts are dropped from berlinDf_select in the
# preceding cell, so they do not appear in this heatmap; the two observations above
# presumably come from a run made before those columns were removed. Please confirm.

# Result: We will experiment with the following columns for training: livingSpace, noRooms, heatingCosts, hasKitchen, cellar, garden, balcony
#         Plus categorical columns: heatingType, firingTypes, condition, interiorQual, petsAllowed, energyEfficiencyClass


In [None]:
import matplotlib.pyplot as plt


def plot_ticks(x, y, x_label, y_label):
    """Plot ``y`` against ``x`` as unconnected circle markers on a 10x5 figure.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        x_label: Caption of the horizontal axis.
        y_label: Caption of the vertical axis.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10, 5))
    axes = plt.gca()
    axes.plot(x, y, 'o')
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    plt.show()

# One scatter plot per candidate feature, with baseRent on the horizontal axis
# and the feature on the vertical axis.
candidate_features = ("livingSpace", "noRooms", "hasKitchen", "cellar", "garden", "balcony")
for feature in candidate_features:
    plot_ticks(berlinDf_select["baseRent"], berlinDf_select[feature], 'baseRent', feature)
  

In [None]:
# For each categorical heating column: replace missing values with the
# "NO_INFORMATION" label, then chart the mean baseRent (rounded to a whole
# number) of every class.
for c in ["heatingType", "firingTypes"]:
    berlinDf_select[c] = berlinDf_select[c].fillna("NO_INFORMATION")
    # One groupby pass replaces a separate boolean-mask scan per class.
    # sort=False keeps the classes in order of first appearance, which is the
    # order unique() produced in the previous per-class loop.
    mean_rent_by_class = berlinDf_select.groupby(c, sort=False)["baseRent"].mean().round(0)
    data = {str(cl): mean_rent for cl, mean_rent in mean_rent_by_class.items()}
    plot_barh(data, c)


In [None]:
# Outlier removal:
#     - rows with baseRent above 5000 are discarded
#     - rows with noRooms above 10 are discarded
# Training without ["hasKitchen", "cellar", "garden", "balcony"] is left as a later experiment.

# Index labels of the rows exceeding the rent limit, dropped from the selection.
rent_outliers = berlinDf_select.index[berlinDf_select['baseRent'] > 5000]
berlinDf_final = berlinDf_select.drop(rent_outliers)

# Same again for the room limit, evaluated on the already-reduced frame.
room_outliers = berlinDf_final.index[berlinDf_final['noRooms'] > 10]
berlinDf_final = berlinDf_final.drop(room_outliers)

berlinDf_final


In [None]:
# Report the number of missing values left in each column of the final frame.
print("Final data:")
final_na_counts = berlinDf_final.isna().sum()
display(final_na_counts)

# Remove the cap on displayed columns (a global pandas option) before
# rendering the final frame as the cell output.
pd.set_option('display.max_columns', None)
berlinDf_final