In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root_dir, name) for name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# *Riga Real Estate Review*

In [None]:
# Introduce the review and list the questions it sets out to answer.
print('In this review I will analyse the real estate in Riga.')
print()
research_questions = (
    'Which districts and house types are the most offered for sale and for rent?',
    'What is the average price of objects for sale and for rent?',
    'Which district has the lowest and which has the highest sale / rental price?',
    'How does floor level, house type or condition affects the sale or rental price??',
)
print('The main task is to answer these questions:')
for question in research_questions:
    print(question)
print()


#### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#### Loading the dataset (csv file)

In [None]:
# Read the listings CSV from the Kaggle input folder and preview the first two rows.
csv_path = '/kaggle/input/riga-real-estate-dataset/riga_re.csv'
rre = pd.read_csv(csv_path)
rre.head(n=2)

#### Shape of dataset

In [None]:
# Report how many rows and columns the loaded frame has.
nRow, nCol = len(rre.index), len(rre.columns)
print(f'There are {nRow} rows and {nCol} columns in this dataset')

In [None]:
# Show the column labels of the frame.
print('Column names: \n' + str(rre.columns))

#### Full summary of the dataset

In [None]:
# Print the frame summary produced by DataFrame.info().
rre.info() 

#### Statistical details of the dataset

In [None]:
# Display the descriptive statistics produced by DataFrame.describe().
rre.describe()  

#### Data cleaning

In [None]:
# Remove the columns that the rest of the notebook does not use.
print('After reviewing the data, I decided not to use columns "street", "house_seria", "lat" and "lon", so I am deleting them:')
unused_columns = ['street', 'house_seria', 'lat', 'lon']
rre = rre.drop(columns=unused_columns)
rre.head()

In [None]:
# Drop every row that has a missing value in any of the columns used by the analysis.
print('Also I am deleting Nan values:')
required_columns = [
    'op_type', 'district', 'rooms', 'area',
    'floor', 'total_floors', 'house_type', 'price',
]
rre = rre.dropna(subset=required_columns)
rre.head(n=2)

In [None]:
# Inspect the distinct values of "rooms" and how often each occurs.
print('I checked what values are in the numeric columns and I see that there is a word "Citi" (in Latvian "Other") among the values in the "rooms" column - only 14 entries, so I will continue to use data without this value.')
rooms_counts = rre['rooms'].value_counts()
rooms_counts

In [None]:
# Keep only the rows whose "rooms" value is not the placeholder word 'Citi'.
rre = rre.loc[rre['rooms'].ne('Citi')]

#### Changing the data type

In [None]:
print(f'Need to change data type in columns "rooms", "floor","total_floors" and "area" from float to integer.')
# Cast all four columns in one call. DataFrame.astype returns a new frame, so
# nothing is assigned onto the filtered frame produced by the previous cell
# (column assignment on such a frame can trigger SettingWithCopyWarning), and
# re-running this cell gives the same result.
integer_columns = ['rooms', 'area', 'floor', 'total_floors']
rre = rre.astype({column: 'int' for column in integer_columns})
rre.head(2)

#### Merging two columns, creating a new column

In [None]:
print('I also want the information about the floor of the object and how many floors there are to be in one column. \nI am creating a new column "Floor_total".')

def floor_total(s):
    """Return "<floor> / <total_floors>" as a single string.

    s: any row-like object that supports s['floor'] and s['total_floors'].
    """
    parts = (s['floor'], s['total_floors'])
    return " / ".join(str(part) for part in parts)
# Preview the combined value on the first few rows only. The original also
# evaluated floor_total on a single row and threw the result away, and ran the
# row-wise apply over the whole frame here even though the next cell repeats it.
rre.head().apply(floor_total, axis='columns')

In [None]:
# Build the "floor / total_floors" text with vectorised string concatenation
# instead of a Python-level call per row; the resulting strings are the same
# as str(floor) + " / " + str(total_floors).
rre['floor_total'] = rre['floor'].astype(str) + " / " + rre['total_floors'].astype(str)

In [None]:
print('Also I am creating a new column with square meter price and name it "sqm_price":')
# Price divided by area, rounded to a whole number.
rre['sqm_price'] = rre['price'].div(rre['area']).round()
# Round the price to two decimals (author's note, translated: "does not change
# to two digits after the decimal point").
rre['price'] = rre['price'].round(2)
rre.head(n=2)

In [None]:
# Count the listings per operation type.
print('I see that there are mostly  For sale" and "For rent" operation type.')
print('So I will use only data of these to operation types, because the other are insignificant:')
op_type_counts = rre['op_type'].value_counts()
op_type_counts

In [None]:
# Keep only 'For sale' / 'For rent' listings, then build one frame per
# operation type because later cells analyse them separately.
sale_or_rent = rre['op_type'].isin(['For sale', 'For rent'])
rre = rre.loc[sale_or_rent]
For_sale = rre.loc[rre['op_type'].eq('For sale')]
For_rent = rre.loc[rre['op_type'].eq('For rent')]

#### Checking for null values

In [None]:
# Number of missing values per column.
rre.isna().sum()

In [None]:
# Mark the end of the cleaning section.
print('Data is cleaned!')

## General overview of the data

#### Unique values

In [None]:
# Count the distinct values in every column.
print('How many unique values has each column:')
unique_per_column = rre.nunique()
unique_per_column

In [None]:
print('In this graph we see how many objects are for sale and how many for rent:')
sns.set_theme()  # seaborn theme (grid); stays active for the rest of the notebook
fig, ax = plt.subplots(figsize=(10, 4))
# One bar per operation type, drawn on the axes created above.
sns.countplot(data=rre, x="op_type", palette='magma', ax=ax)

ax.set_title("Number of objects by operation type")
ax.set_xlabel("Operation type")
ax.set_ylabel("Quantity")


##### The most popular values:

In [None]:
# For each column of interest, find the value that occurs most often.
most_frequent = {
    column: rre[column].value_counts().idxmax()
    for column in ('district', 'rooms', 'area', 'floor', 'house_type')
}
max_district = most_frequent['district']
max_rooms = most_frequent['rooms']
max_area = most_frequent['area']
max_floor = most_frequent['floor']
max_house_type = most_frequent['house_type']

print(f'The most popular district:        {max_district}')
print(f'The most popular number of rooms: {max_rooms}')
print(f'The most popular area:            {max_area}')
print(f'The most popular floor:           {max_floor}')
print(f'The most popular house type:      {max_house_type}')

#### Prices

In [None]:
# Min / mean / max of "price", computed separately for sale and rent listings.
sale_prices = rre.loc[rre['op_type'] == 'For sale', 'price']
price_for_sale_min = sale_prices.min()
print(f'The lowest price for sale:  {round(price_for_sale_min)}')
price_for_sale_mean = sale_prices.mean()
print(f'Average price for sale:     {round(price_for_sale_mean)}')
price_for_sale_max = sale_prices.max()
print(f'The highest price for sale: {round(price_for_sale_max)}')
print()
rent_prices = rre.loc[rre['op_type'] == 'For rent', 'price']
price_for_rent_min = rent_prices.min()
print(f'The lowest price for rent:   {round(price_for_rent_min)}')
price_for_rent_mean = rent_prices.mean()
print(f'Average price for rent:      {round(price_for_rent_mean)}')
price_for_rent_max = rent_prices.max()
print(f'The highest price for rent:  {round(price_for_rent_max)}')
print()

In [None]:
# Collect the price statistics into small frames: one combined (rounded) table
# and one transposed single-column frame per operation type.
stat_labels = ['Min', 'Avg', 'Max']
sale_price_stats = [price_for_sale_min, price_for_sale_mean, price_for_sale_max]
rent_price_stats = [price_for_rent_min, price_for_rent_mean, price_for_rent_max]

min_avg_max = pd.DataFrame(
    [sale_price_stats, rent_price_stats],
    columns=stat_labels,
    index=['For sale', 'For rent'],
).round()

fs = pd.DataFrame([sale_price_stats], columns=stat_labels, index=['For sale']).T

fr = pd.DataFrame([rent_price_stats], columns=stat_labels, index=['For rent']).T

In [None]:
# Draw one annotated Min/Avg/Max bar chart per operation type.
# The two charts used to be copy-pasted blocks (with no-op bare `df`
# expressions in between, and the rent title missing size=14); a single loop
# keeps them consistent.
Price = ['Min', 'Avg', 'Max']
price_charts = (
    ("Price for sale", [price_for_sale_min, price_for_sale_mean, price_for_sale_max]),
    ("Price for rent", [price_for_rent_min, price_for_rent_mean, price_for_rent_max]),
)

for chart_title, Eur in price_charts:
    df = pd.DataFrame({"Price": Price, "Eur": Eur})

    plt.figure(figsize=(10, 6))
    splot = sns.barplot(x="Price", y="Eur", data=df, palette='bright')
    # Write each bar's value just above the bar (9 points above its top).
    for p in splot.patches:
        splot.annotate(format(p.get_height(), '.1f'),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       xytext=(0, 9),
                       textcoords='offset points')
    plt.xlabel("Price", size=14)
    plt.ylabel("Eur", size=14)
    plt.title(chart_title, size=14)

#### Area

In [None]:
# Min / mean / max of "area", computed separately for sale and rent listings.
sale_areas = rre.loc[rre['op_type'] == 'For sale', 'area']
area_for_sale_min = sale_areas.min()
print(f'The smallest area for sale:  {round(area_for_sale_min)}')
area_for_sale_mean = sale_areas.mean()
print(f'Average area for sale:       {round(area_for_sale_mean)}')
area_for_sale_max = sale_areas.max()
print(f'The largest area for sale:   {round(area_for_sale_max)}')
print()
rent_areas = rre.loc[rre['op_type'] == 'For rent', 'area']
area_for_rent_min = rent_areas.min()
print(f'The smallest area for rent:   {round(area_for_rent_min)}')
area_for_rent_mean = rent_areas.mean()
print(f'Average area for rent:        {round(area_for_rent_mean)}')
area_for_rent_max = rent_areas.max()
print(f'The largest area for rent:    {round(area_for_rent_max)}')

In [None]:
# Collect the area statistics into small frames: one combined (rounded) table
# and one transposed single-column frame per operation type.
area_stat_labels = ['Min', 'Avg', 'Max']
sale_area_stats = [area_for_sale_min, area_for_sale_mean, area_for_sale_max]
rent_area_stats = [area_for_rent_min, area_for_rent_mean, area_for_rent_max]

min_avg_max_area = pd.DataFrame(
    [sale_area_stats, rent_area_stats],
    columns=area_stat_labels,
    index=['For sale', 'For rent'],
)
min_avg_max_area = min_avg_max_area.round()

fsa = pd.DataFrame([sale_area_stats], columns=area_stat_labels, index=['For sale'])
fsa = fsa.T

# Fix: the rent frame was labelled 'For sale' (copy-paste slip); it holds the
# rent statistics, matching how the price frame `fr` is labelled.
fra = pd.DataFrame([rent_area_stats], columns=area_stat_labels, index=['For rent'])
fra = fra.T

In [None]:
# Draw one annotated Min/Avg/Max bar chart of the area per operation type.
# Replaces two copy-pasted blocks (and their no-op bare `df` expressions)
# with a single loop.
Area = ['Min', 'Avg', 'Max']
area_charts = (
    ("Area for sale", [area_for_sale_min, area_for_sale_mean, area_for_sale_max]),
    ("Area for rent", [area_for_rent_min, area_for_rent_mean, area_for_rent_max]),
)

for chart_title, sq_m in area_charts:
    df = pd.DataFrame({"Area": Area, "sq_m": sq_m})

    plt.figure(figsize=(10, 6))
    splot = sns.barplot(x="Area", y="sq_m", data=df, palette='bright')
    # Write each bar's value just above the bar (9 points above its top).
    for p in splot.patches:
        splot.annotate(format(p.get_height(), '.1f'),
                       (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center',
                       xytext=(0, 9),
                       textcoords='offset points')
    plt.xlabel("Area", size=14)
    plt.ylabel("sq.m.", size=14)
    plt.title(chart_title, size=14)

#### Average price by number of rooms

In [None]:
print(f'This graph confirms the assumption that the price will increase as the number of rooms increases, which usually means a larger living space.')

line, ax = plt.subplots(figsize=(10, 6))
# sns.barplot aggregates "price" per "rooms" itself (mean by default), so the
# separate groupby(...).mean() whose result was discarded has been removed.
# seed=0 makes the bootstrapped error bars identical on every run.
# NOTE(review): `rre` holds both sale and rent listings, so each bar averages
# sale prices together with monthly rents - confirm this mix is intended.
sns.barplot(data=rre, x="rooms", y="price", palette='pastel', seed=0, ax=ax)
ax.set_title("Average price by number of rooms", size=14)
ax.set_xlabel("Number of rooms")
ax.set_ylabel("Price Eur")


### Overview of districts

In [None]:
# Count the distinct districts, then plot listings per district split by operation type.
unique_district_len = len(pd.unique(rre['district']))
print(f'There are {unique_district_len} districts in the dataset')
print('Number of objects by district')
print('The largest number of objects is offered in the center.')

line, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='district', hue='op_type', data=rre, palette='Set2', ax=ax)
plt.xticks(rotation=90, size=10)  # vertical labels for the district names
ax.set_title("Number of objects by district", size=16)
ax.set_xlabel("Districts")
ax.set_ylabel("Number of objects")

#### Average price by district

In [None]:
print(f'In these graphs we see that the highest average selling price is in Klīversala, Kīpsala, Bukulti districts,\nbut the rental price in Klīversala district is quite low. Therefore, if you think about earning rent, it really doesnt worth, because the return on rent will be small.') 
print(f'However, as we can see from the graph above, VEF has the lowest supply (only 2 objects). So, this amount of data is not sufficient to make an objective assumptions.')
# Mean price per district, separately for sale and rent listings.
avgpriceforsale = For_sale.groupby('district').price.mean()
avgpriceforrent = For_rent.groupby('district').price.mean()

# One line chart per operation type. The `palette` argument was dropped:
# no `hue` is assigned, so seaborn ignores it (and may warn about it).
district_price_charts = (
    ("Average selling price by district", avgpriceforsale),
    ("Average rental price by district", avgpriceforrent),
)
for chart_title, avg_price in district_price_charts:
    line, ax = plt.subplots(figsize=(14, 6))
    sns.lineplot(data=avg_price, ax=ax)
    plt.xticks(rotation=90, size=10)
    ax.set_title(chart_title, size=16)
    ax.set_xlabel("Districts", size=12)
    ax.set_ylabel("Price Eur", size=12)

###  Overview by house type

In [None]:
print(f'The following table shows the prices of dwellings for sale and for rent and the number of objects by type of house:')
# Min price, max price and row count per (operation type, house type).
# String aggregation names replace the Python builtins min/max/len: pandas
# deprecates passing builtin callables to agg, and 'size' gives the same
# per-group row count that len did.
grouped_by_optype_and_housetype = (
    rre.groupby(['op_type', 'house_type'])['price']
    .agg(['min', 'max', 'size'])
    .rename(columns={'size': 'Number of objects'})
)
grouped_by_optype_and_housetype

In [None]:
print('Masonry, Brick and Panel houses are the most offered.\nWood and Panel-Brick are the least available.')
# Listings per house type, split by operation type.
line, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x='house_type', hue='op_type', data=rre, palette='bright', ax=ax)
ax.set_title("Objects for sale and rent by type of house", size=14)
ax.set_xlabel("House type")
ax.set_ylabel("Objects")

In [None]:
print('Average price by house type:')
# Mean price per (operation type, house type), rounded to two decimals.
mean_price_by_type = rre.groupby(['op_type', 'house_type'])['price'].mean()
mean_price_by_type.round(2)

In [None]:
print('Masonry house type is the most expensive and Wood house is the cheapest to sell. In the rental supply the most expensive is Masonry house type, but the cheapest Panel house type. ')

# Rounded mean price per house type, for sale and for rent listings.
house_type_by_price_for_sale = For_sale.groupby('house_type')['price'].mean().round()
house_type_by_price_for_rent = For_rent.groupby('house_type')['price'].mean().round()

# Selling prices: one large dot (s=250) per house type.
line, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=house_type_by_price_for_sale, s=250, ax=ax)
plt.xticks(size=12)
ax.set_title("Average selling price by house type", size=18)
ax.set_xlabel("House type", size=14)
ax.set_ylabel("Price Eur", size=14)

# Rental prices: same layout.
line, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=house_type_by_price_for_rent, s=250, ax=ax)
plt.xticks(size=12)
ax.set_title("Average rental price by house type", size=18)
ax.set_xlabel("House type", size=14)
ax.set_ylabel("Price Eur", size=14)

In [None]:
# NOTE(review): the printed text speaks of the price per square meter, but both
# plots below use the "price" column (not "sqm_price") - confirm which is intended.
print('Here is the average price per square meter by house type.')
print('We see that the highest is the price of Masonry and Panel-Brick house types for sale and the lowest is the price of Wood house type.')
# Distribution of the selling price for each house type.
line, ax = plt.subplots(figsize=(12, 6))
sns.boxenplot(x='price', y='house_type', data=For_sale, palette='Set2', ax=ax)
plt.xticks(size=12)
ax.set_title("Selling price by house type", size=16)
ax.set_xlabel("Price Eur", size=12)
ax.set_ylabel("House type", size=12)

print('The highest rental price is also for Masonry and Panel-Brick house types, and the lowest is for Wood and Panel.')
# Distribution of the rental price for each house type.
line, ax = plt.subplots(figsize=(12, 6))
sns.boxenplot(x='price', y='house_type', data=For_rent, palette='Set2', ax=ax)
plt.xticks(size=12)
ax.set_title("Rental price by house type", size=16)
ax.set_xlabel("Price Eur", size=12)
ax.set_ylabel("House type", size=12)

In [None]:
print('Here is another way to display the average price and the number of objects for each type of house:')
# One price histogram per house type, for sale listings.
g = (
    sns.FacetGrid(For_sale, col="house_type")
    .map(sns.histplot, "price")
    .set_axis_labels("price", "Count")
)
g.fig.suptitle("Objects for sale", size=16)

# Same grid for rent listings.
g = (
    sns.FacetGrid(For_rent, col="house_type")
    .map(sns.histplot, "price")
    .set_axis_labels("price", "Count")
)
g.fig.suptitle("Objects for rent", size=16)

In [None]:
print('I also want to see what the sale prices are by house type and floor')
# Price against floor for sale listings, coloured by house type.
g = (
    sns.FacetGrid(For_sale, col="op_type", hue="house_type", height=8, aspect=.99, palette='bright')
    .map(sns.scatterplot, "price", "floor", alpha=.9)
    .add_legend()
)
print('We see that dominate Brick-Panel and Panel-Brick on the 1st - 6th floors up to about 250,000 Eur and Masonry houses from about 180,000 Eur to 400,000 Eur ')

In [None]:
print('In the rental supply the most objects are in Brick and Panel_Brick houses on the 1st to 9th floor with price up to 700 Eur.')
# Price against floor for rent listings, coloured by house type.
g = sns.FacetGrid(
    For_rent, col="op_type", hue="house_type",
    height=8, aspect=.99, palette='bright',
).map(sns.scatterplot, "price", "floor", alpha=.9)
g.add_legend()

#### The impact of the floor on the price

In [None]:
print(f'We can see that most objects are offered on the 2nd floor, and from 10 floors and above the supply is reduced to a minimum')
# Number of listings per floor.
floor_value_counts = rre['floor'].value_counts()
line, ax = plt.subplots(figsize=(8, 2), dpi=300)
# Draw on the axes created above. Previously the Axes returned by seaborn was
# stored back into `floor_value_counts`, replacing the Series with a
# different kind of object, while `ax` stayed unused.
sns.lineplot(data=floor_value_counts, ax=ax)
plt.xticks(rotation=0, size=10)

ax.set_title("Number of objects by the floor", fontsize=10)
ax.set_xlabel("Floor")
ax.set_ylabel("Objects")


In [None]:
print(f'I want to look at the selling price per square meter by floor and confirm or refute the hypothesis that the apartments sold on the 1st floor are the cheapest.')
print()
print(f'So, we see that the sale price of the dwellings on the 1st floor is indeed at the lower limit, but also on the 8th, 9th and 11th floors the same low price as on the first floor.')
print(f'However, the rental price by the first floor is not affected at all. Price fluctuations start only from the 8th floor.')
# Rounded mean price per square metre for each floor, sale and rent separately.
fs_sqmp_mean = round(For_sale.groupby(['floor'])['sqm_price'].mean())
fr_sqmp_mean = round(For_rent.groupby(['floor'])['sqm_price'].mean())

# One line chart per operation type. The `palette` argument was dropped:
# no `hue` is assigned, so seaborn ignores it (and may warn about it).
sqm_price_charts = (
    ("Average sq. m. selling price by the floor", fs_sqmp_mean),
    ("Average sq. m. rental price by the floor", fr_sqmp_mean),
)
for chart_title, sqm_price_by_floor in sqm_price_charts:
    line, ax = plt.subplots(figsize=(14, 4))
    sns.lineplot(data=sqm_price_by_floor, ax=ax)
    ax.set_title(chart_title, size=16)
print()

#### The price by the area

In [None]:
print(f'I want to see how the price changes as the housing area changes. I guess as the area increases, so does the price.\nHowever, according to the graph, we can see that the curve is quite wavy.')
print(f'I think this is because the price is also affected by  the type of house, the area, and the floor.')
line, ax = plt.subplots(figsize=(16, 4))
# Several listings share the same area, so seaborn aggregates them and draws a
# bootstrapped confidence band; seed=0 makes that band identical on every run.
sns.lineplot(data=For_sale, x="area", y="price", seed=0, ax=ax)

ax.set_title("Impact of the area on price")
ax.set_xlabel("Area")
ax.set_ylabel("Price Eur")


In [None]:
print('To be more precise, I filtered out only 50 sq.m. average house prices - the obvious price difference can be seen here, although the housing area is identical.')
# Sale listings whose area is exactly 50, then their mean price per house type.
area50 = For_sale.loc[For_sale['area'].eq(50)]
grouped_area50 = area50.groupby('house_type')['price'].mean()
grouped_area50.round(2)

In [None]:
print('So, the filtering out of different types of houses showed that the type of house significantly influences the price.')
# Plot the per-house-type mean price computed in the previous cell.
line, ax = plt.subplots(figsize=(16, 4))
sns.lineplot(data=grouped_area50, ax=ax)
ax.set_title("Average cost of 50 sq.m. housing by house type")
ax.set_xlabel("House type")
ax.set_ylabel("Price Eur")

In [None]:
print(f'Below we can review what the average area is offered in each area.')
print(f'We see that we would have almost no choice if we were looking for housing over 100 sq.m., whether buying or renting - the offer is very limited.')
# Mean area per district, for sale and for rent listings.
district_by_area_mean_for_sale = For_sale.groupby('district').area.mean()
district_by_area_mean_for_rent = For_rent.groupby('district').area.mean()

# One line chart per operation type. The `palette` argument was dropped:
# no `hue` is assigned, so seaborn ignores it (and may warn about it).
district_area_charts = (
    ("Average area for sale by district", district_by_area_mean_for_sale),
    ("Average area for rent by district", district_by_area_mean_for_rent),
)
for chart_title, mean_area in district_area_charts:
    line, ax = plt.subplots(figsize=(14, 6))
    sns.lineplot(data=mean_area, ax=ax)
    plt.xticks(rotation=90, size=10)
    ax.set_title(chart_title, size=18, color='b')
    ax.set_xlabel("Districts", size=12, color='b')
    ax.set_ylabel("Area sq.m.", size=12, color='b')


#### Condition

In [None]:
print("Let's see how many objects are offered with all amenities, partial and no amenities at all.")
# Number of listings (non-null prices) per condition value.
objects_per_condition = rre.groupby('condition')['price'].count()
objects_per_condition

In [None]:
# Histogram of listings per condition, coloured by operation type.
sns.displot(rre, x='condition', hue='op_type', palette='bright')
plt.title("Number of objects according to the offered amenities")

In [None]:
print('Although only 13 objects are offered without amenities, I want to test another hypothesis that objects with all amenities are more expensive, with partial amenities - cheaper, and without all amenities - the cheapest. However, we see that the price of objects without amenities is not lower, but higher than with partial amenities.')
print('Why is that? These prices are likely to be influenced by other factors, e.g. district.')
# Mean price per condition over all listings (sale and rent together).
condition = rre.groupby('condition')['price'].mean()
condition

In [None]:
print(f'Since I have just looked at the prices without distinguishing whether it is a rented or sold object, this information should be distinguished.')
# NOTE(review): wording changed from "cheaper" to "more expensive" so the
# sentence agrees with its own "remains the same" and with the preceding cell,
# which states that no-amenity prices are higher than partial-amenity ones.
# Confirm against the plotted means.
print(f'I look at the data of objects for sale and rent and I see that the situation with objects for sale remains the same - apartments without any amenities are somehow more expensive than with partial amenities, but there are no options in the rental offer without amenities at all.')

# Mean price per condition, for sale and for rent listings.
condition_fs = For_sale.groupby('condition').price.mean()
condition_fr = For_rent.groupby('condition').price.mean()

# One line chart per operation type. The `palette` argument was dropped:
# no `hue` is assigned, so seaborn ignores it (and may warn about it).
condition_charts = (
    ("Selling price by condition", condition_fs),
    ("Rental price by condition", condition_fr),
)
for chart_title, mean_price in condition_charts:
    line, ax = plt.subplots(figsize=(12, 4))
    sns.lineplot(data=mean_price, ax=ax)
    ax.set_title(chart_title, size=16)
    ax.set_xlabel("Condition")
    ax.set_ylabel("Price Eur")

In [None]:
# Summary of findings. The price figures and the cheapest / most expensive
# districts are taken from the values computed earlier in the notebook
# (price_for_*_mean, avgpriceforsale, avgpriceforrent) instead of being typed
# in by hand, so the text cannot drift from the data.
print(f'After performing this analysis, I found out that the largest supply of objects for sale and for rent is in the center. These are usually 2-room dwellings on the 2nd floor of a brick house.') 
print()
print(f'The average price for sale is {price_for_sale_mean:.0f} Eur, for rent - {price_for_rent_mean:.0f} Eur.') 
print(f'The lowest average housing sale price is in {avgpriceforsale.idxmin()} district - {avgpriceforsale.min():.0f} Eur. The largest - in {avgpriceforsale.idxmax()} district - {avgpriceforsale.max():.0f} Eur.')
print(f'Lowest average rental price in {avgpriceforrent.idxmin()} district for {avgpriceforrent.min():.0f} Eur, the highest - in {avgpriceforrent.idxmax()} district - {avgpriceforrent.max():.0f}.')
print()
print(f'I found out that the floor of the house affects the price of the objects for sale - the houses on the 1st floor are cheaper. Housing on the 8th, 9th and 11th floors also falls into the category of cheaper. From the 12th floor up, the price goes up a bit. But the price of rented housing is not affected at all by the first floor. The rental price remains stable from the 1st to the 7th floor inclusive. The largest supply on the market is housing from 1 to 6 floors.')
print()     
print(f'The type of house also has an obvious effect on the price - Masonry is the most expensive house type and Wood houses are the cheapest. The most expensive rental is in a Masonry house type and the cheapest in a Panel house type.')
print(f'According to this dataset, there are no homes without amenities in the rental offer at all.')
# NOTE(review): "cheaper" changed to "more expensive" - the original sentence
# contradicted itself ("does not lower the purchase price ... even cheaper")
# and the condition section, which reports higher prices without amenities.
print(f'The fact that there are no amenities at all does not lower the purchase price - housing without amenities is even more expensive than with partial amenities.')

<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTZ7ceuOxKPLfJDiNIcnmxgtp9cmy6VvdW1RQ&usqp=CAU" width="1000">
