# <font color="deeppink">Airbnb Berlin</font>

This notebook contains the following sections:

 - 1. Import libraries
 - 2. Import data
 - 3. Data Wrangling
 - 4. Data Consistency Checks
 - 5. Export the dataframe

## 1. Import libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. It can be overridden through the AIRBNB_BERLIN_PATH
# environment variable so the notebook also runs on other machines; when the
# variable is not set, the original local path is used.
path = os.environ.get('AIRBNB_BERLIN_PATH', r'/Users/berk/Airbnb_Berlin')

## 2. Import data

In [3]:
# Load the raw Airbnb listings CSV from the project's original-data folder

listings_csv = os.path.join(path, 'Data', 'Original_Data', 'listings.csv')
df_list = pd.read_csv(listings_csv)

In [4]:
# Preview the first rows of the raw listings
df_list.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.53471,13.4181,Entire home/apt,83,63,147,2021-01-01,0.88,1,303,0,First name and Last name: Nicolas Krotz Conta...
1,251423,2-room I 50 m² I Comfort I Balcony,1023063,Raja Jooseppi,Mitte,Brunnenstr. Nord,52.53867,13.39712,Entire home/apt,100,3,14,2023-01-29,0.11,8,93,10,01/Z/NA/003420-15
2,9991,Geourgeous flat - outstanding views,33852,Philipp,Pankow,Prenzlauer Berg Südwest,52.53269,13.41805,Entire home/apt,180,6,8,2020-01-04,0.09,1,0,0,03/Z/RA/003410-18
3,251486,I 2 Room I 41 m² I Balcony,1023063,Raja Jooseppi,Mitte,Brunnenstr. Süd,52.53667,13.39505,Entire home/apt,108,3,31,2023-03-14,0.24,8,104,28,01/Z/ZA/006181-16
4,251493,2-room I 41 m² I Terrace > Mitte,1023063,Raja Jooseppi,Mitte,Brunnenstr. Nord,52.53881,13.39654,Entire home/apt,98,3,15,2023-02-26,0.12,8,98,13,01/Z/ZA/006180-16


In [5]:
# Check the shape (rows, columns) of the dataframe

df_list.shape

(12049, 18)

In [6]:
# Check the column dtypes and non-null counts of the dataframe

df_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12049 entries, 0 to 12048
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              12049 non-null  int64  
 1   name                            12035 non-null  object 
 2   host_id                         12049 non-null  int64  
 3   host_name                       12039 non-null  object 
 4   neighbourhood_group             12049 non-null  object 
 5   neighbourhood                   12049 non-null  object 
 6   latitude                        12049 non-null  float64
 7   longitude                       12049 non-null  float64
 8   room_type                       12049 non-null  object 
 9   price                           12049 non-null  int64  
 10  minimum_nights                  12049 non-null  int64  
 11  number_of_reviews               12049 non-null  int64  
 12  last_review                     

## 3. Data Wrangling

In [7]:
# Give the ambiguous columns more descriptive names

column_renames = {
    'id': 'listing_id',
    'name': 'listing_name',
    'number_of_reviews_ltm': 'review_in_the_last_year',
}
df_list = df_list.rename(columns = column_renames)

In [8]:
# Preview the dataframe to confirm the new column names
df_list.head()

Unnamed: 0,listing_id,listing_name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,review_in_the_last_year,license
0,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.53471,13.4181,Entire home/apt,83,63,147,2021-01-01,0.88,1,303,0,First name and Last name: Nicolas Krotz Conta...
1,251423,2-room I 50 m² I Comfort I Balcony,1023063,Raja Jooseppi,Mitte,Brunnenstr. Nord,52.53867,13.39712,Entire home/apt,100,3,14,2023-01-29,0.11,8,93,10,01/Z/NA/003420-15
2,9991,Geourgeous flat - outstanding views,33852,Philipp,Pankow,Prenzlauer Berg Südwest,52.53269,13.41805,Entire home/apt,180,6,8,2020-01-04,0.09,1,0,0,03/Z/RA/003410-18
3,251486,I 2 Room I 41 m² I Balcony,1023063,Raja Jooseppi,Mitte,Brunnenstr. Süd,52.53667,13.39505,Entire home/apt,108,3,31,2023-03-14,0.24,8,104,28,01/Z/ZA/006181-16
4,251493,2-room I 41 m² I Terrace > Mitte,1023063,Raja Jooseppi,Mitte,Brunnenstr. Nord,52.53881,13.39654,Entire home/apt,98,3,15,2023-02-26,0.12,8,98,13,01/Z/ZA/006180-16


## 4. Data Consistency Checks

In [9]:
# Summary statistics of the numeric columns, used to spot implausible values

df_list.describe()

Unnamed: 0,listing_id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,review_in_the_last_year
count,12049.0,12049.0,12049.0,12049.0,12049.0,12049.0,12049.0,9773.0,12049.0,12049.0,12049.0
mean,1.493366e+17,126239600.0,52.509474,13.403401,109.135032,46.419952,35.533405,1.063864,6.166736,113.615155,9.547182
std,2.922496e+17,148639900.0,0.033557,0.066236,833.391109,55.93486,76.107987,1.87028,17.168217,130.086139,21.897381
min,3176.0,1581.0,52.36904,13.11815,0.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,14243390.0,12320120.0,52.49001,13.365,48.0,2.0,1.0,0.12,1.0,0.0,0.0
50%,32420370.0,50736150.0,52.50973,13.41157,75.0,8.0,7.0,0.45,1.0,48.0,1.0
75%,52052420.0,206699600.0,52.53222,13.43836,120.0,92.0,32.0,1.4,3.0,231.0,10.0
max,8.498932e+17,505799700.0,52.65611,13.72139,89105.0,1100.0,1806.0,70.73,133.0,365.0,703.0


In [10]:
# Count the missing values in each column of the dataframe

df_list.isnull().sum()

listing_id                           0
listing_name                        14
host_id                              0
host_name                           10
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       2276
reviews_per_month                 2276
calculated_host_listings_count       0
availability_365                     0
review_in_the_last_year              0
license                           5082
dtype: int64

In [11]:
# Address the missing values: drop every row that has a missing value in any
# of the columns reported by the null check (one call instead of five
# separate in-place drops; the resulting dataframe is the same).
# NOTE(review): this shrinks the data from 12049 to 6065 rows, mostly because
# 'license' is missing for 5082 listings - confirm that dropping these rows
# (rather than keeping or flagging them) is acceptable for the analysis.

columns_with_missing = [
    'listing_name',
    'host_name',
    'last_review',
    'reviews_per_month',
    'license',
]
df_list = df_list.dropna(subset = columns_with_missing)

In [16]:
# Check for missing values again after dropping the incomplete rows

df_list.isnull().sum()

listing_id                        0
listing_name                      0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
review_in_the_last_year           0
license                           0
dtype: int64

In [17]:
# Check for mixed-type data in the dataframe.
# A column name is printed when its values are not all of one Python type,
# i.e. when at least one row's type differs from the first row's type.
# Series.map is used instead of DataFrame.applymap, which is deprecated in
# recent pandas releases; counting distinct types also avoids indexing the
# first row, so an empty dataframe simply prints nothing.

for col in df_list.columns:
    value_types = df_list[col].map(type)
    if value_types.nunique() > 1:
        print(col)

There is no mixed-type data

In [18]:
# Collect the rows that are exact duplicates of an earlier row

duplicate_mask = df_list.duplicated()
df_list_dups = df_list[duplicate_mask]

In [19]:
# Number of duplicated rows (0 means there are no duplicates).
# len() counts rows; .size would report rows x columns instead.
len(df_list_dups)

0

In [20]:
# Shape (rows, columns) of the dataframe after the consistency checks
df_list.shape

(6065, 18)

## 5. Export the dataframe

In [21]:
# Write the cleaned listings to the prepared-data folder as CSV.
# NOTE(review): no `index` argument is passed, so the row index is written as
# the first (unnamed) column - confirm downstream readers expect that.
cleaned_csv = os.path.join(path, 'Data', 'Prepared_Data', 'list_cleaned.csv')
df_list.to_csv(cleaned_csv)