# Data preparation pipeline

In [2]:
# import libraries
import pandas as pd
import numpy as np

In [22]:
# load dataset
data = pd.read_csv('Comune-di-Milano-Strutture-ricettive-alberghiere.csv',sep=';',encoding='unicode_escape')
data

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo
447,VLE TUNISIA N. 9 (z.d. 3),VLE,TUNISIA,9.0,2121.0,3.0,50.0,3;6;9;9;9,4,st. george hotel,,A;1;2;3;4,99.0,5;16;26;26;26,Albergo
448,VLE VITTORIO VENETO N. 30 (z.d. 2),VLE,VITTORIO VENETO,30.0,2107.0,2.0,15.0,,2,hotel casa mia,,,25.0,,Albergo
449,VLE ZARA N. 1 (z.d. 9),VLE,ZARA,1.0,1170.0,9.0,32.0,,4,casa albergo residence zara lagosta,,,64.0,,Residence


## Data profiling
Extracts metadata and statistics

In [None]:
print(data.dtypes)

In [None]:
#Check the number of unique values in each column
rows=data.shape[0]
print("Total rows:",rows)
for col in data.columns:
    print("Column",col,"has", data[col].nunique(), "unique values")

Ydata profiling

In [None]:
import sys
!{sys.executable} -m pip install -U "ydata-profiling[notebook]"
!pip install jupyter-contrib-nbextensions

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
!pip install dataprofiler

In [None]:
from ydata_profiling import ProfileReport
import pandas as pd
import json

In [None]:
profile= ProfileReport(data, title="data report")
profile

In [None]:
profile.to_file("data_report.html")

Do a report also fer the cleaned dataset

In [None]:
profile.to_file("data_report.json")

In [None]:
file = open("data_report.json")
jsonFile=json.load(file)

Functional dependencies

In [None]:
!git clone https://github.com/camillasancricca/DATADIQ.git

In [None]:
from DATADIQ import tane
from DATADIQ import ctane

In [None]:
source='Comune-di-Milano-Strutture-ricettive-alberghiere.csv'
tane.compute(source)

In [None]:
ctane.compute(source,0.5)

In [None]:
import sys
sys.path.append('SCRIPTS')
import fdtool

In [None]:
source='Comune-di-Milano-Strutture-ricettive-alberghiere.csv'
fdtool.main(source)

In [None]:
!pip install desbordante==2.3.2

In [None]:
import desbordante as db
import pandas as pd

In [None]:
df = data
algo = db.fd.algorithms.Default()
algo.load_data(table=df)
algo.execute()
print('FDs:')
for fd in algo.get_fds():
    print(fd)

Association rules

In [None]:
!pip install mlxtend pyECLAT efficient-apriori plotly

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import plotly.express as px
from mlxtend.frequent_patterns import fpgrowth
from pyECLAT import ECLAT
from DATADIQ import eff_apriori
import plotly.offline as pyo

In [None]:
#Put all items of each transactions into a list
records = []
for i in range (0, len(data)):
    records.append([str(data.values[i,j]) for j in range(0, 15)])

In [None]:
#Initializing the transactionEncoder
TE = TransactionEncoder()
array = TE.fit(records).transform(records)

In [None]:
#Building the data frame rows are logical and columns are the items have been purchased
transf_df = pd.DataFrame(array, columns = TE.columns_)
transf_df

In [None]:
#Drop NaN
basket_clean = transf_df.drop(['nan'], axis = 1)
basket_clean

In [None]:
#Chose 0.03 minimum support
a_rules = apriori(basket_clean, min_support = 0.03, use_colnames = True)
a_rules['length'] = a_rules['itemsets'].apply(lambda x: len(x))

In [None]:
#Frequent itemset
a_rules

In [None]:
#Chose 0.05 minimum confidence
rules = association_rules(a_rules, metric = 'confidence', min_threshold = 0.80)
rules

In [None]:
eff_apriori.rules(data,0.1,1)

## Data quality assessment
Calculate a number for each of the data quality dimensions

Duplication

In [None]:
data.duplicated().any()
#There are no rows exaclty duplicated in the dataset

Completeness

In [None]:
print("Total rows:",data.shape[0])
print(data.count()) #number of not null values

In [None]:
#Completeness
Null=data.isnull().sum().sum()
Not_null=data.count().sum()
Total=Null+Not_null
Completeness=Not_null/Total
Completeness = '{:.2%}'.format(Completeness)
Completeness

In [None]:
#Check if there are other common values representing NaN
np.sum(data.isin(['na','-','--','nan','null']))

Accuracy, timeliness and consistency cannot be assessed since we don't have the ground truth

Consistency if we know some dependencies

# Data Cleaning
## Data transformation/standardization

Data wrangling

In [None]:
data.info()

In ubicazione ci sono tipo via, nome, civico e Municipio, si possono usare questi dati per riempire i valori nulli delle colonne e poi droppare ubicazione

In [None]:
data["Ubicazione"].unique()

#si potrebbe separare la via dal numero

In [125]:
data["ZD"].unique()
data=data.rename(columns={'ZD':'Municipio'})

In [None]:
data[data['Municipio'].isnull()] # Si possono riempire con (z.d. )

In [None]:
data["Codice via"] #boh
data['Descrizione via'] #é IL NOME DELLA VIA

In [23]:
Split_1=data.Ubicazione.str.split(' ',n=1,expand=True)
data[['Tipologia','Altro']]=Split_1
data[data["Tipo via"]!=data["Tipologia"]] #Check that the old Tipo via is coherent
#Since they are coherent we can fix the null values
data["Tipo via"] = data["Tipologia"]
data = data.drop(["Tipologia"], axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ubicazione                    451 non-null    object 
 1   Tipo via                      451 non-null    object 
 2   Descrizione via               437 non-null    object 
 3   Civico                        421 non-null    float64
 4   Codice via                    437 non-null    float64
 5   ZD                            437 non-null    float64
 6   Camere                        450 non-null    float64
 7   Camere piano                  345 non-null    object 
 8   Categoria                     444 non-null    object 
 9   Insegna                       441 non-null    object 
 10  Piani totali                  187 non-null    float64
 11  Piano piano                   199 non-null    object 
 12  Posti letto                   450 non-null    float64
 13  Posti

In [24]:
Split_2=data.Altro.str.split('z.d. ',n=1,expand=True)
data[['Nomeecivico','Muni']]=Split_2
data["Muni"]=data["Muni"].str.slice(-2,-1)
data["Muni"].unique()


array(['6', None, '3', '1', '5', '4', '8', '7', '9', '2'], dtype=object)

In [25]:
data[data["ZD"].fillna(-1).astype(int).astype(str)!=data["Muni"]] #Check that the old Tipo via is coherent
#There are some inconsistencies
#data.loc[data["Municipio"].isna(), "Municipio"] = data["Muni"]
data.loc[data["Muni"].isna(), "Muni"] = data["ZD"]
data[data["ZD"].fillna(-1).astype(int).astype(str)!=data["Muni"].fillna(-1).astype(int).astype(str)]
data["ZD"]=data["Muni"]
data = data.drop(["Muni"], axis = 1)
data=data.rename(columns={'ZD':'Municipio'})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ubicazione                    451 non-null    object 
 1   Tipo via                      451 non-null    object 
 2   Descrizione via               437 non-null    object 
 3   Civico                        421 non-null    float64
 4   Codice via                    437 non-null    float64
 5   Municipio                     448 non-null    object 
 6   Camere                        450 non-null    float64
 7   Camere piano                  345 non-null    object 
 8   Categoria                     444 non-null    object 
 9   Insegna                       441 non-null    object 
 10  Piani totali                  187 non-null    float64
 11  Piano piano                   199 non-null    object 
 12  Posti letto                   450 non-null    float64
 13  Posti

In [26]:
data[data['Municipio'].isna()] #3 Null values are still present

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,Municipio,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Altro,Nomeecivico
1,codvia 0000 num.024 ; (),codvia,,,,,259.0,259.0,4,radisson blu hotel milan,,,518.0,518.0,Albergo,0000 num.024 ; (),0000 num.024 ; ()
206,VIA LORENTEGGIO num.278 ;,VIA,,,,,128.0,,4,IDEA HOTEL MILANO LORENTEGGIO,,,242.0,,Albergo,LORENTEGGIO num.278 ;,LORENTEGGIO num.278 ;
355,VIA STEPHENSON GIORGIO Int. 55,VIA,,,,,256.0,,4,,,,512.0,,Albergo,STEPHENSON GIORGIO Int. 55,STEPHENSON GIORGIO Int. 55


In [27]:
delim = ' N. | num.| Int.'
Split_3 = data.Nomeecivico.str.split(delim, n=1, expand=True, regex=True)
data[['Nome','Civ']]=Split_3
data['Nome'].unique()
data[data['Nome']!=data['Descrizione via']] #They are coherent
data['Descrizione via']=data['Nome']
data = data.drop(["Nome"], axis = 1)
data=data.rename(columns={'Descrizione via':'Nome via'})
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ubicazione                    451 non-null    object 
 1   Tipo via                      451 non-null    object 
 2   Nome via                      451 non-null    object 
 3   Civico                        421 non-null    float64
 4   Codice via                    437 non-null    float64
 5   Municipio                     448 non-null    object 
 6   Camere                        450 non-null    float64
 7   Camere piano                  345 non-null    object 
 8   Categoria                     444 non-null    object 
 9   Insegna                       441 non-null    object 
 10  Piani totali                  187 non-null    float64
 11  Piano piano                   199 non-null    object 
 12  Posti letto                   450 non-null    float64
 13  Posti

In [None]:
data = data.drop(["Altro"], axis = 1)
data = data.drop(["Nomeecivico"], axis = 1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ubicazione                    451 non-null    object 
 1   Tipo via                      451 non-null    object 
 2   Nome via                      451 non-null    object 
 3   Civico                        421 non-null    float64
 4   Codice via                    437 non-null    float64
 5   Municipio                     448 non-null    object 
 6   Camere                        450 non-null    float64
 7   Camere piano                  345 non-null    object 
 8   Categoria                     444 non-null    object 
 9   Insegna                       441 non-null    object 
 10  Piani totali                  187 non-null    float64
 11  Piano piano                   199 non-null    object 
 12  Posti letto                   450 non-null    float64
 13  Posti

In [29]:
data['Civ'].unique()
Split_4 = data.Civ.str.split(' ', n=1, expand=True)
data[['Num','Altro']]=Split_4
data

Unnamed: 0,Ubicazione,Tipo via,Nome via,Civico,Codice via,Municipio,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Civ,Num,Altro
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo,8 (,8,(
1,codvia 0000 num.024 ; (),codvia,0000,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo,024 ; (),024,; ()
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo,18 (,18,(
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3,25.0,,3,hotel buenos aires,,,39.0,,Albergo,26 (,26,(
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo,2 (,2,(
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo,6 (,6,(
447,VLE TUNISIA N. 9 (z.d. 3),VLE,TUNISIA,9.0,2121.0,3,50.0,3;6;9;9;9,4,st. george hotel,,A;1;2;3;4,99.0,5;16;26;26;26,Albergo,9 (,9,(
448,VLE VITTORIO VENETO N. 30 (z.d. 2),VLE,VITTORIO VENETO,30.0,2107.0,2,15.0,,2,hotel casa mia,,,25.0,,Albergo,30 (,30,(
449,VLE ZARA N. 1 (z.d. 9),VLE,ZARA,1.0,1170.0,9,32.0,,4,casa albergo residence zara lagosta,,,64.0,,Residence,1 (,1,(


In [30]:
data['Altro'].unique()

array(['(', ' ; ()', '; (', 'cinque stelle lusso; (',
       'sembra avere come indirizzo ufficiale corso buenos aires 42 ma ermes non riconosce il civico (',
       ';', 'subingresso con riduzione came; (',
       'r.t.a.con 23 alloggi e 24 p.l.; (', 'num.001 ; (', 'num.000 ; (',
       'suap; (', 'deleg. somministr. zheng ruile; (', '; ()',
       'residence cerva; (', 'numero civico 7/d; (', 'n. civico 19/b (',
       'iniziale; (', '3 e 4 piano; (', 'via goldoni 84 2^ piano; (',
       'licenza rilasciata dallo suap; (', 'angolo finocchiaro aprile; (',
       'annessa dipendenza 3 stelle; (',
       "attivita' promiscua rta e albe; (", '2^  e 4° piano; (',
       'rta due stelle-appartam 7-14pl; (', '1ø piano; (',
       'via san raffaele 7/9; (', 'Int. a (',
       'rilascio licenza dallo suap; (', '55', 'sportello unico; (',
       'licenza rilasciata dallo sport; (', 'ang. via petrocchi n. 1; (',
       '2ø piano; ('], dtype=object)

In [31]:
data = data.drop(["Altro"], axis = 1)
data = data.drop(["Civ"], axis = 1)

In [32]:
data[data["Num"].fillna(-1).astype(str)!=data["Civico"].fillna(-1).astype(int).astype(str)]
data["Civico"]=data["Num"]
data = data.drop(["Num"], axis = 1)
data["Civico"].unique()

array(['8', '024', '18', '26', '2', '33', '3', '15', '1', '48', '64', '9',
       '68', '84', '34', '011/a', '4', '74', '61', '78', '019', '004/6',
       '69', '091', '32', '43', '55', '5', '10', '002', '12', '14', '22',
       '012/14', '6', '31', '25', '13', '17', '20', '16', '125', '30',
       '121', '79', '086/88', '001/a', '24', '006/b', '21', '59', '27',
       '004/a', '28', '46', '38', '37', '39', '47', '7', '71', '19', '29',
       '018/20', '93', '41', '010/12', '025/a', '11', '35', '73', '49',
       '40', '50', '117', '278', '77', '045/a', '83', '90', '371', '60',
       '153', '030', '004/7', '132', '143', '170', '52', '56', '87', '45',
       '102', '134', '76', '81', '012', '005/7', '', '23', '021/3', '85',
       '018/a', '42', '66', '67', '108', '88', '060/62', '12/13', '120',
       '139', '013/15', '300', '89'], dtype=object)

In [33]:
data['Civico'] = data['Civico'].str.lstrip('0')
data['Civico'].unique()

array(['8', '24', '18', '26', '2', '33', '3', '15', '1', '48', '64', '9',
       '68', '84', '34', '11/a', '4', '74', '61', '78', '19', '4/6', '69',
       '91', '32', '43', '55', '5', '10', '12', '14', '22', '12/14', '6',
       '31', '25', '13', '17', '20', '16', '125', '30', '121', '79',
       '86/88', '1/a', '6/b', '21', '59', '27', '4/a', '28', '46', '38',
       '37', '39', '47', '7', '71', '29', '18/20', '93', '41', '10/12',
       '25/a', '11', '35', '73', '49', '40', '50', '117', '278', '77',
       '45/a', '83', '90', '371', '60', '153', '4/7', '132', '143', '170',
       '52', '56', '87', '45', '102', '134', '76', '81', '5/7', '', '23',
       '21/3', '85', '18/a', '42', '66', '67', '108', '88', '60/62',
       '12/13', '120', '139', '13/15', '300', '89'], dtype=object)

In [None]:
data.info()
data[data['Municipio'].isna()]
#data[data['Nome via']=='LORENTEGGIO']
#data[data['Nome via']=='STEPHENSON GIORGIO']
#There are non other vie with this name from which to extract the Municipio

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ubicazione                    451 non-null    object 
 1   Tipo via                      451 non-null    object 
 2   Nome via                      451 non-null    object 
 3   Civico                        451 non-null    object 
 4   Codice via                    437 non-null    float64
 5   Municipio                     448 non-null    object 
 6   Camere                        450 non-null    float64
 7   Camere piano                  345 non-null    object 
 8   Categoria                     444 non-null    object 
 9   Insegna                       441 non-null    object 
 10  Piani totali                  187 non-null    float64
 11  Piano piano                   199 non-null    object 
 12  Posti letto                   450 non-null    float64
 13  Posti

Unnamed: 0,Ubicazione,Tipo via,Nome via,Civico,Codice via,Municipio,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
355,VIA STEPHENSON GIORGIO Int. 55,VIA,STEPHENSON GIORGIO,,,,256.0,,4,,,,512.0,,Albergo


In [41]:
data['Categoria'].unique()
data=data.rename(columns={'Categoria':'Stelle'})


In [None]:
data['Tipo attività struture extra'].unique()
data=data.rename(columns={'Tipo attività struture extra':'Tipo struttura'})
data.loc[data['Tipo struttura']=='albergo', "Tipo struttura"] = "Albergo"
data['Tipo struttura'].unique()

array(['Albergo', 'Residence', nan], dtype=object)

In [48]:
data[data['Tipo struttura'].isna()]
condition = data["Insegna"].str.contains("hotel", case=False, na=False)
data.loc[condition, "Tipo struttura"] = "Albergo"
data[data['Tipo struttura'].isna()]

Unnamed: 0,Ubicazione,Tipo via,Nome via,Civico,Codice via,Municipio,Camere,Camere piano,Stelle,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo struttura
182,VIA HAJECH CAMILLO N. 18 (z.d. 4),VIA,HAJECH CAMILLO,18,3091.0,4,11.0,0.0,2.0,la caravella,,,16.0,0.0,
322,VIA SANTA RADEGONDA N. 14 (z.d. 1),VIA,SANTA RADEGONDA,14,,1,,,,,,,,,


## Error detection and correction

## Data duplication