# **1. DATA QUALITY ASSESSMENT**

Import libraries:

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

Import data:

In [13]:
STRUTTURE = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv',sep=';',encoding='unicode_escape')
STRUTTURE

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo
447,VLE TUNISIA N. 9 (z.d. 3),VLE,TUNISIA,9.0,2121.0,3.0,50.0,3;6;9;9;9,4,st. george hotel,,A;1;2;3;4,99.0,5;16;26;26;26,Albergo
448,VLE VITTORIO VENETO N. 30 (z.d. 2),VLE,VITTORIO VENETO,30.0,2107.0,2.0,15.0,,2,hotel casa mia,,,25.0,,Albergo
449,VLE ZARA N. 1 (z.d. 9),VLE,ZARA,1.0,1170.0,9.0,32.0,,4,casa albergo residence zara lagosta,,,64.0,,Residence


Basic operation to inspect data:

In [None]:
#number of tuples and columns of the data source
STRUTTURE.shape

(451, 15)

In [None]:
#show the schema of the data source
STRUTTURE.columns
for c in STRUTTURE.columns:
  print(c)


Ubicazione
Tipo via
Descrizione via
Civico
Codice via
ZD
Camere
Camere piano
Categoria
Insegna
Piani totali
Piano piano
Posti letto
Posti letto per piano
Tipo attività struture extra


In [None]:
#show the first 5 tuples of the data source
#BEERS.head(5)
#head(K) shows the first K lines of the data source

In [None]:
#for each attribute the system shows the type of data. The type of data is defined analyzing the values
STRUTTURE.dtypes

Ubicazione                       object
Tipo via                         object
Descrizione via                  object
Civico                          float64
Codice via                      float64
ZD                              float64
Camere                          float64
Camere piano                     object
Categoria                        object
Insegna                          object
Piani totali                    float64
Piano piano                      object
Posti letto                     float64
Posti letto per piano            object
Tipo attività struture extra     object
dtype: object

In [None]:
#unique display the list of distinct values in a column
#BEERS['brewery_id']

In [4]:
#nunique counts the number of distinct values
for c in STRUTTURE.columns:
  print(f"{c}: {STRUTTURE[c].nunique()}")
  
#BEERS['brewery_id'].unique()
#BEERS['brewery_id'].nunique() #for numbers

Ubicazione: 438
Tipo via: 8
Descrizione via: 300
Civico: 90
Codice via: 302
ZD: 9
Camere: 148
Camere piano: 207
Categoria: 8
Insegna: 437
Piani totali: 11
Piano piano: 58
Posti letto: 196
Posti letto per piano: 219
Tipo attività struture extra: 3


In [None]:
#value_counts() returns an object containing counts for each unique value

#BEERS['brewery_id'].value_counts()
#BEERS['brewery_id'].value_counts().value_counts()  #<- to see uniformity, distribution

In [None]:
#here we want to inspect how many unique values have the same count


**DUPLICATION**

Duplication occurs when a real-world entity is stored twice or more in a data source.

*Definition*: A measure of unwanted duplication existing within a data set.

*Evaluation*: Number of duplicates

In [None]:
#duplicated returns a boolean Series denoting the duplicate rows (exact matching)
STRUTTURE.duplicated()

#any shows if duplicates exist
STRUTTURE.duplicated().any()
#STRUTTURE[STRUTTURE.duplicated()]

False

**COMPLETENESS**

The completeness of a table characterizes the extent to which a table represents the corresponding real world.

Completeness in the relational model can be characterized by the presence of null values. In a model with null values, the presence of a null value has the general meaning of a missing value, i.e., a value that exist in the real-world but it is not available.

*Definition*: The degree to which a given data collection includes the data describing the corresponding set of real-world objects.

*Evaluation*: Number of not null values / Total number of values

In [None]:
#isnull() shows which values are null

STRUTTURE.isnull()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
1,False,True,True,True,True,True,False,False,False,False,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
447,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
448,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
449,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False


In [None]:
#display the number of not null values for each column
STRUTTURE.count()

Ubicazione                      451
Tipo via                        437
Descrizione via                 437
Civico                          421
Codice via                      437
ZD                              437
Camere                          450
Camere piano                    345
Categoria                       444
Insegna                         441
Piani totali                    187
Piano piano                     199
Posti letto                     450
Posti letto per piano           345
Tipo attività struture extra    441
dtype: int64

In [None]:
#total number of not null values
NOT_NULL = STRUTTURE.count().sum()

In [16]:
#display the number of null values for each column
STRUTTURE.isnull().sum()

Ubicazione                        0
Tipo via                         14
Descrizione via                  14
Civico                           30
Codice via                       14
ZD                               14
Camere                            1
Camere piano                    106
Categoria                         7
Insegna                          10
Piani totali                    264
Piano piano                     252
Posti letto                       1
Posti letto per piano           106
Tipo attività struture extra     10
dtype: int64

In [None]:
#total number of null values
NULL = STRUTTURE.isnull().sum().sum()
NULL

843

In [None]:
#total number of cells
TOT = NOT_NULL + NULL
TOT   #ok (451*15)

6765

COMPLETENESS EVALUATION:

In [23]:
COMPLETENESS = '{:,.2%}'.format(NOT_NULL/TOT)
COMPLETENESS

'87.54%'

Dealing with missing values with a different format:

In [None]:
MISSING = ['--', 'na', 'n.a.', 'N/A', 'NA', 'NaN', 'nan', 'null', 'Null', 'NULL'] #just to test

PROPERTY = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv', sep=';',encoding='unicode_escape', na_values = MISSING)
PROPERTY
print(PROPERTY.isnull().sum().sum()) #<- result same as before
#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv', na_values = MISSING)

843


In [None]:
#we added to the set of missing values also 'na' and '--'

#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv') #, na_values = MISSING)

**ACCURACY**

*Definition*: The extent to which data are correct, reliable and certified.

Syntactic Accuracy is the closeness of a value v to the elements of the corresponding definition domain D.

Semantic Accuracy is defined as the closeness between a data value v and a data value v’.

It is possible to calculate the accuracy of an attribute, i.e., attribute (or column) accuracy, of a relation, i.e., relation accuracy, or of a whole database, i.e., database accuracy.

*Evaluation*: Number of accurate values / Total number of values

*CAN'T BE DONE BECAUSE NO EXTERNAL DATASET (same for TIMELINESS)*

**TIMELINESS**

*Definition*: The extent to which age of the data is appropriate for the task at hand.

Timeliness has two components: currency and volatility. Currency is a measure of how old the information is, based on how long ago it was recorded. Volatility is a measure of information instability/the frequency of change of the value for an entity attribute.
Currency = Age + (Delivery Time - Input Time)

*Evaluation*: Max(0, 1 - Currency/Volatility)

**CONSISTENCY**

The consistency dimension captures the violation of semantic rules defined over (a set of) data items, where items can be tuples of relational tables or records in a file.

Semantic rules can be integrity constaints, data edits or business rules.

*Definition*: The satisfaction of semantic rules defined over a set of data items.

*Evaluation*: Number of consistent tuples / Total number of tuples

In [None]:
#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv')

In [32]:
STRUTTURE_COPY = STRUTTURE.copy()
STRUTTURE_COPY.head()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo


In [None]:
#rules:
#       - 1)Camere = somma Camere piano
#       - 2)Piani totali = somma Piano piano
#       - 3)Posti letto = somma Posti letto per piano
#       - 4)Camere <= (minore/uguale) Posti letto

STRUTTURE_COPY['Camere piano'] = STRUTTURE_COPY['Camere piano'].fillna(0)
sum_Camere = STRUTTURE_COPY['Camere piano'].apply(lambda x: sum(int(n) for n in str(x).split(';') if n.strip().isdigit()))
STRUTTURE_COPY['Check_sum_camere'] = np.where(STRUTTURE_COPY['Camere'] == sum_Camere, 1, 0)

STRUTTURE_COPY['Piano piano'] = STRUTTURE_COPY['Piano piano'].fillna(0)
sum_Piano = STRUTTURE_COPY['Piano piano'].apply(lambda x: sum(int(n) for n in str(x).split(';') if n.strip().isdigit()))
STRUTTURE_COPY['Check_sum_piani'] = np.where(STRUTTURE_COPY['Piani totali'] == sum_Piano, 1, 0)

STRUTTURE_COPY['Posti letto per piano'] = STRUTTURE_COPY['Posti letto per piano'].fillna(0)
sum_Posti_letto_piano = STRUTTURE_COPY['Posti letto per piano'].apply(lambda x: sum(int(n) for n in str(x).split(';') if n.strip().isdigit()))
STRUTTURE_COPY['Check_sum_posti_letto'] = np.where(STRUTTURE_COPY['Posti letto'] == sum_Posti_letto_piano, 1, 0)

STRUTTURE_COPY['Check_camere_minore_letti'] = np.where(sum_Camere <= sum_Posti_letto_piano, 1, 0)

STRUTTURE_COPY.head()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Check_sum_camere,Check_sum_piani,Check_sum_posti_letto,Check_camere_minore_letti
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,0,4,HOTEL MAISON BORELLA,,0,25.0,0,Albergo,0,0,0,1
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,0,518.0,518,Albergo,1,0,1,1
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo,1,1,1,1
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,0,3,hotel buenos aires,,0,39.0,0,Albergo,0,0,0,1
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo,0,0,0,1


In [35]:

PROPERTY_count = STRUTTURE_COPY[((STRUTTURE_COPY['Camere'].notna())& (STRUTTURE_COPY['Piani totali'].notna())&(STRUTTURE_COPY['Posti letto'].notna()))]
PROPERTY_count

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Check_sum_camere,Check_sum_piani,Check_sum_posti_letto,Check_camere_minore_letti
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo,1,1,1,1
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo,0,0,0,1
6,CSO BUENOS AIRES N. 3 (z.d. 3),CSO,BUENOS AIRES,3.0,2129.0,3.0,116.0,4;23;24;24,4,cristoforo colombo,4.0,2;3;4;5,191.0,5;38;40;40,Albergo,0,0,0,1
11,CSO DI PORTA ROMANA N. 64 (z.d. 1),CSO,DI PORTA ROMANA,64.0,402.0,1.0,72.0,0,4,hotel romana residence,6.0,0,147.0,0,Residence,0,0,0,1
12,CSO EUROPA N. 9 (z.d. 1),CSO,EUROPA,9.0,300.0,1.0,89.0,12;12;12;12;12;11;5,4,hotel galileo,7.0,1;2;3;4;5;6;7,159.0,22;22;22;22;22;21;6,Albergo,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,VLE SUZZANI GIOVANNI N. 13 (z.d. 9),VLE,SUZZANI GIOVANNI,13.0,1446.0,9.0,172.0,30;11,4,novotel milano nord,7.0,1;2;3;4;5;6;7,344.0,60;22,Albergo,0,0,0,1
443,VLE SUZZANI GIOVANNI num.013/15 ; (z.d. 9),,,,,,131.0,33,3,hotel ibis milano ca granda,4.0,1;2;3;4,262.0,66,Albergo,0,0,0,1
445,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,12.0,0,1,hotel san tomaso,1.0,0,25.0,0,Albergo,0,0,0,1
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo,0,0,1,1


In [None]:
#we define a rule that the number of bathrooms should be lower than the number of bedrooms
#we add the column consistency
#we assign the value 1 if the rule is satisfied, 0 otherwise

#fix the error into NUM_BATH column
#PROPERTY['NUM_BATH'] = pd.to_numeric(PROPERTY['NUM_BATH'], errors='coerce')

#PROPERTY['consistency'] = np.where(PROPERTY['NUM_BATH'] >= PROPERTY['NUM_BEDROOMS'],
#                                   0,
#                                   1)
#PROPERTY.head(10)

In [56]:
#exclude null vales in NUM_BATH and NUM_BEDROOMS
PROPERTY_COUNT = PROPERTY[((PROPERTY['NUM_BATH'].notna()) & (PROPERTY['NUM_BEDROOMS'].notna()))]
PROPERTY_COUNT

Unnamed: 0,ID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT,TS_UPDATE,consistency
0,100001000,104.0,PUTNAM,Y,3,1.0,1000,11/8/2022,1
1,100002000,197.0,LEXINGTON,N,3,1.5,--,22/6/2022,1
4,100005000,203.0,BERKELEY,Y,3,2.0,1600,31/8/2022,1
7,100008000,213.0,TREMONT,Y,1,1.0,70,12/7/2022,0
8,100009000,215.0,TREMONT,Y,na,2.0,1800,25/7/2022,1
9,100010000,216.0,BERELEY,Y,1,3.0,10,27/9/2022,0
10,100011000,10.0,LEINGTON,N,2,1.0,800,21/8/2022,1
11,100012000,213.0,TREMONT,Y,1,1.0,78,12/7/2022,0


In [37]:
#count the number of consistent tuples considering the rule
CONSISTENT1 = sum(PROPERTY_count['Check_sum_camere'] == 1)
CONSISTENT2 = sum(PROPERTY_count['Check_sum_piani'] == 1)
CONSISTENT3 = sum(PROPERTY_count['Check_sum_posti_letto'] == 1)
CONSISTENT4 = sum(PROPERTY_count['Check_camere_minore_letti'] == 1)

COUNT1 = CONSISTENT1 / len(PROPERTY_count)
COUNT2 = CONSISTENT2 / len(PROPERTY_count)
COUNT3 = CONSISTENT3 / len(PROPERTY_count)
COUNT4 = CONSISTENT4 / len(PROPERTY_count)

CONSISTENCY1 = '{:,.2%}'.format(COUNT1)
CONSISTENCY2 = '{:,.2%}'.format(COUNT2)
CONSISTENCY3 = '{:,.2%}'.format(COUNT3)
CONSISTENCY4 = '{:,.2%}'.format(COUNT4)

print("Consistency Check 1 (Camere = somma Camere piano): ", CONSISTENCY1)
print("Consistency Check 2 (Piani totali = somma Piano piano): ", CONSISTENCY2)
print("Consistency Check 3 (Posti letto = somma Posti letto per piano): ", CONSISTENCY3)
print("Consistency Check 4 (Camere <= Posti letto): ", CONSISTENCY4)    

#CONSISTENT = sum(PROPERTY_COUNT['consistency'] == 1)
#CONSISTENT

Consistency Check 1 (Camere = somma Camere piano):  41.18%
Consistency Check 2 (Piani totali = somma Piano piano):  17.65%
Consistency Check 3 (Posti letto = somma Posti letto per piano):  31.02%
Consistency Check 4 (Camere <= Posti letto):  99.47%


In [58]:
#count the total number of tuples in the property dataset
ROWS = PROPERTY.shape[0]
ROWS

12

In [60]:
#count the total number of tuples in the property dataset (excluding null values)
COUNT = PROPERTY_COUNT['consistency'].count()
COUNT

np.int64(8)

CONSISTENCY EVALUATION:

In [61]:
CONSISTENCY = CONSISTENT/COUNT*100
print('Consistency:', CONSISTENCY, '%')

Consistency: 62.5 %
