# **1. DATA QUALITY ASSESSMENT**

Import libraries:

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

Import data:

In [4]:
STRUTTURE = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv',sep=';',encoding='unicode_escape')
STRUTTURE.head()
#BEERS = pd.read_csv("https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/BEERS.csv")
#BEERS

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo


Basic operation to inspect data:

In [5]:
#number of tuples and columns of the data source
STRUTTURE.shape
#BEERS.shape

(451, 15)

In [6]:
#show the schema of the data source
STRUTTURE.columns
for c in STRUTTURE.columns:
  print(c)

#BEERS.columns
#for c in BEERS.columns:
#  print(c)


Ubicazione
Tipo via
Descrizione via
Civico
Codice via
ZD
Camere
Camere piano
Categoria
Insegna
Piani totali
Piano piano
Posti letto
Posti letto per piano
Tipo attività struture extra


In [None]:
#show the first 5 tuples of the data source
#BEERS.head(5)

In [None]:
#head(K) shows the first K lines of the data source


In [7]:
#for each attribute the system shows the type of data. The type of data is defined analyzing the values
STRUTTURE.dtypes
#BEERS.dtypes

Ubicazione                       object
Tipo via                         object
Descrizione via                  object
Civico                          float64
Codice via                      float64
ZD                              float64
Camere                          float64
Camere piano                     object
Categoria                        object
Insegna                          object
Piani totali                    float64
Piano piano                      object
Posti letto                     float64
Posti letto per piano            object
Tipo attività struture extra     object
dtype: object

In [None]:
#unique display the list of distinct values in a column
#BEERS['brewery_id']

In [None]:
#nunique counts the number of distinct values

#BEERS['brewery_id'].unique()
#BEERS['brewery_id'].nunique() #for numbers

In [None]:
#value_counts() returns an object containing counts for each unique value

#BEERS['brewery_id'].value_counts()
#BEERS['brewery_id'].value_counts().value_counts()  #<- to see uniformity, distribution

In [None]:
#here we want to inspect how many unique values have the same count


**DUPLICATION**

Duplication occurs when a real-world entity is stored twice or more in a data source.

*Definition*: A measure of unwanted duplication existing within a data set.

*Evaluation*: Number of duplicates

In [8]:
#duplicated returns a boolean Series denoting the duplicate rows (exact matching)
STRUTTURE.duplicated()
#BEERS.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
446    False
447    False
448    False
449    False
450    False
Length: 451, dtype: bool

In [9]:
#any shows if duplicates exist
STRUTTURE.duplicated().any()
#BEERS.duplicated().any()

False

In [10]:
STRUTTURE[STRUTTURE.duplicated()]

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra


**COMPLETENESS**

The completeness of a table characterizes the extent to which a table represents the corresponding real world.

Completeness in the relational model can be characterized by the presence of null values. In a model with null values, the presence of a null value has the general meaning of a missing value, i.e., a value that exist in the real-world but it is not available.

*Definition*: The degree to which a given data collection includes the data describing the corresponding set of real-world objects.

*Evaluation*: Number of not null values / Total number of values

In [11]:
#isnull() shows which values are null

STRUTTURE.isnull()
#BEERS.isnull()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
1,False,True,True,True,True,True,False,False,False,False,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
447,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
448,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
449,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False


In [None]:
#BEERS['ibu'].isnull()

In [12]:
#display the number of not null values for each column
STRUTTURE.count()
#BEERS.count()

Ubicazione                      451
Tipo via                        437
Descrizione via                 437
Civico                          421
Codice via                      437
ZD                              437
Camere                          450
Camere piano                    345
Categoria                       444
Insegna                         441
Piani totali                    187
Piano piano                     199
Posti letto                     450
Posti letto per piano           345
Tipo attività struture extra    441
dtype: int64

In [13]:
#total number of not null values
NOT_NULL = STRUTTURE.count().sum()
#NOT_NULL = BEERS.count().sum()

In [16]:
#display the number of null values for each column
STRUTTURE.isnull().sum()

Ubicazione                        0
Tipo via                         14
Descrizione via                  14
Civico                           30
Codice via                       14
ZD                               14
Camere                            1
Camere piano                    106
Categoria                         7
Insegna                          10
Piani totali                    264
Piano piano                     252
Posti letto                       1
Posti letto per piano           106
Tipo attività struture extra     10
dtype: int64

In [17]:
#total number of null values
NULL = STRUTTURE.isnull().sum().sum()
NULL
#BEERS.isnull().sum().sum()

843

In [None]:
#total number of cells
TOT = NOT_NULL + NULL
TOT   #ok (451*15)

6765

COMPLETENESS EVALUATION:

In [23]:
COMPLETENESS = '{:,.2%}'.format(NOT_NULL/TOT)
COMPLETENESS

'87.54%'

Dealing with missing values with a different format:

In [None]:
MISSING = ['--', 'na', 'n.a.', 'N/A', 'NA', 'NaN', 'nan', 'null', 'Null', 'NULL'] #just to test

PROPERTY = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv', sep=';',encoding='unicode_escape', na_values = MISSING)
PROPERTY
print(PROPERTY.isnull().sum().sum()) #<- result same as before
#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv', na_values = MISSING)

843


In [None]:
#we added to the set of missing values also 'na' and '--'

#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv') #, na_values = MISSING)

**ACCURACY**

*Definition*: The extent to which data are correct, reliable and certified.

Syntactic Accuracy is the closeness of a value v to the elements of the corresponding definition domain D.

Semantic Accuracy is defined as the closeness between a data value v and a data value v’.

It is possible to calculate the accuracy of an attribute, i.e., attribute (or column) accuracy, of a relation, i.e., relation accuracy, or of a whole database, i.e., database accuracy.

*Evaluation*: Number of accurate values / Total number of values

*CAN'T BE DONE BECAUSE NO EXTERNAL DATASET (same for TIMELINESS)*

**TIMELINESS**

*Definition*: The extent to which age of the data is appropriate for the task at hand.

Timeliness has two components: currency and volatility. Currency is a measure of how old the information is, based on how long ago it was recorded. Volatility is a measure of information instability/the frequency of change of the value for an entity attribute.
Currency = Age + (Delivery Time - Input Time)

*Evaluation*: Max(0, 1 - Currency/Volatility)

**CONSISTENCY**

The consistency dimension captures the violation of semantic rules defined over (a set of) data items, where items can be tuples of relational tables or records in a file.

Semantic rules can be integrity constaints, data edits or business rules.

*Definition*: The satisfaction of semantic rules defined over a set of data items.

*Evaluation*: Number of consistent tuples / Total number of tuples

In [None]:
#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv')

In [None]:
#we define a rule that the number of bathrooms should be lower than the number of bedrooms
#we add the column consistency
#we assign the value 1 if the rule is satisfied, 0 otherwise

#fix the error into NUM_BATH column
PROPERTY['NUM_BATH'] = pd.to_numeric(PROPERTY['NUM_BATH'], errors='coerce')

PROPERTY['consistency'] = np.where(PROPERTY['NUM_BATH'] >= PROPERTY['NUM_BEDROOMS'],
                                   0,
                                   1)
PROPERTY.head(10)

In [56]:
#exclude null vales in NUM_BATH and NUM_BEDROOMS
PROPERTY_COUNT = PROPERTY[((PROPERTY['NUM_BATH'].notna()) & (PROPERTY['NUM_BEDROOMS'].notna()))]
PROPERTY_COUNT

Unnamed: 0,ID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT,TS_UPDATE,consistency
0,100001000,104.0,PUTNAM,Y,3,1.0,1000,11/8/2022,1
1,100002000,197.0,LEXINGTON,N,3,1.5,--,22/6/2022,1
4,100005000,203.0,BERKELEY,Y,3,2.0,1600,31/8/2022,1
7,100008000,213.0,TREMONT,Y,1,1.0,70,12/7/2022,0
8,100009000,215.0,TREMONT,Y,na,2.0,1800,25/7/2022,1
9,100010000,216.0,BERELEY,Y,1,3.0,10,27/9/2022,0
10,100011000,10.0,LEINGTON,N,2,1.0,800,21/8/2022,1
11,100012000,213.0,TREMONT,Y,1,1.0,78,12/7/2022,0


In [57]:
#count the number of consistent tuples considering the rule
CONSISTENT = sum(PROPERTY_COUNT['consistency'] == 1)
CONSISTENT

5

In [58]:
#count the total number of tuples in the property dataset
ROWS = PROPERTY.shape[0]
ROWS

12

In [60]:
#count the total number of tuples in the property dataset (excluding null values)
COUNT = PROPERTY_COUNT['consistency'].count()
COUNT

np.int64(8)

CONSISTENCY EVALUATION:

In [61]:
CONSISTENCY = CONSISTENT/COUNT*100
print('Consistency:', CONSISTENCY, '%')

Consistency: 62.5 %
