# **1. DATA QUALITY ASSESSMENT**

Import libraries:

In [54]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

Import data:

In [86]:
STRUTTURE = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv',sep=';',encoding='unicode_escape')
STRUTTURE

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo
447,VLE TUNISIA N. 9 (z.d. 3),VLE,TUNISIA,9.0,2121.0,3.0,50.0,3;6;9;9;9,4,st. george hotel,,A;1;2;3;4,99.0,5;16;26;26;26,Albergo
448,VLE VITTORIO VENETO N. 30 (z.d. 2),VLE,VITTORIO VENETO,30.0,2107.0,2.0,15.0,,2,hotel casa mia,,,25.0,,Albergo
449,VLE ZARA N. 1 (z.d. 9),VLE,ZARA,1.0,1170.0,9.0,32.0,,4,casa albergo residence zara lagosta,,,64.0,,Residence


Basic operation to inspect data:

In [56]:
#number of tuples and columns of the data source
STRUTTURE.shape

(451, 15)

In [57]:
#show the schema of the data source
STRUTTURE.columns
for c in STRUTTURE.columns:
  print(c)


Ubicazione
Tipo via
Descrizione via
Civico
Codice via
ZD
Camere
Camere piano
Categoria
Insegna
Piani totali
Piano piano
Posti letto
Posti letto per piano
Tipo attività struture extra


In [58]:
#show the first 5 tuples of the data source
#BEERS.head(5)
#head(K) shows the first K lines of the data source

In [59]:
#for each attribute the system shows the type of data. The type of data is defined analyzing the values
STRUTTURE.dtypes

Ubicazione                       object
Tipo via                         object
Descrizione via                  object
Civico                          float64
Codice via                      float64
ZD                              float64
Camere                          float64
Camere piano                     object
Categoria                        object
Insegna                          object
Piani totali                    float64
Piano piano                      object
Posti letto                     float64
Posti letto per piano            object
Tipo attività struture extra     object
dtype: object

In [60]:
#unique display the list of distinct values in a column
#BEERS['brewery_id']

In [61]:
#nunique counts the number of distinct values
for c in STRUTTURE.columns:
  print(f"{c}: {STRUTTURE[c].nunique()}")
  
#BEERS['brewery_id'].unique()
#BEERS['brewery_id'].nunique() #for numbers

Ubicazione: 438
Tipo via: 8
Descrizione via: 300
Civico: 90
Codice via: 302
ZD: 9
Camere: 148
Camere piano: 207
Categoria: 8
Insegna: 437
Piani totali: 11
Piano piano: 58
Posti letto: 196
Posti letto per piano: 219
Tipo attività struture extra: 3


In [62]:
#value_counts() returns an object containing counts for each unique value

#BEERS['brewery_id'].value_counts()
#BEERS['brewery_id'].value_counts().value_counts()  #<- to see uniformity, distribution

In [63]:
#here we want to inspect how many unique values have the same count


**DUPLICATION**

Duplication occurs when a real-world entity is stored twice or more in a data source.

*Definition*: A measure of unwanted duplication existing within a data set.

*Evaluation*: Number of duplicates

In [64]:
#duplicated returns a boolean Series denoting the duplicate rows (exact matching)
STRUTTURE.duplicated()

#any shows if duplicates exist
STRUTTURE.duplicated().any()
#STRUTTURE[STRUTTURE.duplicated()]

False

**COMPLETENESS**

The completeness of a table characterizes the extent to which a table represents the corresponding real world.

Completeness in the relational model can be characterized by the presence of null values. In a model with null values, the presence of a null value has the general meaning of a missing value, i.e., a value that exist in the real-world but it is not available.

*Definition*: The degree to which a given data collection includes the data describing the corresponding set of real-world objects.

*Evaluation*: Number of not null values / Total number of values

In [65]:
#isnull() shows which values are null

STRUTTURE.isnull()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
1,False,True,True,True,True,True,False,False,False,False,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
447,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
448,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False
449,False,False,False,False,False,False,False,True,False,False,True,True,False,True,False


In [66]:
#display the number of not null values for each column
STRUTTURE.count()

Ubicazione                      451
Tipo via                        437
Descrizione via                 437
Civico                          421
Codice via                      437
ZD                              437
Camere                          450
Camere piano                    345
Categoria                       444
Insegna                         441
Piani totali                    187
Piano piano                     199
Posti letto                     450
Posti letto per piano           345
Tipo attività struture extra    441
dtype: int64

In [67]:
#total number of not null values
NOT_NULL = STRUTTURE.count().sum()

In [68]:
#display the number of null values for each column
STRUTTURE.isnull().sum()

Ubicazione                        0
Tipo via                         14
Descrizione via                  14
Civico                           30
Codice via                       14
ZD                               14
Camere                            1
Camere piano                    106
Categoria                         7
Insegna                          10
Piani totali                    264
Piano piano                     252
Posti letto                       1
Posti letto per piano           106
Tipo attività struture extra     10
dtype: int64

In [69]:
#total number of null values
NULL = STRUTTURE.isnull().sum().sum()
NULL

843

In [70]:
#total number of cells
TOT = NOT_NULL + NULL
TOT   #ok (451*15)

6765

COMPLETENESS EVALUATION:

In [71]:
COMPLETENESS = '{:,.2%}'.format(NOT_NULL/TOT)
COMPLETENESS

'87.54%'

Dealing with missing values with a different format:

In [72]:
MISSING = ['--', 'na', 'n.a.', 'N/A', 'NA', 'NaN', 'nan', 'null', 'Null', 'NULL'] #just to test

PROPERTY = pd.read_csv('./Comune-di-Milano-Strutture-ricettive-alberghiere.csv', sep=';',encoding='unicode_escape', na_values = MISSING)
PROPERTY
print(PROPERTY.isnull().sum().sum()) #<- result same as before
#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv', na_values = MISSING)

843


In [73]:
#we added to the set of missing values also 'na' and '--'

#PROPERTY = pd.read_csv('https://raw.githubusercontent.com/camillasancricca/DATADIQ/master/PROPERTY.csv') #, na_values = MISSING)

**ACCURACY**

*Definition*: The extent to which data are correct, reliable and certified.

Syntactic Accuracy is the closeness of a value v to the elements of the corresponding definition domain D.

Semantic Accuracy is defined as the closeness between a data value v and a data value v’.

It is possible to calculate the accuracy of an attribute, i.e., attribute (or column) accuracy, of a relation, i.e., relation accuracy, or of a whole database, i.e., database accuracy.

*Evaluation*: Number of accurate values / Total number of values

*CAN'T BE DONE BECAUSE NO EXTERNAL DATASET (same for TIMELINESS)*

**TIMELINESS**

*Definition*: The extent to which age of the data is appropriate for the task at hand.

Timeliness has two components: currency and volatility. Currency is a measure of how old the information is, based on how long ago it was recorded. Volatility is a measure of information instability/the frequency of change of the value for an entity attribute.
Currency = Age + (Delivery Time - Input Time)

*Evaluation*: Max(0, 1 - Currency/Volatility)

**CONSISTENCY**

The consistency dimension captures the violation of semantic rules defined over (a set of) data items, where items can be tuples of relational tables or records in a file.

Semantic rules can be integrity constaints, data edits or business rules.

*Definition*: The satisfaction of semantic rules defined over a set of data items.

*Evaluation*: Number of consistent tuples / Total number of tuples

In [74]:
STRUTTURE_COPY = STRUTTURE.copy()
STRUTTURE_COPY.head()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo


In [75]:
#rules:
#       - 1)Camere = somma Camere piano
#       - 2)Piani totali = somma Piano piano
#       - 3)Posti letto = somma Posti letto per piano
#       - 4)Camere <= (minore/uguale) Posti letto

#STRUTTURE_COPY['Camere piano'] = STRUTTURE_COPY['Camere piano'].fillna(0)
sum_Camere = STRUTTURE_COPY['Camere piano'].apply(lambda x: sum(int(n) for n in str(x).split(';') if n.strip().isdigit()))
STRUTTURE_COPY['Check_sum_camere'] = np.where(STRUTTURE_COPY['Camere'] == sum_Camere, 1, 0)

#STRUTTURE_COPY['Piano piano'] = STRUTTURE_COPY['Piano piano'].fillna(0)
sum_Piano = STRUTTURE_COPY['Piano piano'].apply(lambda x: len([n for n in str(x).split(';') if n.strip() != '' and pd.notna(n)]))
STRUTTURE_COPY['Check_sum_piani'] = np.where(STRUTTURE_COPY['Piani totali'] == sum_Piano, 1, 0)

#STRUTTURE_COPY['Posti letto per piano'] = STRUTTURE_COPY['Posti letto per piano'].fillna(0)
sum_Posti_letto_piano = STRUTTURE_COPY['Posti letto per piano'].apply(lambda x: sum(int(n) for n in str(x).split(';') if n.strip().isdigit()))
STRUTTURE_COPY['Check_sum_posti_letto'] = np.where(STRUTTURE_COPY['Posti letto'] == sum_Posti_letto_piano, 1, 0)

STRUTTURE_COPY['Check_camere_minore_letti'] = np.where(sum_Camere <= sum_Posti_letto_piano, 1, 0)

STRUTTURE_COPY.head()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Check_sum_camere,Check_sum_piani,Check_sum_posti_letto,Check_camere_minore_letti
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo,0,0,0,1
1,codvia 0000 num.024 ; (),,,,,,259.0,259,4,radisson blu hotel milan,,,518.0,518,Albergo,1,0,1,1
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo,1,1,1,1
3,CSO BUENOS AIRES N. 26 (z.d. 3),CSO,BUENOS AIRES,26.0,2129.0,3.0,25.0,,3,hotel buenos aires,,,39.0,,Albergo,0,0,0,1
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo,0,1,0,1


In [76]:

PROPERTY_count = STRUTTURE_COPY[((STRUTTURE_COPY['Camere'].notna())& 
                                (STRUTTURE_COPY['Piani totali'].notna())&
                                (STRUTTURE_COPY['Posti letto'].notna())&
                                (STRUTTURE_COPY['Camere piano'].notna())&
                                (STRUTTURE_COPY['Piano piano'].notna())&
                                (STRUTTURE_COPY['Posti letto per piano'].notna()))]
PROPERTY_count

#PROPERTY_count.isnull().sum()

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra,Check_sum_camere,Check_sum_piani,Check_sum_posti_letto,Check_camere_minore_letti
2,CSO BUENOS AIRES N. 18 (z.d. 3),CSO,BUENOS AIRES,18.0,2129.0,3.0,16.0,16,1,hotel aurora,1.0,1,25.0,25,Albergo,1,1,1,1
4,CSO BUENOS AIRES N. 2 (z.d. 3),CSO,BUENOS AIRES,2.0,2129.0,3.0,46.0,15;11;8,3,albergo fenice,4.0,1;2;3;4,98.0,24;19;13,Albergo,0,1,0,1
6,CSO BUENOS AIRES N. 3 (z.d. 3),CSO,BUENOS AIRES,3.0,2129.0,3.0,116.0,4;23;24;24,4,cristoforo colombo,4.0,2;3;4;5,191.0,5;38;40;40,Albergo,0,1,0,1
12,CSO EUROPA N. 9 (z.d. 1),CSO,EUROPA,9.0,300.0,1.0,89.0,12;12;12;12;12;11;5,4,hotel galileo,7.0,1;2;3;4;5;6;7,159.0,22;22;22;22;22;21;6,Albergo,0,1,0,1
14,CSO GARIBALDI GIUSEPPE N. 84 (z.d. 1),CSO,GARIBALDI GIUSEPPE,84.0,1010.0,1.0,98.0,14;14;14;14;14;14;14,4,carlyle brera hotel,7.0,1;2;3;4;5;6;7,203.0,29;29;29;29;29;29;29,Albergo,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,VLE STURZO DON LUIGI N. 45 (z.d. 9),VLE,STURZO DON LUIGI,45.0,1704.0,9.0,420.0,70,4,atahotel executive,6.0,1;2;3;4;5;6,792.0,140,Albergo,0,1,0,1
442,VLE SUZZANI GIOVANNI N. 13 (z.d. 9),VLE,SUZZANI GIOVANNI,13.0,1446.0,9.0,172.0,30;11,4,novotel milano nord,7.0,1;2;3;4;5;6;7,344.0,60;22,Albergo,0,1,0,1
443,VLE SUZZANI GIOVANNI num.013/15 ; (z.d. 9),,,,,,131.0,33,3,hotel ibis milano ca granda,4.0,1;2;3;4,262.0,66,Albergo,0,1,0,1
446,VLE TUNISIA N. 6 (z.d. 3),VLE,TUNISIA,6.0,2121.0,3.0,13.0,16,1,hotel kennedy,1.0,6,23.0,23,Albergo,0,1,1,1


In [77]:
#count the number of consistent tuples considering the rule
CONSISTENT1 = sum(PROPERTY_count['Check_sum_camere'] == 1)
CONSISTENT2 = sum(PROPERTY_count['Check_sum_piani'] == 1)
CONSISTENT3 = sum(PROPERTY_count['Check_sum_posti_letto'] == 1)
CONSISTENT4 = sum(PROPERTY_count['Check_camere_minore_letti'] == 1)

CONSISTENCY1 = '{:,.2%}'.format((CONSISTENT1 / PROPERTY_count['Check_sum_camere'].count()))
CONSISTENCY2 = '{:,.2%}'.format((CONSISTENT2 / PROPERTY_count['Check_sum_piani'].count()))
CONSISTENCY3 = '{:,.2%}'.format((CONSISTENT3 / PROPERTY_count['Check_sum_posti_letto'].count()))
CONSISTENCY4 = '{:,.2%}'.format((CONSISTENT4 / PROPERTY_count['Check_camere_minore_letti'].count()))

print("Consistency Check 1 (Camere = somma Camere piano): ", CONSISTENCY1)
print("Consistency Check 2 (Piani totali = somma Piano piano): ", CONSISTENCY2)
print("Consistency Check 3 (Posti letto = somma Posti letto per piano): ", CONSISTENCY3)
print("Consistency Check 4 (Camere <= Posti letto): ", CONSISTENCY4)    


Consistency Check 1 (Camere = somma Camere piano):  44.05%
Consistency Check 2 (Piani totali = somma Piano piano):  94.64%
Consistency Check 3 (Posti letto = somma Posti letto per piano):  33.33%
Consistency Check 4 (Camere <= Posti letto):  99.40%


# **2. DATA PROFILING**

In [78]:
!pip install lux-api plotly matplotlib



**SINGLE COLUMN ANALYSIS**

**Cardinalities**

Cardinalities are numbers that summarize simple metadata (*e.g.,* number of rows, attributes, null values, distinct values, Uniqueness and Distinctness).

*Cardinality* = count of the number of distinct actual values.

*Uniqueness* = percentage calculated as Cardinality divided by the total number of records.

*Actual* = count of the number of records with an actual value (*i.e.,* not-null).

*Distinctness* = percentage calculated as Cardinality divided by Actual.

In [79]:
import sys
!{sys.executable} -m pip install -U ydata-profiling[notebook]



In [80]:
!pip install jupyter-contrib-nbextensions



In [81]:
!jupyter-contrib-nbextension enable --py widgetsnbextension

Traceback (most recent call last):
  File "C:\Users\Utente\anaconda3\Lib\site-packages\jupyter_contrib_core\notebook_compat\nbextensions.py", line 6, in <module>
    from notebook.extensions import BaseExtensionApp
ModuleNotFoundError: No module named 'notebook.extensions'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Utente\anaconda3\Lib\site-packages\jupyter_contrib_core\notebook_compat\nbextensions.py", line 10, in <module>
    from notebook.nbextensions import BaseNBExtensionApp
ModuleNotFoundError: No module named 'notebook.nbextensions'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Utente\anaconda3\Scripts\jupyter-contrib-nbextension.exe\__main__.py", line 4, in <module>
  File "C:\Users\Utente\anaconda3\Lib\site-packages\jupyt

In [82]:
!pip install dataprofiler



In [83]:
!pip install dataprofiler[ml] --user



In [84]:
#...
from ydata_profiling import ProfileReport
import pandas as pd
import json
from dataprofiler import Data, Profiler

profile = ProfileReport(STRUTTURE, title="STRUTTURE_Report")
#in html
profile.to_file("STRUTTURE_Report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:00<00:00, 54.59it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [85]:
readable_report = profile.report(report_options={"output_format": "compact"})
readable_report        #reported without the visualization info, more readable

TypeError: 'Root' object is not callable

# **3. ASSOCIATION RULES MINING**

Personal hp: no associations

# **4. DATA WRANGLING**

1. Column Renaiming:

In [89]:
STRUTTURE.head(1)

Unnamed: 0,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Insegna,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
0,ALZ NAVIGLIO GRANDE N. 8 (z.d. 6),ALZ,NAVIGLIO GRANDE,8.0,5144.0,6.0,14.0,,4,HOTEL MAISON BORELLA,,,25.0,,Albergo


In [None]:
STRUTTURE = STRUTTURE.rename({'Camere':'Camere tot',
                              'Camere piano':'Camere per piano',
                              'Piani totali':'Piani tot',
                              'Piano piano':'Elenco piani',
                              'Posti letto':'Posti letto tot'})

2. Sorting:

In [92]:
#before put the name at the beginning:
col = 'Insegna'  # column to be moved
cols = [col] + [c for c in STRUTTURE.columns if c != col]
STRUTTURE = STRUTTURE[cols]

#then sort:
STRUTTURE = STRUTTURE.sort_values(by ='Insegna',ascending=True)
STRUTTURE.head()

Unnamed: 0,Insegna,Ubicazione,Tipo via,Descrizione via,Civico,Codice via,ZD,Camere,Camere piano,Categoria,Piani totali,Piano piano,Posti letto,Posti letto per piano,Tipo attività struture extra
218,ARMANI HOTEL MILANO,VIA MANZONI ALESSANDRO N. 31 (z.d. 1),VIA,MANZONI ALESSANDRO,31.0,230.0,1.0,95.0,,5 STELLE LUSSO,,,190.0,,Albergo
144,BIO CITY HOTEL,VIA EDOLO N. 18 (z.d. 2),VIA,EDOLO,18.0,1217.0,2.0,17.0,,1,,,32.0,,Albergo
5,GALAXY G SRL,CSO BUENOS AIRES N. 33 (z.d. 3),CSO,BUENOS AIRES,33.0,2129.0,3.0,65.0,0.0,4,,,97.0,0.0,Albergo
257,HOTEL ADAM,VIA PALMANOVA N. 153 (z.d. 2),VIA,PALMANOVA,153.0,2390.0,2.0,59.0,,,,,99.0,,Albergo
301,HOTEL ADLER,VIA RICORDI GIOVANNI N. 10 (z.d. 3),VIA,RICORDI GIOVANNI,10.0,2251.0,3.0,23.0,,,,,47.0,,Albergo


3. Column filtering: _not done_
4. Dropping:

In [152]:
#Before drop 'Ubicazione' fill the values of 'Tipo via' 'Descrizione via' 'Civico' 'Codice via' 'ZD'
# download fixed by Sveva

STRUTTURE2 = pd.read_csv('Dataset_indirizzi_sistemati.csv')
STRUTTURE2 = STRUTTURE2.drop(["Unnamed: 0"], axis = 1)
STRUTTURE2.head()

#added rename and sorting part:
STRUTTURE2 = STRUTTURE2.rename(columns={'Camere':'Camere tot',
                              'Camere piano':'Camere per piano',
                              'Piani totali':'Piani tot',
                              'Piano piano':'Elenco piani',
                              'Posti letto':'Posti letto tot'})

#before put the name at the beginning:
col = 'Insegna'  # la colonna da spostare
cols = [col] + [c for c in STRUTTURE2.columns if c != col]
STRUTTURE2 = STRUTTURE2[cols]

STRUTTURE2 = STRUTTURE2.sort_values(by ='Insegna',ascending=True)
STRUTTURE2.head()

Unnamed: 0,Insegna,Ubicazione,Tipo via,Nome via,Civico,Codice via,Municipio,Camere tot,Camere per piano,Stelle,Piani tot,Elenco piani,Posti letto tot,Posti letto per piano,Tipo struttura
218,ARMANI HOTEL MILANO,VIA MANZONI ALESSANDRO N. 31 (z.d. 1),VIA,MANZONI ALESSANDRO,31,230.0,1.0,95.0,,5 STELLE LUSSO,,,190.0,,Albergo
144,BIO CITY HOTEL,VIA EDOLO N. 18 (z.d. 2),VIA,EDOLO,18,1217.0,2.0,17.0,,1,,,32.0,,Albergo
5,GALAXY G SRL,CSO BUENOS AIRES N. 33 (z.d. 3),CSO,BUENOS AIRES,33,2129.0,3.0,65.0,0.0,4,,,97.0,0.0,Albergo
257,HOTEL ADAM,VIA PALMANOVA N. 153 (z.d. 2),VIA,PALMANOVA,153,2390.0,2.0,59.0,,,,,99.0,,Albergo
301,HOTEL ADLER,VIA RICORDI GIOVANNI N. 10 (z.d. 3),VIA,RICORDI GIOVANNI,10,2251.0,3.0,23.0,,,,,47.0,,Albergo


In [153]:
#drop 'Ubicazione'
STRUTTURE2 = STRUTTURE2.drop(columns=['Ubicazione'])
STRUTTURE2.head()

Unnamed: 0,Insegna,Tipo via,Nome via,Civico,Codice via,Municipio,Camere tot,Camere per piano,Stelle,Piani tot,Elenco piani,Posti letto tot,Posti letto per piano,Tipo struttura
218,ARMANI HOTEL MILANO,VIA,MANZONI ALESSANDRO,31,230.0,1.0,95.0,,5 STELLE LUSSO,,,190.0,,Albergo
144,BIO CITY HOTEL,VIA,EDOLO,18,1217.0,2.0,17.0,,1,,,32.0,,Albergo
5,GALAXY G SRL,CSO,BUENOS AIRES,33,2129.0,3.0,65.0,0.0,4,,,97.0,0.0,Albergo
257,HOTEL ADAM,VIA,PALMANOVA,153,2390.0,2.0,59.0,,,,,99.0,,Albergo
301,HOTEL ADLER,VIA,RICORDI GIOVANNI,10,2251.0,3.0,23.0,,,,,47.0,,Albergo


5. Standardization: _nothing_
6. Column Splitting: _nothing_
7. Column Merging:

In [154]:
STRUTTURE2['Via'] = STRUTTURE2['Tipo via'].str.cat(STRUTTURE2['Nome via'], sep=' ')
col = 'Via'  # column to be moved
cols = ['Insegna'] + [col] + [c for c in STRUTTURE2.columns if c != col and c != 'Insegna']
STRUTTURE2 = STRUTTURE2[cols]

STRUTTURE2 = STRUTTURE2.drop(columns=['Tipo via','Nome via'])
STRUTTURE2.head()

Unnamed: 0,Insegna,Via,Civico,Codice via,Municipio,Camere tot,Camere per piano,Stelle,Piani tot,Elenco piani,Posti letto tot,Posti letto per piano,Tipo struttura
218,ARMANI HOTEL MILANO,VIA MANZONI ALESSANDRO,31,230.0,1.0,95.0,,5 STELLE LUSSO,,,190.0,,Albergo
144,BIO CITY HOTEL,VIA EDOLO,18,1217.0,2.0,17.0,,1,,,32.0,,Albergo
5,GALAXY G SRL,CSO BUENOS AIRES,33,2129.0,3.0,65.0,0.0,4,,,97.0,0.0,Albergo
257,HOTEL ADAM,VIA PALMANOVA,153,2390.0,2.0,59.0,,,,,99.0,,Albergo
301,HOTEL ADLER,VIA RICORDI GIOVANNI,10,2251.0,3.0,23.0,,,,,47.0,,Albergo


8. Dealing with missing numbers: _to be done later_
9. Formatting values:

In [158]:
# select float columns
float_cols = [col for col in STRUTTURE2.select_dtypes(include="float64").columns if col != "Civico"]

# apply format mapping (in this case change of format)
STRUTTURE2[float_cols] = STRUTTURE2[float_cols].astype("Int64")

STRUTTURE2['Insegna'] = STRUTTURE2['Insegna'].str.upper()
STRUTTURE2['Via'] = STRUTTURE2['Via'].str.upper()
STRUTTURE2['Tipo struttura'] = STRUTTURE2['Tipo struttura'].str.upper()
STRUTTURE2


Unnamed: 0,Insegna,Via,Civico,Codice via,Municipio,Camere tot,Camere per piano,Stelle,Piani tot,Elenco piani,Posti letto tot,Posti letto per piano,Tipo struttura
218,ARMANI HOTEL MILANO,VIA MANZONI ALESSANDRO,31,230,1,95,,5 STELLE LUSSO,,,190,,ALBERGO
144,BIO CITY HOTEL,VIA EDOLO,18,1217,2,17,,1,,,32,,ALBERGO
5,GALAXY G SRL,CSO BUENOS AIRES,33,2129,3,65,0,4,,,97,0,ALBERGO
257,HOTEL ADAM,VIA PALMANOVA,153,2390,2,59,,,,,99,,ALBERGO
301,HOTEL ADLER,VIA RICORDI GIOVANNI,10,2251,3,23,,,,,47,,ALBERGO
...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,,VIA ORSEOLO PIETRO,1,5113,6,59,,4,,,101,,ALBERGO
322,,VIA SANTA RADEGONDA,14,,1,,,,,,,,
326,,VIA SAN TOMASO,8,723,1,11,,4,,,22,,ALBERGO
355,,VIA STEPHENSON GIORGIO,55,,,256,,4,,,512,,ALBERGO


In [162]:
#Save to file
STRUTTURE2.to_csv('Dataset_strutture.csv')

In [159]:
STRUTTURE2.dtypes

Insegna                  object
Via                      object
Civico                   object
Codice via                Int64
Municipio                 Int64
Camere tot                Int64
Camere per piano         object
Stelle                   object
Piani tot                 Int64
Elenco piani             object
Posti letto tot           Int64
Posti letto per piano    object
Tipo struttura           object
dtype: object

In [160]:
STRUTTURE2.isnull().sum()

Insegna                   10
Via                        0
Civico                     0
Codice via                14
Municipio                  3
Camere tot                 1
Camere per piano         106
Stelle                     7
Piani tot                264
Elenco piani             252
Posti letto tot            1
Posti letto per piano    106
Tipo struttura             2
dtype: int64