In [62]:
import pandas as pd
import numpy as np
from itertools import compress

In [126]:
def tsv_to_dataframe(file_tsv, structure = "normal"):
    '''
    Read a Eurostat TSV file into a wide pandas DataFrame.

    The first column of a Eurostat TSV packs several comma-separated
    dimension codes into a single cell, and its header lists the dimension
    names the same way (e.g. "age,isced11,unit,sex,geo\\time"). That column
    is split into one column per dimension; every other column is converted
    to float.

    Parameters
    ----------
    file_tsv : str or file-like
        Passed straight to ``pandas.read_csv`` with a tab separator. Plain
        ".tsv" files and compressed ".tsv.gz" files both work.
    structure : str, default "normal"
        Not used for the conversion. When it is "inverse" a hint is printed
        pointing to ``tsv_to_dataframe_long``.

    Returns
    -------
    pandas.DataFrame
        One column per dimension (named after the comma-separated header
        parts, unchanged), followed by the value columns as floats. Value
        column labels are kept exactly as they appear in the file. Eurostat
        flags such as "b" or "u" are discarded, and missing values (": ")
        or anything else that cannot be parsed become ``numpy.nan``.
    '''

    if structure == "inverse":
        print("use tsv_to_dataframe_long instead tsv_to_dataframe")

    def clean_cells(x):
        '''Return the numeric part of a Eurostat cell as float, or NaN.'''
        try:
            return float(x)
        except (TypeError, ValueError):
            pass
        try:
            # "857.51 e" -> "857.51"; the missing marker ": " -> ":" -> NaN
            return float(x.split(" ")[0])
        except (AttributeError, TypeError, ValueError):
            return np.nan

    data = pd.read_csv(file_tsv, sep="\t")
    key_header = data.columns[0]
    key_names = key_header.split(",")

    # Clean only the value columns, column by column. This avoids
    # DataFrame.applymap (deprecated in recent pandas) and does not waste
    # time pushing the key column through the float parser.
    values = data.iloc[:, 1:].apply(lambda column: column.map(clean_cells))

    # Split the packed key column into one column per dimension.
    if data.empty:
        # Header-only file: nothing to split, keep the dimension columns.
        keys = pd.DataFrame(columns=key_names, index=data.index)
    else:
        keys = data[key_header].str.split(",", expand=True)
        keys.columns = key_names

    return pd.concat([keys, values], axis = 1)

In [127]:
def tsv_to_dataframe_long(file_tsv, structure = "normal"):
    '''
    Read a Eurostat TSV file into a long ("melted") pandas DataFrame.

    The first column of a Eurostat TSV packs several comma-separated
    dimension codes into a single cell, and its header lists the dimension
    names the same way; the last name has the form "rows\\columns"
    (e.g. "unit,geo\\time"). The remaining columns hold the values.

    Parameters
    ----------
    file_tsv : str or file-like
        Passed straight to ``pandas.read_csv`` with a tab separator. Plain
        ".tsv" files and compressed ".tsv.gz" files both work.
    structure : str, default "normal"
        "normal"  - the value columns are time periods (header ends in
                    "geo\\time"). The melted column is called "variable".
                    Labels that are whole numbers (years) become ``int``;
                    other period labels such as "2017M01" are kept as
                    stripped strings.
        "inverse" - the value columns are geo / NUTS codes (header ends in
                    "time\\geo"). The melted column is called "geo" and the
                    "time\\geo" dimension becomes "var_time".
        Any other value returns the wide frame (split dimension columns
        followed by the raw value columns) without renaming or melting.

    Returns
    -------
    pandas.DataFrame
        For "normal" and "inverse": one "var_<dimension>" column per
        dimension, the melted label column, then
        "value"               - numeric value as float (NaN if unparseable),
        "value_raw"           - the cell exactly as read from the file,
        "eurostat_annotation" - the text after the first blank of the raw
                                cell (e.g. "e"), or NaN when the raw cell is
                                not a string or contains no blank.

    Raises
    ------
    KeyError
        If ``structure`` is "inverse" and the header has no "time\\geo"
        dimension.
    '''

    def clean_cells(x):
        '''Return the numeric part of a Eurostat cell as float, or NaN.'''
        try:
            return float(x)
        except (TypeError, ValueError):
            pass
        try:
            # "857.51 e" -> "857.51"; the missing marker ": " -> ":" -> NaN
            return float(x.split(" ")[0])
        except (AttributeError, TypeError, ValueError):
            return np.nan

    def annotation(x):
        '''Return the Eurostat flag that follows the value ("b", "u", ...).'''
        try:
            return x.split(" ")[1]
        except (AttributeError, IndexError):
            return np.nan

    def period_label(label):
        '''Strip a time-period column label; whole-number years become int.'''
        text = str(label).strip()
        try:
            return int(text)
        except ValueError:
            return text

    data = pd.read_csv(file_tsv, sep="\t")
    key_header = data.columns[0]
    key_names = key_header.split(",")

    # Work on a copy of the value columns so `data` is left untouched.
    values = data.iloc[:, 1:].copy()

    # Split the packed key column into one column per dimension.
    if data.empty:
        # Header-only file: nothing to split, keep the dimension columns.
        keys = pd.DataFrame(columns=key_names, index=data.index)
    else:
        keys = data[key_header].str.split(",", expand=True)
        keys.columns = key_names

    if structure == "normal":
        # Only the dimension columns get the "var_" prefix. Deciding this by
        # position (not by "is the label an integer?") keeps monthly or
        # quarterly period columns among the values to melt.
        # "geo\\time" -> "var_geo": keep the part in front of the backslash.
        keys.columns = ["var_" + name.split("\\")[0] for name in key_names]
        values.columns = [period_label(label) for label in values.columns]
        melted_name = None  # pandas default name: "variable"
    elif structure == "inverse":
        if "time\\geo" not in key_names:
            raise KeyError("time\\geo")
        keys.columns = [
            "var_time" if name == "time\\geo" else "var_" + name
            for name in key_names
        ]
        # Eurostat pads the geo labels with a trailing blank ("EU28 ").
        values.columns = [str(label).strip() for label in values.columns]
        melted_name = "geo"
    else:
        # Unknown structure: hand back the wide frame unchanged.
        return pd.concat([keys, values], axis = 1)

    result = pd.concat([keys, values], axis = 1)
    result = result.melt(id_vars = list(keys.columns), var_name = melted_name)
    result["value_raw"] = result["value"]
    result["eurostat_annotation"] = result["value_raw"].apply(annotation)
    result["value"] = result["value"].apply(clean_cells)

    return result

In [128]:
# Example: download a gzip-compressed Eurostat TSV file to use as input for the cells below.
#if __name__ == "__main__":
import urllib.request
import gzip  # NOTE(review): not referenced in the visible cells; the .gz path is handed to pandas as-is

# Bulk-download URL; the requested file (data/hlth_dlm030.tsv.gz) is URL-encoded in the query string.
eurostat_link = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Fhlth_dlm030.tsv.gz"
# Saves the response as "file.tsv.gz" in the working directory and returns (filename, headers).
urllib.request.urlretrieve(eurostat_link , "file.tsv.gz")

('file.tsv.gz', <http.client.HTTPMessage at 0x7f5cdc5ce6a0>)

In [129]:
# The downloaded file has geo codes across its columns (its last dimension header is "time\geo"),
# so it is converted with the "inverse" layout.
result = tsv_to_dataframe_long("file.tsv.gz", structure = "inverse")

In [130]:
# Display the long-format frame built in the previous cell.
result

Unnamed: 0,var_unit,var_hlth_pb,var_age,var_sex,var_time,geo,value,value_raw,eurostat_annotation
0,PC,PB1040,TOTAL,F,2011,EU28,12.0,12,
1,PC,PB1040,TOTAL,M,2011,EU28,12.2,12.2,
2,PC,PB1040,TOTAL,T,2011,EU28,12.1,12.1,
3,PC,PB1040,Y15-24,F,2011,EU28,21.9,21.9,
4,PC,PB1040,Y15-24,M,2011,EU28,25.5,25.5,
5,PC,PB1040,Y15-24,T,2011,EU28,23.7,23.7,
6,PC,PB1040,Y25-34,F,2011,EU28,17.0,17,
7,PC,PB1040,Y25-34,M,2011,EU28,13.9,13.9,
8,PC,PB1040,Y25-34,T,2011,EU28,15.4,15.4,
9,PC,PB1040,Y25-64,F,2011,EU28,11.5,11.5,


In [97]:
# NOTE(review): in the cells shown here, columns_type_inverse exists only as a helper nested inside
# tsv_to_dataframe_long, so this call relies on a module-level definition from an earlier kernel state.
# The stored output ('time', 'geo_EU28 ', ...) also differs from what the nested helper builds
# ("var_time", unprefixed labels) — presumably a leftover from development; confirm before re-running.
columns_type_inverse(result.columns)

['var_unit',
 'var_hlth_pb',
 'var_age',
 'var_sex',
 'time',
 'geo_EU28 ',
 'geo_BE ',
 'geo_BG ',
 'geo_CZ ',
 'geo_DK ',
 'geo_DE ',
 'geo_EE ',
 'geo_IE ',
 'geo_EL ',
 'geo_ES ',
 'geo_FR ',
 'geo_HR ',
 'geo_IT ',
 'geo_CY ',
 'geo_LV ',
 'geo_LT ',
 'geo_LU ',
 'geo_HU ',
 'geo_MT ',
 'geo_NL ',
 'geo_AT ',
 'geo_PL ',
 'geo_PT ',
 'geo_RO ',
 'geo_SI ',
 'geo_SK ',
 'geo_FI ',
 'geo_SE ',
 'geo_UK ',
 'geo_IS ',
 'geo_CH ',
 'geo_TR ']

In [72]:
        
            try:
                return int(x.strip())
            except:
                if x == "time\\geo":
                    return "var_" + "geo"
                else:
                    return "var_" + x
    

Unnamed: 0,var_unit,var_hlth_pb,var_age,var_sex,var_time\geo,var_EU28,var_BE,var_BG,var_CZ,var_DK,...,var_FI,var_SE,var_UK,var_IS,var_CH,var_TR,variable,value,value_raw,eurostat_annotation


In [51]:
# Default structure="normal": the value columns are treated as time periods and melted into "variable".
# NOTE(review): the stored output lists dimensions (itm_newa, indic_ag) that are not those of the
# hlth_dlm030 file downloaded above — presumably "file.tsv.gz" held another dataset when this cell ran.
result = tsv_to_dataframe_long("file.tsv.gz")
result

Unnamed: 0,var_itm_newa,var_indic_ag,var_unit,var_geo,variable,value,value_raw,eurostat_annotation
0,01000,PROD_BP,MIO_EUR,AT,2017,857.51,857.51 e,e
1,01000,PROD_BP,MIO_EUR,BE,2017,393.61,393.61 e,e
2,01000,PROD_BP,MIO_EUR,BG,2017,1136.57,1136.57 e,e
3,01000,PROD_BP,MIO_EUR,CH,2017,290.24,290.24 e,e
4,01000,PROD_BP,MIO_EUR,CY,2017,11.99,11.99 e,e
5,01000,PROD_BP,MIO_EUR,CZ,2017,1062.51,1062.51 e,e
6,01000,PROD_BP,MIO_EUR,DE,2017,4530.67,4530.67 e,e
7,01000,PROD_BP,MIO_EUR,DK,2017,1761.17,1761.17 e,e
8,01000,PROD_BP,MIO_EUR,EA11,2017,26275.97,26275.97 e,e
9,01000,PROD_BP,MIO_EUR,EA12,2017,27068.94,27068.94 e,e


In [53]:
#Python library

import tsv_to_pandas

In [56]:
# Same download as in the earlier example cell; it overwrites "file.tsv.gz" in the working directory.
eurostat_link = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Fhlth_dlm030.tsv.gz"
urllib.request.urlretrieve(eurostat_link , "file.tsv.gz")

('file.tsv.gz', <http.client.HTTPMessage at 0x7f5ce9fef2b0>)

In [None]:
# Wide-format conversion through the imported tsv_to_pandas module.
# NOTE(review): the module is not defined in this notebook; presumably it packages the functions above — confirm.
result = tsv_to_pandas.tsv_to_dataframe("file.tsv.gz")

In [59]:
# Inspect the fifth column label of the wide frame; the stored output shows it is the "time\geo" dimension header.
result.columns[4]

'time\\geo'

In [11]:
# Long-format conversion through the imported tsv_to_pandas module, using its default structure argument.
# NOTE(review): the execution count (In [11]) is lower than that of the cells above, so this output
# comes from an earlier run — presumably with a different "file.tsv.gz"; confirm by re-running in order.
result = tsv_to_pandas.tsv_to_dataframe_long("file.tsv.gz")
result

Unnamed: 0,var_itm_newa,var_indic_ag,var_unit,var_geo,variable,value,value_raw,eurostat_annotation
0,01000,PROD_BP,MIO_EUR,AT,2017,857.51,857.51 e,e
1,01000,PROD_BP,MIO_EUR,BE,2017,393.61,393.61 e,e
2,01000,PROD_BP,MIO_EUR,BG,2017,1136.57,1136.57 e,e
3,01000,PROD_BP,MIO_EUR,CH,2017,290.24,290.24 e,e
4,01000,PROD_BP,MIO_EUR,CY,2017,11.99,11.99 e,e
5,01000,PROD_BP,MIO_EUR,CZ,2017,1062.51,1062.51 e,e
6,01000,PROD_BP,MIO_EUR,DE,2017,4530.67,4530.67 e,e
7,01000,PROD_BP,MIO_EUR,DK,2017,1761.17,1761.17 e,e
8,01000,PROD_BP,MIO_EUR,EA11,2017,26275.97,26275.97 e,e
9,01000,PROD_BP,MIO_EUR,EA12,2017,27068.94,27068.94 e,e
