In [1]:
import pandas as pd
import numpy as np
from itertools import compress

In [2]:
def tsv_to_dataframe(file_tsv):
    '''
    Load a Eurostat TSV file into a wide pandas DataFrame.

    The first column of a Eurostat TSV packs several dimensions into one
    comma-separated field (header such as "age,isced11,unit,sex,geo\\time").
    That column is split into one column per dimension; every other column is
    converted to floats.

    file_tsv: path or file-like object, passed straight to pandas.read_csv
        (so plain "tsv" and compressed "tsv.gz" files both work).

    Returns a DataFrame whose leading columns are the dimensions (named after
    the comma-separated parts of the first header) followed by the original
    value columns, in file order, holding floats with NaN for missing cells.

    Raises ValueError if a row carries more dimension fields than the header
    names.
    '''

    def clean_cells(x):
        '''Return the numeric part of a Eurostat cell as a float.

        Missing values (": ") become numpy NaN and trailing annotations
        ("b", "u", ...) after the first space are dropped.
        '''
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            # Keep only the text before the first space: "857.51 e" -> "857.51".
            return float(x.split(" ")[0])
        except (AttributeError, TypeError, ValueError, OverflowError):
            # Not a string, or no leading number (e.g. the ":" missing marker).
            return np.nan

    # Open the Eurostat TSV file.
    data = pd.read_csv(file_tsv, sep="\t")

    # The first column holds the comma-joined dimensions; its header holds
    # their names. Build one column per dimension, aligned on the row index.
    key_header = data.columns[0]
    keys = pd.DataFrame(
        [row.split(",") for row in data[key_header]],
        index=data.index,
        columns=key_header.split(","),
    )

    # Clean the value columns only. A per-column Series.map is used instead of
    # the deprecated DataFrame.applymap, and the dimension column is skipped
    # rather than cleaned and then thrown away.
    values = pd.DataFrame(
        {name: data[name].map(clean_cells) for name in data.columns[1:]},
        index=data.index,
    )

    return pd.concat([keys, values], axis=1)

In [3]:
def tsv_to_dataframe_long(file_tsv):
    '''
    Load a Eurostat TSV file into a long (melted) pandas DataFrame.

    file_tsv: path or file-like object, passed straight to pandas.read_csv
        (so plain "tsv" and compressed "tsv.gz" files both work).

    Returns a DataFrame with one row per (dimensions, time period) and these
    columns, in order:
      - "var_<dimension>": one column per comma-separated dimension of the
        first TSV column; the part after a backslash in the header
        ("geo\\time" -> "var_geo") is dropped;
      - "variable": the time-period label, as an int when it is a plain
        integer (e.g. 2017), otherwise as the stripped string (e.g. "2017Q1");
      - "value": the numeric value as a float, NaN when missing;
      - "value_raw": the cell exactly as read by pandas;
      - "eurostat_annotation": the flag following the value ("b", "e", ...),
        NaN when the cell carries no flag.

    Raises ValueError if a row carries more dimension fields than the header
    names.
    '''

    def clean_cells(x):
        '''Return the numeric part of a Eurostat cell as a float.

        Missing values (": ") become numpy NaN and trailing annotations
        ("b", "u", ...) after the first space are dropped.
        '''
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            # Keep only the text before the first space: "857.51 e" -> "857.51".
            return float(x.split(" ")[0])
        except (AttributeError, TypeError, ValueError, OverflowError):
            return np.nan

    def annotation(x):
        '''Return the Eurostat annotation ("b", "u", ...) of a cell, or NaN.'''
        try:
            flag = x.split(" ")[1]
        except (AttributeError, IndexError):
            # Cell already parsed as a number, or text without any space.
            return np.nan
        # "857.51 " and ": " split into a trailing empty string: that means
        # "no flag", reported as NaN just like numerically parsed cells.
        return flag if flag else np.nan

    def key_label(name):
        '''Prefix a dimension name with "var_", dropping any "\\<axis>" suffix.'''
        return "var_" + name.split("\\")[0]

    def period_label(name):
        '''Turn a value-column header into an int year when possible.'''
        stripped = name.strip()
        try:
            return int(stripped)
        except ValueError:
            # Non-annual periods such as "2017Q1" or "2017M01" stay as text.
            return stripped

    # Open the Eurostat TSV file.
    data = pd.read_csv(file_tsv, sep="\t")

    # The first column holds the comma-joined dimensions; its header holds
    # their names. The id variables are taken from this structure rather than
    # guessed from which headers fail to parse as integers.
    key_header = data.columns[0]
    key_labels = [key_label(name) for name in key_header.split(",")]
    keys = pd.DataFrame(
        [row.split(",") for row in data[key_header]],
        index=data.index,
        columns=key_labels,
    )

    # Every remaining column is a time period; keep the raw cells for now so
    # the annotation can still be extracted after melting.
    values = data.iloc[:, 1:].copy()
    values.columns = [period_label(name) for name in values.columns]

    result = pd.concat([keys, values], axis=1).melt(id_vars=key_labels)
    result["value_raw"] = result["value"]
    result["eurostat_annotation"] = result["value_raw"].apply(annotation)
    result["value"] = result["value"].apply(clean_cells)

    return result

In [4]:
# Example: download a gzip-compressed Eurostat table into the working directory.
#if __name__ == "__main__":
import urllib.request
import gzip

# Bulk-download address of the table; the query string asks for data/aact_eaa07.tsv.gz.
eurostat_link = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Faact_eaa07.tsv.gz"
# Saves the response as "file.tsv.gz"; the (filename, headers) return value is the cell output.
urllib.request.urlretrieve(eurostat_link , "file.tsv.gz")

('file.tsv.gz', <http.client.HTTPMessage at 0x7f8b769cd860>)

In [7]:
# Wide layout: the dimension columns followed by one numeric column per time period.
result = tsv_to_dataframe("file.tsv.gz")
result

Unnamed: 0,itm_newa,indic_ag,unit,geo\time,2017,2016,2015,2014,2013,2012,...,1982,1981,1980,1979,1978,1977,1976,1975,1974,1973
0,01000,PROD_BP,MIO_EUR,AT,857.51,990.64,835.39,978.07,758.43,817.98,...,,,,,,,,,,
1,01000,PROD_BP,MIO_EUR,BE,393.61,318.00,461.42,442.83,437.07,421.39,...,397.97,360.66,367.29,,,,,,,
2,01000,PROD_BP,MIO_EUR,BG,1136.57,1097.11,1059.30,1181.26,1066.85,847.92,...,,,,,,,,,,
3,01000,PROD_BP,MIO_EUR,CH,290.24,215.43,274.48,284.33,247.22,270.50,...,,,,,,,,,,
4,01000,PROD_BP,MIO_EUR,CY,11.99,2.60,20.60,1.77,12.28,10.14,...,,,,,,,,,,
5,01000,PROD_BP,MIO_EUR,CZ,1062.51,1258.68,1182.67,1266.09,1077.30,939.43,...,,,,,,,,,,
6,01000,PROD_BP,MIO_EUR,DE,4530.67,4393.34,5106.54,5512.77,5401.42,4944.19,...,,,,,,,,,,
7,01000,PROD_BP,MIO_EUR,DK,1761.17,1733.16,1983.50,1797.04,1512.44,1744.26,...,1363.45,1239.86,1201.47,1304.50,1271.93,1341.61,978.43,974.20,1248.77,1188.03
8,01000,PROD_BP,MIO_EUR,EA11,26275.97,25325.43,28581.34,29312.92,28605.70,26752.13,...,,,,,,,,,,
9,01000,PROD_BP,MIO_EUR,EA12,27068.94,26212.42,29417.30,30169.58,29535.22,27663.35,...,,,,,,,,,,


In [5]:
# Long layout: one row per dimension combination and time period, with the
# numeric value, the raw cell text and the Eurostat annotation side by side.
result = tsv_to_dataframe_long("file.tsv.gz")
result

Unnamed: 0,var_itm_newa,var_indic_ag,var_unit,var_geo,variable,value,value_raw,eurostat_annotation
0,01000,PROD_BP,MIO_EUR,AT,2017,857.51,857.51 e,e
1,01000,PROD_BP,MIO_EUR,BE,2017,393.61,393.61 e,e
2,01000,PROD_BP,MIO_EUR,BG,2017,1136.57,1136.57 e,e
3,01000,PROD_BP,MIO_EUR,CH,2017,290.24,290.24 e,e
4,01000,PROD_BP,MIO_EUR,CY,2017,11.99,11.99 e,e
5,01000,PROD_BP,MIO_EUR,CZ,2017,1062.51,1062.51 e,e
6,01000,PROD_BP,MIO_EUR,DE,2017,4530.67,4530.67 e,e
7,01000,PROD_BP,MIO_EUR,DK,2017,1761.17,1761.17 e,e
8,01000,PROD_BP,MIO_EUR,EA11,2017,26275.97,26275.97 e,e
9,01000,PROD_BP,MIO_EUR,EA12,2017,27068.94,27068.94 e,e


In [8]:
# Python library: the cells below call tsv_to_dataframe and tsv_to_dataframe_long
# through the tsv_to_pandas module instead of the notebook-level definitions.
# NOTE(review): the module source is not visible here — presumably it ships the same two helpers; confirm.

import tsv_to_pandas

In [9]:
# Download the same table again to "file.tsv.gz" (overwrites any existing copy).
eurostat_link = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=data%2Faact_eaa07.tsv.gz"
urllib.request.urlretrieve(eurostat_link , "file.tsv.gz")

('file.tsv.gz', <http.client.HTTPMessage at 0x7f8b6fad29e8>)

In [10]:
# Wide layout via the imported module: dimension columns plus one column per time period.
result = tsv_to_pandas.tsv_to_dataframe("file.tsv.gz")
result

Unnamed: 0,itm_newa,indic_ag,unit,geo\time,2017,2016,2015,2014,2013,2012,...,1982,1981,1980,1979,1978,1977,1976,1975,1974,1973
0,01000,PROD_BP,MIO_EUR,AT,857.51,990.64,835.39,978.07,758.43,817.98,...,,,,,,,,,,
1,01000,PROD_BP,MIO_EUR,BE,393.61,318.00,461.42,442.83,437.07,421.39,...,397.97,360.66,367.29,,,,,,,
2,01000,PROD_BP,MIO_EUR,BG,1136.57,1097.11,1059.30,1181.26,1066.85,847.92,...,,,,,,,,,,
3,01000,PROD_BP,MIO_EUR,CH,290.24,215.43,274.48,284.33,247.22,270.50,...,,,,,,,,,,
4,01000,PROD_BP,MIO_EUR,CY,11.99,2.60,20.60,1.77,12.28,10.14,...,,,,,,,,,,
5,01000,PROD_BP,MIO_EUR,CZ,1062.51,1258.68,1182.67,1266.09,1077.30,939.43,...,,,,,,,,,,
6,01000,PROD_BP,MIO_EUR,DE,4530.67,4393.34,5106.54,5512.77,5401.42,4944.19,...,,,,,,,,,,
7,01000,PROD_BP,MIO_EUR,DK,1761.17,1733.16,1983.50,1797.04,1512.44,1744.26,...,1363.45,1239.86,1201.47,1304.50,1271.93,1341.61,978.43,974.20,1248.77,1188.03
8,01000,PROD_BP,MIO_EUR,EA11,26275.97,25325.43,28581.34,29312.92,28605.70,26752.13,...,,,,,,,,,,
9,01000,PROD_BP,MIO_EUR,EA12,27068.94,26212.42,29417.30,30169.58,29535.22,27663.35,...,,,,,,,,,,


In [11]:
# Long layout via the imported module: one row per dimension combination and time period.
result = tsv_to_pandas.tsv_to_dataframe_long("file.tsv.gz")
result

Unnamed: 0,var_itm_newa,var_indic_ag,var_unit,var_geo,variable,value,value_raw,eurostat_annotation
0,01000,PROD_BP,MIO_EUR,AT,2017,857.51,857.51 e,e
1,01000,PROD_BP,MIO_EUR,BE,2017,393.61,393.61 e,e
2,01000,PROD_BP,MIO_EUR,BG,2017,1136.57,1136.57 e,e
3,01000,PROD_BP,MIO_EUR,CH,2017,290.24,290.24 e,e
4,01000,PROD_BP,MIO_EUR,CY,2017,11.99,11.99 e,e
5,01000,PROD_BP,MIO_EUR,CZ,2017,1062.51,1062.51 e,e
6,01000,PROD_BP,MIO_EUR,DE,2017,4530.67,4530.67 e,e
7,01000,PROD_BP,MIO_EUR,DK,2017,1761.17,1761.17 e,e
8,01000,PROD_BP,MIO_EUR,EA11,2017,26275.97,26275.97 e,e
9,01000,PROD_BP,MIO_EUR,EA12,2017,27068.94,27068.94 e,e
