In [1]:
# cargo librerias
import pandas as pd
import re as re
import numpy as np
from collections import Counter

### Correspondencia de productos

World Integrated Trade Solution (WITS) ofrece información sobre diversas nomenclaturas de productos, lo que contribuye con el mapeo entre varias nomenclaturas de productos. En este caso se descargó la tabla de correlación entre el SA 1996 o H1 y la ISIC Rev 3, ya que los datos de exportaciones se encuentran en dicha nomenclatura del SA y los de clasificación de bienes pertenecientes a la Bioeconomía en la mencionada clasificación ISIC. https://wits.worldbank.org/product_concordance.html

#### Lista de productos pertenecientes a la bioeconomía en CIIU Rev3

In [2]:
# Bienes pertenecientes a la bioeconomia argentina
CIIU_bioecon = pd.read_excel('../data/raw/Nomenclaturas/clasif_CIIUrev3_bioecon.xlsx', index_col=None)  
CIIU_bioecon = CIIU_bioecon.loc[ 3:79 , : ]

In [3]:
name_columns = CIIU_bioecon.loc[ 3 , : ].values
name_columns[0]

'CIIU Rev3'

In [4]:
CIIU_bioecon = CIIU_bioecon.loc[ 4:79 , : ].rename(columns = {'Actividades consideradas para estimar la bioeconomía': name_columns[0], 'Unnamed: 1': name_columns[1], 'Unnamed: 2': name_columns[2]})

In [5]:
CIIU_bioecon

Unnamed: 0,CIIU Rev3,Concepto,Participación %
4,Letra A,"Agricultura, ganadería, caza y silvicultura",100
5,Letra B,Pesca,100
6,Letra D,Industria Manufacturera Bio,
7,15111,"Matanza de ganado, producción, procesamiento y...",100
8,15112,"Producción, procesamiento y conservación de ca...",100
...,...,...,...
75,24241,Fabricación de jabones y preparados de limpieza,50
76,24249,"Fabricación de cosméticos, perfumes y otros pr...",25
77,24290,Fabricación de productos químicos n.c.p. (BIOD...,100
78,36101,"Fabricación de muebles y partes de muebles, pr...",100


De las letras A y B se consideran todos los productos como productos de la Bioeconomía, mientras que de la letra D sólo los que se mencionan ahí en los porcentajes que allí indican. En este caso, si figura en el listado se considerará parte de la bioeconomía aunque no corresponda en su totalidad dado que no hay forma de separarlos. 

Voy a tener que extraer del listado completo de nomenclatura CIIU Rev3 los códigos numéricos de productos correspondientes a las letras A y B que no figuran en el listado previo. 

In [6]:
# dejo afuera las filas correspondientes a letras (A, B y D) y me quedo sólo con los códigos numéricos
CIIU_bioecon = CIIU_bioecon.loc[ 7:79 , : ]

In [7]:
CIIU_bioecon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 7 to 79
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CIIU Rev3        73 non-null     object
 1   Concepto         73 non-null     object
 2   Participación %  73 non-null     object
dtypes: object(3)
memory usage: 1.8+ KB


#### Lista total de productos en nomenclatura CIIU Rev3

In [8]:
# descargo datos completos de nomenclatura CIIU_Rev3 para completar las referencias a letras
CIIU_Rev3 = pd.read_excel('../data/raw/Nomenclaturas/CIIU_Rev3_es.xlsx')  
CIIU_Rev3

Unnamed: 0,Code,Description,Categoria
0,A,"Agricultura, ganadería, caza y silvicultura",
1,1,"Agricultura, ganadería, caza y actividades de ...",A
2,11,Cultivos en general; cultivo de productos de m...,A
3,111,Cultivo de cereales y otros cultivos n.c.p.,A
4,112,"Cultivo de hortalizas y legumbres, especialida...",A
...,...,...,...
523,9500,Hogares privados con servicio doméstico,P
524,Q,Organizaciones y órganos extraterritoriales,
525,99,Organizaciones y órganos extraterritoriales,Q
526,990,Organizaciones y órganos extraterritoriales,Q


In [9]:
# se corrobora que la clasificación CIIU Rev3 tiene como máximo hasta 4 dígitos
CIIU_Rev3.Code.astype(str).str.len().max()

4

Por lo que se pudo observar en la clasificación CIIU Rev3 los códigos tienen solo 4 dígitos, por lo que se descubrió que la clasificación que en el trabajo de bioeconomía figura como CIIU Rev3 en realidad corresponde a la [Clasificación Nacional de Actividades Económicas (ClaNAE-97)](https://www.argentina.gob.ar/sites/default/files/clasificacion_nacional_de_actividades_economicas-clanae-97.pdf) de Argentina. Según se pudo verificar en el link de dicha clasificación que compara con la CIIU Rev3, la diferencia está en el 5to número, mientras los primeros 4 coinciden con la clasificación CIIU Rev3. Por lo que en este caso se procederá a seleccionar los primeros 4 dígitos de dicha columna para llegar a obtener la clasificación CIIU Rev3 efectiva. 

In [10]:
# selecciono los datos correspondientes a letras A y B de esta base
CIIU_Rev3[CIIU_Rev3.Categoria.isin(['A','B'])]

Unnamed: 0,Code,Description,Categoria
1,1,"Agricultura, ganadería, caza y actividades de ...",A
2,11,Cultivos en general; cultivo de productos de m...,A
3,111,Cultivo de cereales y otros cultivos n.c.p.,A
4,112,"Cultivo de hortalizas y legumbres, especialida...",A
5,113,"Cultivo de frutas, nueces, plantas cuyas hojas...",A
6,12,Cría de animales,A
7,121,"Cría de ganado vacuno y de ovejas, cabras, cab...",A
8,122,Cría de otros animales; elaboración de product...,A
9,13,Cultivo de productos agrícolas en combinación ...,A
10,130,Cultivo de productos agrícolas en combinación ...,A


In [11]:
# selecciono los códigos correspondientes a letras A y B
codigos_AB = CIIU_Rev3[CIIU_Rev3.Categoria.isin(['A','B'])].Code
codigos_AB = codigos_AB.astype(str).unique()
codigos_AB

array(['1', '11', '111', '112', '113', '12', '121', '122', '13', '130',
       '14', '140', '15', '150', '2', '20', '200', '5', '50', '500'],
      dtype=object)

Para complementar los códigos de las letras A y B, selecciono las posiciones correspondientes a la letra D de la tabla CIIU_bioecon 

In [12]:
CIIU_bioecon['CIIU Rev3'] = CIIU_bioecon['CIIU Rev3'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CIIU_bioecon['CIIU Rev3'] = CIIU_bioecon['CIIU Rev3'].astype(str)


In [13]:
CIIU_bioecon['CIIU_4dig'] = CIIU_bioecon['CIIU Rev3'].str[:4]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CIIU_bioecon['CIIU_4dig'] = CIIU_bioecon['CIIU Rev3'].str[:4]


In [14]:
CIIU_bioecon

Unnamed: 0,CIIU Rev3,Concepto,Participación %,CIIU_4dig
7,15111,"Matanza de ganado, producción, procesamiento y...",100,1511
8,15112,"Producción, procesamiento y conservación de ca...",100,1511
9,15113,Elaboración de fiambres y embutidos,100,1511
10,15120,Elaboración y conservación de pescado y produc...,100,1512
11,15130,"Elaboración y conservación de frutas, legumbre...",100,1513
...,...,...,...,...
75,24241,Fabricación de jabones y preparados de limpieza,50,2424
76,24249,"Fabricación de cosméticos, perfumes y otros pr...",25,2424
77,24290,Fabricación de productos químicos n.c.p. (BIOD...,100,2429
78,36101,"Fabricación de muebles y partes de muebles, pr...",100,3610


In [15]:
# códigos de CIIU que corresponden a la letra D
codigos_D = CIIU_bioecon.CIIU_4dig.unique() 
codigos_D

array(['1511', '1512', '1513', '1514', '1520', '1531', '1532', '1533',
       '1541', '1542', '1543', '1544', '1549', '1551', '1552', '1553',
       '1554', '1600', '1711', '1712', '1721', '1722', '1723', '1729',
       '1730', '1810', '1820', '1911', '1912', '1920', '2010', '2021',
       '2022', '2023', '2029', '2101', '2102', '2109', '2320', '2411',
       '2421', '2423', '2424', '2429', '3610', '3720'], dtype=object)

In [16]:
# uno el listado de códigos correspondientes a la bioeconomia en CIIU Rev3
list_CIIU_bioecon = list(codigos_AB) + list(codigos_D) 
list_CIIU_bioecon = [int(i) for i in list_CIIU_bioecon] # los paso a número
list_CIIU_bioecon

[1,
 11,
 111,
 112,
 113,
 12,
 121,
 122,
 13,
 130,
 14,
 140,
 15,
 150,
 2,
 20,
 200,
 5,
 50,
 500,
 1511,
 1512,
 1513,
 1514,
 1520,
 1531,
 1532,
 1533,
 1541,
 1542,
 1543,
 1544,
 1549,
 1551,
 1552,
 1553,
 1554,
 1600,
 1711,
 1712,
 1721,
 1722,
 1723,
 1729,
 1730,
 1810,
 1820,
 1911,
 1912,
 1920,
 2010,
 2021,
 2022,
 2023,
 2029,
 2101,
 2102,
 2109,
 2320,
 2411,
 2421,
 2423,
 2424,
 2429,
 3610,
 3720]

#### Tabla de correspondencias CIIU Rev3 y HS 1996

In [17]:
correl_nomen = pd.read_csv('../data/raw/Nomenclaturas/JobID-19_Concordance_H1_to_I3.csv') 
correl_nomen

Unnamed: 0,HS 1996 Product Code,HS 1996 Product Description,ISIC Revision 3 Product Code,ISIC Revision 3 Product Description
0,10111,Horses :-- Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses..."
1,10119,Horses :-- Other,121,"Farming of cattle, sheep, goats, horses, asses..."
2,10120,"Asses, mules and hinnies",121,"Farming of cattle, sheep, goats, horses, asses..."
3,10210,Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses..."
4,10290,Other,121,"Farming of cattle, sheep, goats, horses, asses..."
...,...,...,...,...
5108,970200,"Original engravings, prints and lithographs.",9214,"Dramatic arts, music and other arts activities"
5109,970300,"Original sculptures and statuary, in any mater...",9214,"Dramatic arts, music and other arts activities"
5110,970400,"Postage or revenue stamps, stamp-postmarks, fi...",9214,"Dramatic arts, music and other arts activities"
5111,970500,Collections and collectors' pieces of zoologic...,9214,"Dramatic arts, music and other arts activities"


In [18]:
# verifico con un ejemplo para el codigo rev3 1511 que inlcuye 77 productos a 6 dígitos
correl_nomen[correl_nomen['ISIC Revision 3 Product Code']==1511]

Unnamed: 0,HS 1996 Product Code,HS 1996 Product Description,ISIC Revision 3 Product Code,ISIC Revision 3 Product Description
17,20110,Carcasses and half-carcasses,1511,"Production, processing and preserving of meat ..."
18,20120,Other cuts with bone in,1511,"Production, processing and preserving of meat ..."
19,20130,Boneless,1511,"Production, processing and preserving of meat ..."
20,20210,Carcasses and half-carcasses,1511,"Production, processing and preserving of meat ..."
21,20220,Other cuts with bone in,1511,"Production, processing and preserving of meat ..."
...,...,...,...,...
1841,410210,With wool on,1511,"Production, processing and preserving of meat ..."
1842,410221,Without wool on :-- Pickled,1511,"Production, processing and preserving of meat ..."
1843,410229,Without wool on :-- Other,1511,"Production, processing and preserving of meat ..."
1844,410310,Of goats or kids,1511,"Production, processing and preserving of meat ..."


In [19]:
correl_nomen = correl_nomen.rename(columns={'HS 1996 Product Code': 'HS_code', 'HS 1996 Product Description': 'HS_description', 'ISIC Revision 3 Product Code':'CIIU_Rev3_code', 'ISIC Revision 3 Product Description':'CIIU_Rev3_desc'})
correl_nomen

Unnamed: 0,HS_code,HS_description,CIIU_Rev3_code,CIIU_Rev3_desc
0,10111,Horses :-- Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses..."
1,10119,Horses :-- Other,121,"Farming of cattle, sheep, goats, horses, asses..."
2,10120,"Asses, mules and hinnies",121,"Farming of cattle, sheep, goats, horses, asses..."
3,10210,Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses..."
4,10290,Other,121,"Farming of cattle, sheep, goats, horses, asses..."
...,...,...,...,...
5108,970200,"Original engravings, prints and lithographs.",9214,"Dramatic arts, music and other arts activities"
5109,970300,"Original sculptures and statuary, in any mater...",9214,"Dramatic arts, music and other arts activities"
5110,970400,"Postage or revenue stamps, stamp-postmarks, fi...",9214,"Dramatic arts, music and other arts activities"
5111,970500,Collections and collectors' pieces of zoologic...,9214,"Dramatic arts, music and other arts activities"


Los datos a nivel de producto se encuentran a 6 dígitos del SA, por lo que voy a tener que convertir a nivel de partida arancelaria (4 dígitos) para que coincida con los datos de comercio con que se está trabajando. 

In [20]:
# genero una nueva columna que considere nivel de partida arancelaria (4 dígitos) en vez de 6 digtios del SA (HS)
correl_nomen['HS_4dig'] = np.where(correl_nomen.HS_code.astype(str).str.len() == 5, correl_nomen.HS_code.astype(str).str[:3], correl_nomen.HS_code.astype(str).str[:4]) 
correl_nomen

Unnamed: 0,HS_code,HS_description,CIIU_Rev3_code,CIIU_Rev3_desc,HS_4dig
0,10111,Horses :-- Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses...",101
1,10119,Horses :-- Other,121,"Farming of cattle, sheep, goats, horses, asses...",101
2,10120,"Asses, mules and hinnies",121,"Farming of cattle, sheep, goats, horses, asses...",101
3,10210,Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses...",102
4,10290,Other,121,"Farming of cattle, sheep, goats, horses, asses...",102
...,...,...,...,...,...
5108,970200,"Original engravings, prints and lithographs.",9214,"Dramatic arts, music and other arts activities",9702
5109,970300,"Original sculptures and statuary, in any mater...",9214,"Dramatic arts, music and other arts activities",9703
5110,970400,"Postage or revenue stamps, stamp-postmarks, fi...",9214,"Dramatic arts, music and other arts activities",9704
5111,970500,Collections and collectors' pieces of zoologic...,9214,"Dramatic arts, music and other arts activities",9705


Si bien se considerarán los datos de CIIU Rev3 como parte de la bioeconomía, para todo lo que es productos de la agricultura y pesca (letras A y B) incluidos en el listado de bioeconomia, se tomarán en cuenta las **partidas incluidas en el Acuerdo de Agricultura de la OMC** que corresponden al SA. Estas se encuentran en el Anexo 1: PRODUCTOS COMPRENDIDOS en el Acuerdo de AGricultura, a saber: 

ANEXO 1

PRODUCTOS COMPRENDIDOS


1.	El presente Acuerdo abarcará los siguientes productos:

	i)	Capítulos 1 a 24 del SA menos el pescado y los productos de pescado, más*:

	ii)	Código del SA		 2905.43 		(manitol)
		Código del SA		2905.44 		(sorbitol)
		Partida del SA		33.01		(aceites esenciales)
		Partidas del SA		35.01 a 35.05	(materias albuminoideas, productos a base de almidón o de fécula modificados, colas)
        Código del SA		3809.10 		(aprestos y productos de acabado)
        Código del SA		3823.60 		(sorbitol n.e.p.)
        Partidas del SA	41.01 a 41.03		(cueros y pieles)
        Partida del SA		43.01		(peletería en bruto)
        Partidas del SA	50.01 a 50.03		(seda cruda y desperdicios de seda) 
        Partidas del SA	51.01 a 51.03		(lana y pelo)
        Partidas del SA	52.01 a 52.03		(algodón en rama, desperdicios de algodón y algodón cardado o peinado)
        Partida del SA		53.01		(lino en bruto)
        Partida del SA		53.02		(cáñamo en bruto)

In [21]:
# capítulos 1 al 24 del sistema armonizado (incluida pesca)
print('Existen',len(correl_nomen[correl_nomen.HS_4dig.astype(int)<2500].HS_4dig.unique()), 'bienes de la base que corresponden a prod del cap 1 al 24')
list_1al24 = list(correl_nomen[correl_nomen.HS_4dig.astype(int)<2500].HS_4dig.unique().astype(int))
list_1al24

Existen 200 bienes de la base que corresponden a prod del cap 1 al 24


[101,
 102,
 103,
 104,
 105,
 106,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 401,
 402,
 403,
 404,
 405,
 406,
 407,
 408,
 409,
 410,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 509,
 510,
 511,
 601,
 602,
 603,
 604,
 701,
 702,
 703,
 704,
 705,
 706,
 707,
 708,
 709,
 710,
 711,
 712,
 713,
 714,
 801,
 802,
 803,
 804,
 805,
 806,
 807,
 808,
 809,
 810,
 811,
 812,
 813,
 814,
 901,
 902,
 903,
 904,
 905,
 906,
 907,
 908,
 909,
 910,
 1001,
 1002,
 1003,
 1004,
 1005,
 1006,
 1007,
 1008,
 1101,
 1102,
 1103,
 1104,
 1105,
 1106,
 1107,
 1108,
 1109,
 1201,
 1202,
 1203,
 1204,
 1205,
 1206,
 1207,
 1208,
 1209,
 1210,
 1211,
 1212,
 1213,
 1214,
 1301,
 1302,
 1401,
 1402,
 1403,
 1404,
 1501,
 1502,
 1503,
 1504,
 1505,
 1506,
 1507,
 1508,
 1509,
 1510,
 1511,
 1512,
 1513,
 1514,
 1515,
 1516,
 1517,
 1518,
 1520,
 1521,
 1522,
 1601,
 1602,
 1603,
 1604,
 1605,
 1701,
 1702,
 1703,
 1704,
 1801,
 1802,
 

In [22]:
list_restoagri = [2905, 3301, 3501, 3502, 3503, 3504, 3505, 3809, 3823, 4101, 4102, 4103, 4301, 5001, 5003, 5101, 5102, 5103, 5201, 5202, 5203, 5301, 5302]
list_restoagri

[2905,
 3301,
 3501,
 3502,
 3503,
 3504,
 3505,
 3809,
 3823,
 4101,
 4102,
 4103,
 4301,
 5001,
 5003,
 5101,
 5102,
 5103,
 5201,
 5202,
 5203,
 5301,
 5302]

In [23]:
list_agro = list_1al24 + list_restoagri
list_agro

[101,
 102,
 103,
 104,
 105,
 106,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 401,
 402,
 403,
 404,
 405,
 406,
 407,
 408,
 409,
 410,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 509,
 510,
 511,
 601,
 602,
 603,
 604,
 701,
 702,
 703,
 704,
 705,
 706,
 707,
 708,
 709,
 710,
 711,
 712,
 713,
 714,
 801,
 802,
 803,
 804,
 805,
 806,
 807,
 808,
 809,
 810,
 811,
 812,
 813,
 814,
 901,
 902,
 903,
 904,
 905,
 906,
 907,
 908,
 909,
 910,
 1001,
 1002,
 1003,
 1004,
 1005,
 1006,
 1007,
 1008,
 1101,
 1102,
 1103,
 1104,
 1105,
 1106,
 1107,
 1108,
 1109,
 1201,
 1202,
 1203,
 1204,
 1205,
 1206,
 1207,
 1208,
 1209,
 1210,
 1211,
 1212,
 1213,
 1214,
 1301,
 1302,
 1401,
 1402,
 1403,
 1404,
 1501,
 1502,
 1503,
 1504,
 1505,
 1506,
 1507,
 1508,
 1509,
 1510,
 1511,
 1512,
 1513,
 1514,
 1515,
 1516,
 1517,
 1518,
 1520,
 1521,
 1522,
 1601,
 1602,
 1603,
 1604,
 1605,
 1701,
 1702,
 1703,
 1704,
 1801,
 1802,
 

#### Selecciono productos pertenecientes a la bioeconomía de la tabla de correspondencias

In [24]:
# selecciono sólo los bienes que pertenecen al listado de la bioeconomía
df_bioecon = correl_nomen[(correl_nomen.CIIU_Rev3_code.isin(list_CIIU_bioecon))]
df_bioecon

Unnamed: 0,HS_code,HS_description,CIIU_Rev3_code,CIIU_Rev3_desc,HS_4dig
0,10111,Horses :-- Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses...",101
1,10119,Horses :-- Other,121,"Farming of cattle, sheep, goats, horses, asses...",101
2,10120,"Asses, mules and hinnies",121,"Farming of cattle, sheep, goats, horses, asses...",101
3,10210,Pure-bred breeding animals,121,"Farming of cattle, sheep, goats, horses, asses...",102
4,10290,Other,121,"Farming of cattle, sheep, goats, horses, asses...",102
...,...,...,...,...,...
4999,940421,"Mattresses :-- Of cellular rubber or plastics,...",3610,Manufacture of furniture,9404
5000,940429,Mattresses :-- Of other materials,3610,Manufacture of furniture,9404
5001,940430,Sleeping bags,1721,"Manufacture of made-up textile articles, excep...",9404
5002,940490,Other,1721,"Manufacture of made-up textile articles, excep...",9404


Una vez filtrados por código CIIU, selecciono los códigos únicos a 4 dígitos del SA (clasificación de datos de comercio) para poder clasificar los datos de comercio en bienes de la bioeconomía o no. 

In [25]:
# armo lista de códigos HS
list_bioecon_HS4dig = df_bioecon.HS_4dig.unique()
list_bioecon_HS4dig = list_bioecon_HS4dig.astype(int)
list_bioecon_HS4dig

array([ 101,  102,  103,  104,  105,  106,  201,  202,  203,  204,  205,
        206,  207,  208,  209,  210,  301,  302,  303,  304,  305,  306,
        307,  401,  402,  403,  404,  405,  406,  407,  408,  409,  410,
        503,  508,  509,  511,  601,  602,  603,  604,  701,  702,  703,
        704,  705,  706,  707,  708,  709,  710,  711,  712,  713,  714,
        801,  802,  803,  804,  805,  806,  807,  808,  809,  810,  811,
        812,  813,  814,  901,  902,  903,  904,  905,  906,  907,  908,
        909,  910, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1101,
       1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1201, 1202, 1203,
       1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214,
       1301, 1302, 1401, 1402, 1403, 1404, 1501, 1502, 1503, 1504, 1505,
       1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516,
       1517, 1518, 1520, 1521, 1522, 1601, 1602, 1603, 1604, 1605, 1701,
       1702, 1703, 1704, 1801, 1803, 1804, 1805, 18

In [26]:
list_bio = list(list_bioecon_HS4dig) + list(list_agro) # uno listas para que no quede ninguna posición afuera
list_bio = list(set(list_bio)) # selecciono valores unicos
list_bio

[4101,
 4102,
 4103,
 4104,
 4105,
 4106,
 4107,
 4108,
 4109,
 4111,
 5301,
 2101,
 2102,
 2103,
 2104,
 2105,
 2106,
 6201,
 6202,
 6203,
 6204,
 6205,
 6206,
 6207,
 6208,
 6209,
 6210,
 6211,
 6212,
 6213,
 6214,
 6215,
 6216,
 6217,
 101,
 102,
 103,
 104,
 105,
 106,
 4201,
 4202,
 4203,
 4204,
 4205,
 2201,
 2202,
 2203,
 2204,
 2205,
 2206,
 2207,
 2208,
 2209,
 6301,
 6302,
 6303,
 6304,
 6305,
 6306,
 6307,
 6308,
 9401,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 4301,
 4302,
 4303,
 4304,
 9404,
 2301,
 2302,
 2303,
 2304,
 2305,
 2306,
 6401,
 6402,
 2309,
 6403,
 6404,
 6405,
 6406,
 2307,
 2308,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 4401,
 4402,
 4403,
 4404,
 4405,
 4406,
 4407,
 4408,
 4409,
 4410,
 4411,
 4412,
 4413,
 4414,
 4415,
 4416,
 4417,
 4418,
 4419,
 4420,
 4421,
 8523,
 2401,
 2402,
 2403,
 6501,
 6502,
 6503,
 6504,
 6505,
 6506,
 6507,
 401,
 402,
 403,
 404,
 405,
 406,
 407,
 408,
 409,
 410,
 4501,
 4502,
 4503,
 4504,
 7019,
 501

In [27]:
# selecciono sólo los bienes que pertenecen al listado de la bioeconomía
df_bio = correl_nomen[(correl_nomen.CIIU_Rev3_code.isin(list_bio))]
df_bio

Unnamed: 0,HS_code,HS_description,CIIU_Rev3_code,CIIU_Rev3_desc,HS_4dig
17,20110,Carcasses and half-carcasses,1511,"Production, processing and preserving of meat ...",201
18,20120,Other cuts with bone in,1511,"Production, processing and preserving of meat ...",201
19,20130,Boneless,1511,"Production, processing and preserving of meat ...",201
20,20210,Carcasses and half-carcasses,1511,"Production, processing and preserving of meat ...",202
21,20220,Other cuts with bone in,1511,"Production, processing and preserving of meat ...",202
...,...,...,...,...,...
4971,930621,Shotgun cartridges and parts thereof; air gun ...,2927,Manufacture of weapons and ammunition,9306
4972,930629,Shotgun cartridges and parts thereof; air gun ...,2927,Manufacture of weapons and ammunition,9306
4973,930630,Other cartridges and parts thereof,2927,Manufacture of weapons and ammunition,9306
4974,930690,Other,2927,Manufacture of weapons and ammunition,9306


In [28]:
# for i in range(0, len(list_bio)):
#     list_bio[i] = int(list_bio[i])

In [29]:
print('Número de bienes en código HS_4dig que corresponden a la bioeconomía: ', len(list_bio)) # 593 bienes corresponden a la bioeconomía 

Número de bienes en código HS_4dig que corresponden a la bioeconomía:  593


In [30]:
df_list_bio = pd.DataFrame(list_bio, columns=["lista_prodbio"])
df_list_bio # 593 bienes de la bioeconomia

Unnamed: 0,lista_prodbio
0,4101
1,4102
2,4103
3,4104
4,4105
...,...
588,6114
589,6115
590,6116
591,6117


In [31]:
# guardo la lista para levantarla desde otros notebooks
df_list_bio.to_csv('../data/processed/list_bioecon_HS4dig.csv', index=False)

#### Lista total de productos en nomenclatura SITC Rev2

In [86]:
code_SITC = pd.read_excel('../data/raw/Nomenclaturas/SITC Rev2.xls') 
code_SITC = code_SITC.rename(columns={'Commodity Code': 'SITC_code', 'Commodity description': 'SITC_description'})
code_SITC

Unnamed: 0,SITC_code,SITC_description
0,0,Food and live animals chiefly for food
1,00,Live animals chiefly for food
2,001,Live animals chiefly for food
3,0011,Animals of the bovine species (including buffa...
4,00111,-- pure bred for breeding
...,...,...
2571,9710,"Gold, non-monetary (excluding gold ores and co..."
2572,97101,"Gold, non-monetary, unwrought or semi-manufact..."
2573,97102,"Rolled gold on base metal or silver, unworked ..."
2574,97103,"Gold, silver and jewels sweepings, residues, l..."


In [87]:
code_SITC.SITC_code.astype(str).str.len()

0       1
1       2
2       3
3       4
4       5
       ..
2571    4
2572    5
2573    5
2574    5
2575    5
Name: SITC_code, Length: 2576, dtype: int64

In [88]:
# agrego una columna con la categoría del producto
code_SITC['Category'] = np.where(code_SITC.SITC_code.astype(str).str.len()>1, code_SITC.SITC_code.astype(str).str[:1],'')
code_SITC

Unnamed: 0,SITC_code,SITC_description,Category
0,0,Food and live animals chiefly for food,
1,00,Live animals chiefly for food,0
2,001,Live animals chiefly for food,0
3,0011,Animals of the bovine species (including buffa...,0
4,00111,-- pure bred for breeding,0
...,...,...,...
2571,9710,"Gold, non-monetary (excluding gold ores and co...",9
2572,97101,"Gold, non-monetary, unwrought or semi-manufact...",9
2573,97102,"Rolled gold on base metal or silver, unworked ...",9
2574,97103,"Gold, silver and jewels sweepings, residues, l...",9


In [89]:
code_SITC.Category.unique()

array(['', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'T'],
      dtype=object)

In [90]:
# categorías de productos
code_SITC[code_SITC['Category'] =='']

Unnamed: 0,SITC_code,SITC_description,Category
0,0,Food and live animals chiefly for food,
265,1,Beverages and tobacco,
295,2,"Crude materials, inedible, except fuels",
577,3,"Mineral fuels, lubricants and related materials",
635,4,"Animal and vegetable oils, fats and waxes",
675,5,"Chemicals and related products, nes",
1053,6,Manufactured goods classified chiefly by mater...,
1726,7,Machinery and transport equipment,
2171,8,Miscellaneous manufactured articles,
2547,9,Commodities and transactions not classified el...,


In [91]:
categ_SITC_descrip = code_SITC[code_SITC['Category'] =='']
categ_SITC_descrip 

Unnamed: 0,SITC_code,SITC_description,Category
0,0,Food and live animals chiefly for food,
265,1,Beverages and tobacco,
295,2,"Crude materials, inedible, except fuels",
577,3,"Mineral fuels, lubricants and related materials",
635,4,"Animal and vegetable oils, fats and waxes",
675,5,"Chemicals and related products, nes",
1053,6,Manufactured goods classified chiefly by mater...,
1726,7,Machinery and transport equipment,
2171,8,Miscellaneous manufactured articles,
2547,9,Commodities and transactions not classified el...,


In [92]:
categ_SITC_descrip.to_csv('../data/processed/categ_SITC_descrip.csv', index=False)

In [37]:
code_SITC[code_SITC['Category'] =='T'] # chequeo existe una sola línea de total commodities

Unnamed: 0,SITC_code,SITC_description,Category
2575,Total,All Commodities,T


In [38]:
code_SITC = code_SITC[(code_SITC['Category'] != '') & (code_SITC['Category'] !='T')]
code_SITC

Unnamed: 0,SITC_code,SITC_description,Category
1,00,Live animals chiefly for food,0
2,001,Live animals chiefly for food,0
3,0011,Animals of the bovine species (including buffa...,0
4,00111,-- pure bred for breeding,0
5,00119,-- other than pure bred for breeding,0
...,...,...,...
2570,971,"Gold, non-monetary (excluding gold ores and co...",9
2571,9710,"Gold, non-monetary (excluding gold ores and co...",9
2572,97101,"Gold, non-monetary, unwrought or semi-manufact...",9
2573,97102,"Rolled gold on base metal or silver, unworked ...",9


In [39]:
code_SITC.SITC_code = code_SITC.SITC_code.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


#### Tabla de correspondencias SITC Rev2 y HS 1996

In [40]:
correl_SITC_HS = pd.read_csv('../data/raw/Nomenclaturas/JobID-23_Concordance_H1_to_S2.csv') 
correl_SITC_HS

Unnamed: 0,HS 1996 Product Code,HS 1996 Product Description,SITC Revision 2 Product Code,SITC Revision 2 Product Description
0,10111,Horses :-- Pure-bred breeding animals,15,"Horses, asses, mules and hinnies, live"
1,10119,Horses :-- Other,15,"Horses, asses, mules and hinnies, live"
2,10120,"Asses, mules and hinnies",15,"Horses, asses, mules and hinnies, live"
3,10210,Pure-bred breeding animals,111,"Bovine species pure bred,for breeding"
4,10290,Other,119,Bovine species other than pure bred breeding s...
...,...,...,...,...
5106,970200,"Original engravings, prints and lithographs.",89602,"Original engravings,prints and lithographs"
5107,970300,"Original sculptures and statuary, in any mater...",89603,Original sculptures and statuary
5108,970400,"Postage or revenue stamps, stamp-postmarks, fi...",89604,"Postage,revenue and sim.stamps"
5109,970500,Collections and collectors' pieces of zoologic...,89605,"Collections of zoological,botanical etc.interest"


In [41]:
correl_SITC_HS = correl_SITC_HS.rename(columns={'HS 1996 Product Code': 'HS_code', 'HS 1996 Product Description': 'HS_description', 'SITC Revision 2 Product Code':'SITC_Rev2_code', 'SITC Revision 2 Product Description':'SITC_Rev2_desc'})
correl_SITC_HS

Unnamed: 0,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc
0,10111,Horses :-- Pure-bred breeding animals,15,"Horses, asses, mules and hinnies, live"
1,10119,Horses :-- Other,15,"Horses, asses, mules and hinnies, live"
2,10120,"Asses, mules and hinnies",15,"Horses, asses, mules and hinnies, live"
3,10210,Pure-bred breeding animals,111,"Bovine species pure bred,for breeding"
4,10290,Other,119,Bovine species other than pure bred breeding s...
...,...,...,...,...
5106,970200,"Original engravings, prints and lithographs.",89602,"Original engravings,prints and lithographs"
5107,970300,"Original sculptures and statuary, in any mater...",89603,Original sculptures and statuary
5108,970400,"Postage or revenue stamps, stamp-postmarks, fi...",89604,"Postage,revenue and sim.stamps"
5109,970500,Collections and collectors' pieces of zoologic...,89605,"Collections of zoological,botanical etc.interest"


Los datos a nivel de producto se encuentran a 6 dígitos del SA, por lo que voy a tener que convertir a nivel de partida arancelaria (4 dígitos) para que coincida con los datos de comercio con que se está trabajando. 

In [42]:
# genero una nueva columna que considere nivel de partida arancelaria (4 dígitos) en vez de 6 digtios del SA (HS)
correl_SITC_HS['HS_4dig'] = np.where(correl_SITC_HS.HS_code.astype(str).str.len() == 5, correl_SITC_HS.HS_code.astype(str).str[:3], correl_SITC_HS.HS_code.astype(str).str[:4]) 
correl_SITC_HS

Unnamed: 0,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig
0,10111,Horses :-- Pure-bred breeding animals,15,"Horses, asses, mules and hinnies, live",101
1,10119,Horses :-- Other,15,"Horses, asses, mules and hinnies, live",101
2,10120,"Asses, mules and hinnies",15,"Horses, asses, mules and hinnies, live",101
3,10210,Pure-bred breeding animals,111,"Bovine species pure bred,for breeding",102
4,10290,Other,119,Bovine species other than pure bred breeding s...,102
...,...,...,...,...,...
5106,970200,"Original engravings, prints and lithographs.",89602,"Original engravings,prints and lithographs",9702
5107,970300,"Original sculptures and statuary, in any mater...",89603,Original sculptures and statuary,9703
5108,970400,"Postage or revenue stamps, stamp-postmarks, fi...",89604,"Postage,revenue and sim.stamps",9704
5109,970500,Collections and collectors' pieces of zoologic...,89605,"Collections of zoological,botanical etc.interest",9705


### Uno las tablas para conseguir un agrupamiento de bienes (según categoría SITC)

In [43]:
correl_SITC_HS = pd.merge(correl_SITC_HS, code_SITC, how='left', left_on=['SITC_Rev2_code'], right_on=['SITC_code'])
correl_SITC_HS          

Unnamed: 0,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
0,10111,Horses :-- Pure-bred breeding animals,15,"Horses, asses, mules and hinnies, live",101,15,"Equine species, live",0
1,10119,Horses :-- Other,15,"Horses, asses, mules and hinnies, live",101,15,"Equine species, live",0
2,10120,"Asses, mules and hinnies",15,"Horses, asses, mules and hinnies, live",101,15,"Equine species, live",0
3,10210,Pure-bred breeding animals,111,"Bovine species pure bred,for breeding",102,111,-- pure bred for breeding,0
4,10210,Pure-bred breeding animals,111,"Bovine species pure bred,for breeding",102,111,"Bovine meat, fresh, chilled or frozen",0
...,...,...,...,...,...,...,...,...
5254,970200,"Original engravings, prints and lithographs.",89602,"Original engravings,prints and lithographs",9702,89602,"Original engravings, prints and lithographs",8
5255,970300,"Original sculptures and statuary, in any mater...",89603,Original sculptures and statuary,9703,89603,Original sculptures and statuary,8
5256,970400,"Postage or revenue stamps, stamp-postmarks, fi...",89604,"Postage,revenue and sim.stamps",9704,89604,Stamps for philately,8
5257,970500,Collections and collectors' pieces of zoologic...,89605,"Collections of zoological,botanical etc.interest",9705,89605,"Coins, nature collections",8


In [44]:
correl_SITC_HS.Category.astype(int).unique()

array([0, 1, 9, 4, 3, 2, 5, 7, 6, 8])

In [45]:
df_correl_SITC_HS = correl_SITC_HS[['HS_4dig', 'Category']].drop_duplicates() # hay más de una categoría por prod, según se pudo verificar
df_correl_SITC_HS

Unnamed: 0,HS_4dig,Category
0,101,0
3,102,0
5,102,1
7,103,0
10,104,0
...,...,...
5254,9702,8
5255,9703,8
5256,9704,8
5257,9705,8


In [46]:
df_correl_SITC_HS.HS_4dig.value_counts().head(55) # hay 55 codigos que tienen asignada más de una categoría

3215    2
3306    2
907     2
9306    2
1805    2
8471    2
7112    2
904     2
2101    2
805     2
902     2
7204    2
2818    2
8402    2
104     2
905     2
401     2
903     2
210     2
9018    2
906     2
1212    2
2844    2
5105    2
910     2
909     2
3823    2
3104    2
6910    2
2007    2
6301    2
102     2
7102    2
8403    2
7322    2
2302    2
8480    2
301     2
403     2
901     2
1701    2
3603    2
8404    2
407     2
2009    2
1801    2
1702    2
204     2
302     2
2006    2
701     2
908     2
1001    2
8544    2
3102    2
Name: HS_4dig, dtype: int64

In [47]:
# armo lista de códigos duplicados con distintas categorías
list_duplic_tot = df_correl_SITC_HS.HS_4dig.value_counts().head(55)
list_duplic_tot = list(list_duplic_tot.index)

In [48]:
# filtro el dataset por duplicados para ver de qué bienes se trata
# la mayoría de los duplicados son prod agro y corresponden a la categ 0
df_duplic_tot = df_correl_SITC_HS[df_correl_SITC_HS.HS_4dig.isin(list_duplic_tot)]
df_duplic_tot 

Unnamed: 0,HS_4dig,Category
3,102,0
5,102,1
10,104,0
12,104,1
40,204,0
...,...,...
4722,8544,6
4947,9018,7
4953,9018,8
5116,9306,9


In [49]:
# se listan todas las categorías existentes por cada codigo de producto
L = df_duplic_tot.groupby('HS_4dig')['Category'].apply(list)
df_L = pd.DataFrame(L)
df_L.Category.head()

HS_4dig
1001    [0, 4]
102     [0, 1]
104     [0, 1]
1212    [0, 2]
1701    [0, 6]
Name: Category, dtype: object

In [50]:
# hago pruebas para extraer el dato de categoría más frecuente
a = [1936, 2401, 2916, 4761, 9216, 9216, 9604, 9801] 
Counter(a).most_common(1)[0][0]

9216

In [51]:
# se selecciona la categoría más frecuente para luego reemplazar en el dataset original 
df_L['categ_most_freq'] = [Counter(item).most_common(1) for item in df_L.Category]
df_L['categ_limpia'] = [item[0][0] for item in df_L.categ_most_freq]

In [52]:
df_L.head()

Unnamed: 0_level_0,Category,categ_most_freq,categ_limpia
HS_4dig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,"[0, 4]","[(0, 1)]",0
102,"[0, 1]","[(0, 1)]",0
104,"[0, 1]","[(0, 1)]",0
1212,"[0, 2]","[(0, 1)]",0
1701,"[0, 6]","[(0, 1)]",0


In [53]:
df_L = df_L[['categ_limpia']]
df_L = df_L.rename(columns={'categ_limpia':'Category'})
df_L['HS_4dig'] = df_L.index

In [54]:
df_L = df_L[['HS_4dig', 'Category']]
df_L.head()

Unnamed: 0_level_0,HS_4dig,Category
HS_4dig,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,1001,0
102,102,0
104,104,0
1212,1212,0
1701,1701,0


In [55]:
# excluyo las posiciones que estaban con más de una categoría
df_correl_SITC_HS = df_correl_SITC_HS.set_index(df_correl_SITC_HS.HS_4dig)
df_correl_SITC_HS = df_correl_SITC_HS[~df_correl_SITC_HS.HS_4dig.isin(list_duplic_tot)]
df_correl_SITC_HS

Unnamed: 0_level_0,HS_4dig,Category
HS_4dig,Unnamed: 1_level_1,Unnamed: 2_level_1
101,101,0
103,103,0
105,105,0
106,106,9
201,201,0
...,...,...
9702,9702,8
9703,9703,8
9704,9704,8
9705,9705,8


In [56]:
# agrego datos de categorías sin duplicar código de producto según la selección de categ más frecuente
df_correl_SITC_HS = df_correl_SITC_HS.append(df_L)

In [57]:
df_correl_SITC_HS # 1241 bienes

Unnamed: 0_level_0,HS_4dig,Category
HS_4dig,Unnamed: 1_level_1,Unnamed: 2_level_1
101,101,0
103,103,0
105,105,0
106,106,9
201,201,0
...,...,...
907,907,0
908,908,0
909,909,0
910,910,0


In [58]:
# guardo la lista para levantarla desde otros notebooks
df_correl_SITC_HS.to_csv('../data/processed/df_correl_SITC_HS_categ.csv', index=False)

In [59]:
# aquí abajo dejo pruebas que hice para categorizar listado de prod de la bioecon
# termino usando la anterior porque quiero poder clasificar todos los bienes de la base
df_list_bio.lista_prodbio = df_list_bio.lista_prodbio.astype(int)
correl_SITC_HS.HS_4dig = correl_SITC_HS.HS_4dig.astype(int)

In [60]:
# uno lista de prod bio con la categoría en SITC para tener un agrupamiento de bienes 
df_list_ = pd.merge(df_list_bio, correl_SITC_HS, how='left', left_on=['lista_prodbio'], right_on=['HS_4dig'])
df_list_

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
0,4101,410110,"Whole hides and skins of bovine animals, of a ...",2112,"Calf skins,raw (fresh,salted,dried,pickled/limed",4101,2112,"Calf skins, raw, whether or not split",2
1,4101,410121,"Other hides and skins of bovine animals, fresh...",2111,"Bovine & equine hides (other than calf),raw",4101,2111,"Bovine and equine hides, raw, whether or not s...",2
2,4101,410122,"Other hides and skins of bovine animals, fresh...",2111,"Bovine & equine hides (other than calf),raw",4101,2111,"Bovine and equine hides, raw, whether or not s...",2
3,4101,410129,"Other hides and skins of bovine animals, fresh...",2111,"Bovine & equine hides (other than calf),raw",4101,2111,"Bovine and equine hides, raw, whether or not s...",2
4,4101,410130,"Other hides and skins of bovine animals, other...",2111,"Bovine & equine hides (other than calf),raw",4101,2111,"Bovine and equine hides, raw, whether or not s...",2
...,...,...,...,...,...,...,...,...,...
2734,6117,611780,Other accessories,84719,"Made up accessories,n.e.s., of textile fabbrics",6117,84719,"Made up accessories, nes, for articles of apparel",8
2735,6117,611790,Parts,84719,"Made up accessories,n.e.s., of textile fabbrics",6117,84719,"Made up accessories, nes, for articles of apparel",8
2736,2849,284910,Of calcium,52393,Calcium carbide,2849,52393,Calcium carbide,5
2737,2849,284920,Of silicon,52394,"Carbides,other than calcium carbide",2849,52394,Carbides (other than calcium carbide),5


In [61]:
df_list2 = df_list_[['lista_prodbio','Category']].drop_duplicates()
df_list2 # 632 no coincide con los 593, así que voy a revisar si no hay codigos que tengan más de una categoría

Unnamed: 0,lista_prodbio,Category
0,4101,2
6,4102,2
9,4103,2
12,4104,6
18,4105,6
...,...,...
2715,6114,8
2719,6115,8
2727,6116,8
2732,6117,8


In [62]:
# chequeo dato de categorías de productos
df_list2.Category.unique() 

array(['2', '6', '0', '7', '8', '1', '9', '5', '4', '3'], dtype=object)

In [63]:
# efectivamente hay códigos con más de una categoría 
df_list2.lista_prodbio.value_counts().head(39) 

910     2
805     2
2007    2
2006    2
2302    2
3823    2
1701    2
6301    2
3306    2
102     2
104     2
210     2
1801    2
204     2
909     2
908     2
907     2
906     2
905     2
904     2
903     2
3215    2
902     2
901     2
401     2
403     2
407     2
2009    2
1702    2
2101    2
301     2
2818    2
1805    2
1001    2
5105    2
3603    2
302     2
1212    2
701     2
Name: lista_prodbio, dtype: int64

In [64]:
list_duplic = df_list2.lista_prodbio.value_counts().head(39) 
list_duplic = list(list_duplic.index)

In [65]:
# la mayoría de los duplicados son prod agro y corresponden a la categ 0 => 33 bienes (vamos luego a filtrar para que quede cat 0)
df_duplic = df_list2[df_list2.lista_prodbio.isin(list_duplic)]
df_duplic[:20] 

Unnamed: 0,lista_prodbio,Category
37,2101,0
38,2101,7
178,102,0
180,102,1
185,104,0
187,104,1
245,6301,7
246,6301,6
324,204,0
325,204,1


In [66]:
# df_duplic[20:40] 

In [67]:
# df_duplic[40:60] 

In [68]:
# df_duplic[60:80] 

In [69]:
len(df_list2[(df_list2.lista_prodbio.isin(list_duplic)) & (df_list2.Category=='0')]) # 33 de los 39 bienes corresponden a categoría 0

33

In [70]:
df_list2[(df_list2.lista_prodbio.isin(list_duplic)) & (df_list2.Category=='0')]

Unnamed: 0,lista_prodbio,Category
37,2101,0
178,102,0
185,104,0
324,204,0
369,210,0
406,2302,0
462,301,0
472,302,0
669,401,0
680,403,0


In [71]:
# veamos que ocurre con los 6 bienes no agro que están duplicados, a cuál categoría efectivamente corresponden

In [72]:
df_list_[df_list_.lista_prodbio==6301] # mayoría de los subproductos a 6 díg corresponden a 6 así que le pondre esa cat

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
245,6301,630110,Electric blankets,77585,Electric blankets,6301,77585,Electric blankets,7
246,6301,630120,Blankets (other than electric blankets) and tr...,65831,"Travelling rugs and blankets,of wool/fin.anim....",6301,65831,-- of wool or fine animal hair,6
247,6301,630130,Blankets (other than electric blankets) and tr...,65832,"Travelling rugs and blankets,of cotton",6301,65832,-- of cotton,6
248,6301,630140,Blankets (other than electric blankets) and tr...,65833,"Travelling rugs and blankets,of synthetic fibres",6301,65833,-- of synthetic fibres,6
249,6301,630190,Other blankets and travelling rugs,65839,"Travelling rugs and blankets,of other fibres",6301,65839,-- of other fibres,6


In [73]:
df_list_[df_list_.lista_prodbio==2818] # cat 5 

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
1006,2818,281810,"Artificial corundum, whether or not chemically...",52257,Artificial corundum,2818,52257,Artificial corundum,5
1007,2818,281820,Aluminium oxide; other than artificial corundum,28732,Alumina (aluminium oxide),2818,28732,Alumina (aluminium oxide),2
1008,2818,281830,Aluminium hydroxide,52256,Aluminium hydroxide,2818,52256,Aluminium hydroxide,5


In [74]:
df_list_[df_list_.lista_prodbio==5105] # cat 2

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
1647,5105,510510,Carded wool,2687,"Sheep's/lamb's wool/other aimal hair,carded/co...",5105,2687,"Sheep's or lambs' wool, or of other animal hai...",2
1648,5105,510521,Wool tops and other combed wool :-- Combed woo...,2687,"Sheep's/lamb's wool/other aimal hair,carded/co...",5105,2687,"Sheep's or lambs' wool, or of other animal hai...",2
1649,5105,510529,Wool tops and other combed wool :-- Other,65121,Wool tops,5105,65121,Wool tops,6
1650,5105,510530,"Fine animal hair, carded or combed",2687,"Sheep's/lamb's wool/other aimal hair,carded/co...",5105,2687,"Sheep's or lambs' wool, or of other animal hai...",2
1651,5105,510540,"Coarse animal hair, carded or combed",2687,"Sheep's/lamb's wool/other aimal hair,carded/co...",5105,2687,"Sheep's or lambs' wool, or of other animal hai...",2


In [75]:
df_list_[df_list_.lista_prodbio==3215] # cat 5

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
1869,3215,321511,Printing ink :-- Black,5332,Printing ink,3215,5332,Printing inks,5
1870,3215,321519,Printing ink :-- Other,5332,Printing ink,3215,5332,Printing inks,5
1871,3215,321590,Other,89591,Writing and other inks,3215,89591,"Writing ink, excluding printing ink",8


In [76]:
df_list_[df_list_.lista_prodbio==3306] # cat 5

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
1969,3306,330610,Dentifrices,5530,"Perfumery,cosmetics and toilet preparations",3306,5530,"Perfumery, cosmetics, toilet preparations, etc",5
1970,3306,330620,Yarn used to clean between the teeth (dental f...,65142,"Yarn non-textured of contin.polyamide fibres,n...",3306,65142,"-- nontextured of continuous polyamide, from n...",6
1971,3306,330690,Other,5530,"Perfumery,cosmetics and toilet preparations",3306,5530,"Perfumery, cosmetics, toilet preparations, etc",5


In [77]:
df_list_[df_list_.lista_prodbio==3823] # cat 4

Unnamed: 0,lista_prodbio,HS_code,HS_description,SITC_Rev2_code,SITC_Rev2_desc,HS_4dig,SITC_code,SITC_description,Category
2485,3823,382311,Industrial monocarboxylic fatty acids; acid oi...,43131,Fatty acids;acid oils from refining,3823,43131,Fatty acids; acid oils from refining,4
2486,3823,382312,Industrial monocarboxylic fatty acids; acid oi...,43131,Fatty acids;acid oils from refining,3823,43131,Fatty acids; acid oils from refining,4
2487,3823,382313,Industrial monocarboxylic fatty acids; acid oi...,43131,Fatty acids;acid oils from refining,3823,43131,Fatty acids; acid oils from refining,4
2488,3823,382319,Industrial monocarboxylic fatty acids; acid oi...,43131,Fatty acids;acid oils from refining,3823,43131,Fatty acids; acid oils from refining,4
2489,3823,382370,Industrial fatty alcohols,51217,Fatty alcohols,3823,51217,Fatty alcohols,5


In [78]:
df_list = df_list2[~df_list2.lista_prodbio.isin(list_duplic)] # excluyo los datos duplicados
df_list

Unnamed: 0,lista_prodbio,Category
0,4101,2
6,4102,2
9,4103,2
12,4104,6
18,4105,6
...,...,...
2715,6114,8
2719,6115,8
2727,6116,8
2732,6117,8


In [79]:
# agrego datos sin duplicar, en base a las decisiones previas que tomé
df_list = df_list.append(df_list2[(df_list2.lista_prodbio.isin(list_duplic)) & (df_list2.Category=='0')])
df_list

Unnamed: 0,lista_prodbio,Category
0,4101,2
6,4102,2
9,4103,2
12,4104,6
18,4105,6
...,...,...
2499,1801,0
2505,1805,0
2589,2006,0
2591,2007,0


In [80]:
df_duplic[~df_duplic.lista_prodbio.isin(df_duplic[df_duplic.Category=='0'].lista_prodbio)]

Unnamed: 0,lista_prodbio,Category
245,6301,7
246,6301,6
1006,2818,5
1007,2818,2
1647,5105,2
1649,5105,6
1869,3215,5
1871,3215,8
1969,3306,5
1970,3306,6


In [81]:
# selecciono las categorías según la decisión que tomé antes
df_duplic[~df_duplic.lista_prodbio.isin(df_duplic[df_duplic.Category=='0'].lista_prodbio)].loc[[246,1006,1647,1869,1969,2485],:]

Unnamed: 0,lista_prodbio,Category
246,6301,6
1006,2818,5
1647,5105,2
1869,3215,5
1969,3306,5
2485,3823,4


In [82]:
# agrego datos sin duplicar de los restantes bienes
df_list = df_list.append(df_duplic[~df_duplic.lista_prodbio.isin(df_duplic[df_duplic.Category=='0'].lista_prodbio)].loc[[246,1006,1647,1869,1969,2485],:])
df_list # ahora si tengo los 593 bienens con su respectiva categoría

Unnamed: 0,lista_prodbio,Category
0,4101,2
6,4102,2
9,4103,2
12,4104,6
18,4105,6
...,...,...
1006,2818,5
1647,5105,2
1869,3215,5
1969,3306,5
