In [1]:
from compute_lda import *



### Some functions

In [2]:
def count_country(df,name):
    cols = ['port_of_unlading','foreign_port_of_lading','place_of_receipt'
            ,'port_of_destination','foreign_port_of_destination'
           ,'consignee_address','shipper_address']
    print('total rows is {}'.format(data.shape[0]))
    for i,col in enumerate(cols):
        temp_col = data[col].str.contains(name,case=False,regex=False)
        print('{} contains {} of rows of {}'.format(col,temp_col.sum(),name))
        if i==0:
            sum_col = temp_col.copy()
        else:
            sum_col = sum_col.add(temp_col,fill_value=0)
    print('{} of rows are {}'.format(sum_col[sum_col>0].sum(),name))

In [3]:
six_countries = ['Germany','Belgium', 'Vietnam' ,'Spain', 'United Kingdom','France']

### Data Processing

In [4]:
data = read_data()

In [5]:
data.shape

(3825317, 26)

In [6]:
def simple_process_data(data):
    # Basic data cleaning
    data = data.dropna(subset=['shipper_party_name','harmonized_number'])
    replace_char = ",.+=_-><\'\":;()!?~/\\@#$%^&*~`[]{}"
    replace_dict = {key:value for (key,value) in zip(replace_char,itertools.repeat(''))}
    data['cl_shipper_party_name'] = data['shipper_party_name'].str.translate(str.maketrans(replace_dict)).copy()
    data = data.assign(shipper_id=(data['cl_shipper_party_name']).astype('category').cat.codes)
    data['6_harmonized_number'] = data['harmonized_number'].apply(lambda x: str(x)[0:6])
    return data

In [7]:
data = simple_process_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
data.shape

(1315991, 29)

In [9]:
data.columns

Index(['identifier', 'port_of_unlading', 'estimated_arrival_date',
       'foreign_port_of_lading', 'record_status_indicator', 'place_of_receipt',
       'port_of_destination', 'foreign_port_of_destination',
       'actual_arrival_date', 'consignee_name', 'consignee_address',
       'consignee_contact_name', 'consignee_comm_number_qualifier',
       'consignee_comm_number', 'shipper_party_name', 'shipper_address',
       'shipper_contact_name', 'shipper_comm_number_qualifier',
       'shipper_comm_number', 'description_sequence_number', 'piece_count',
       'description_text', 'harmonized_number', 'harmonized_value',
       'harmonized_weight', 'harmonized_weight_unit', 'cl_shipper_party_name',
       'shipper_id', '6_harmonized_number'],
      dtype='object')

### Bag of Words

In [26]:
bag_of_words = data.groupby(by=['6_harmonized_number','shipper_party_name']).size()#.unstack(fill_value=0)

In [32]:
data['actual_arrival_date'].describe()

count                 1315456
unique                    307
top       2018-02-03 00:00:00
freq                    20390
first     2014-07-11 00:00:00
last      2018-05-28 00:00:00
Name: actual_arrival_date, dtype: object

In [33]:
data[data['actual_arrival_date'] >= '2018-01-01']

Unnamed: 0,identifier,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,foreign_port_of_destination,actual_arrival_date,consignee_name,...,description_sequence_number,piece_count,description_text,harmonized_number,harmonized_value,harmonized_weight,harmonized_weight_unit,cl_shipper_party_name,shipper_id,6_harmonized_number
0,2018022169540,"Norfolk, Virginia",2018-02-18,"Anvers,Belgium",New,ANTWERP,,,2018-02-20,"ECU WORLDWIDE (USA), INC.",...,1,27.0,27 PACKAGES ON 27 PALLETS BEING 15 BIGBAGS N ...,701911,0.0,0.0,,FCL MARINE AGENCIES BV,11820,701911
67,2018022833768,"Baltimore, Maryland",2018-02-25,"Anvers,Belgium",New,GOTHENBURG,,,2018-02-27,OVERSEAS PAPERBOARD CORPORATION,...,1,9.0,NEW BILLERUD FLUTE ORDER REF632931 CIP BALTIMO...,480256,46966.0,23483.0,Kilograms,BILLERUDKORSNAS AB,4099,480256
69,2018022833768,"Baltimore, Maryland",2018-02-25,"Anvers,Belgium",New,GOTHENBURG,,,2018-02-27,OVERSEAS PAPERBOARD CORPORATION,...,1,9.0,NEW BILLERUD FLUTE ORDER REF632931 CIP BALTIMO...,480256,46966.0,23483.0,Kilograms,BILLERUDKORSNAS AB,4099,480256
70,2018022833768,"Baltimore, Maryland",2018-02-25,"Anvers,Belgium",New,GOTHENBURG,,,2018-02-27,OVERSEAS PAPERBOARD CORPORATION,...,1,8.0,NEW BILLERUD FLUTE ORDER REF632931 CIP BALTIMO...,480256,42144.0,21072.0,Kilograms,BILLERUDKORSNAS AB,4099,480256
78,2018022838502,"New York, New York",2018-02-23,"Anvers,Belgium",Amended,ANTWERP,,,2018-02-25,JAGRO CUSTOMS BROKERS INC,...,5,2.0,CASES S.T.C. PARTS AND ACCESSORIES FOR CHEMICA...,730630,1986.0,993.0,Kilograms,SCHNEIDER CIE AG,30097,730630
88,2018022843025,"Norfolk, Virginia",2018-02-23,"Anvers,Belgium",New,TROISDORF DE,"Cincinnati, Ohio",,2018-02-27,CARGO BROKERS INTERNATIONAL INC.,...,1,10.0,HOT ROLLED STEEL PROFILES,721650,374920.0,18746.0,Kilograms,KUEHNE NAGEL AG CO KG,19440,721650
89,2018022843025,"Norfolk, Virginia",2018-02-23,"Anvers,Belgium",New,TROISDORF DE,"Cincinnati, Ohio",,2018-02-27,CARGO BROKERS INTERNATIONAL INC.,...,1,12.0,HOT ROLLED STEEL PROFILES,721650,371320.0,18566.0,Kilograms,KUEHNE NAGEL AG CO KG,19440,721650
325,2018030548912,"Baltimore, Maryland",2018-03-02,"Zeebrugge,Belgium",New,ZEEBRUGGE,,,2018-03-03,DELUXE GROUP LTD,...,1,1.0,G781C ESSENTIEL 2018 CHASSIS,8716100030,0.0,3001.0,Kilograms,GP SAS GROUPE PILOTE,14070,871610
326,2018030548912,"Baltimore, Maryland",2018-03-02,"Zeebrugge,Belgium",New,ZEEBRUGGE,,,2018-03-03,DELUXE GROUP LTD,...,3,1.0,G781C ESSENTIEL 2018 CHASSIS,8716100030,0.0,3001.0,Kilograms,GP SAS GROUPE PILOTE,14070,871610
327,2018030637395,"Charleston, South Carolina",2018-02-28,"Anvers,Belgium",Amended,ANTWERP,,,2018-03-03,DSV AIR & SEA SA DE CV,...,1,48.0,BODY PARTS HS CODE: 87082 990 COUNTR...,847910,393768.0,43752.0,Pounds,DSV AIR SEA GMBH,9614,847910


In [27]:
bag_of_words

6_harmonized_number  shipper_party_name                 
010121               INTERTEAM, S.A. DE C.V.                 1
                     PANALPINA WELTTRANSPORT                 1
                     TRAFIMAR RELOCATION S                   1
010129               DACOTRANS DE LATINOAMERICA COLOMBIA     6
                     INTERTEAM, S.A. DE C.V.                 1
                     JOHN MASON INTERNATIONAL LIMITED        2
                     OLDHAMS REMOVALS LTD                    1
                     ROY TREVOR LTD                          1
010190               DEN HARTOGH AMERICAS INC.               5
                     MICO VIRIDIS FEED SOLUTIONS LLC        10
010229               ARLANXEO SINGAPORE PTE. LTD.            2
                     IMPEXTRACO NV                           2
                     TRADEWINDS INTERNATIONAL, LLC          18
010612               WILHELM REUSS GMBH & CO.KG              1
010614               EJ FOODS                                

In [22]:
bag_of_words.describe()

count    76757.000000
mean        17.144899
std        170.115882
min          1.000000
25%          1.000000
50%          3.000000
75%          9.000000
max      26376.000000
dtype: float64

In [23]:
bag_of_words.sort_values()

6_harmonized_number  shipper_party_name                 
010121               INTERTEAM, S.A. DE C.V.                    1
740721               FILTO PROFILES SL                          1
                     GLOBELINK UNIEXCO, S.L. (BARCELONA)        1
                     PANALPINA WELTTRANSPORT                    1
390330               ELIX POLYMERS S.L.                         1
740721               TRANSGLORY S.A.                            1
390330               CEVA FREIGHT (ESPANA), S.L.U. AS AG        1
740721               URBATEK CERAMICS S.A.                      1
                     WALCOWNIA METALI DZIEDZICE                 1
920992               CARGO MARKETING SERVICES LIMITED           1
960330               SACO SHIPPING GMBH                         1
740729               ALLIANCE SHIPPING LONDON                   1
                     FEDEX TRADE NETWORKS TRANSPORT &           1
390319               TRANSPED SHIPPING LEVANTE S.L.             1
                   