In [201]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_lg

import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, ConfusionMatrixDisplay


In [202]:
data = pd.read_csv('./data/ds-project-train.csv',low_memory=False)

In [203]:
data.shape

(143280, 30)

In [204]:
data.head()

Unnamed: 0.1,Unnamed: 0,SHIPPER,SHIPPER.ADDRESS,CONSIGNEE,CONSIGNEE.ADDRESS,ZIPCODE,NOTIFY,NOTIFY.ADDRESS,BILL.OF.LADING,ARRIVAL.DATE,...,PRODUCT.DETAILS,MARKS.AND.NUMBERS,COUNTRY.OF.ORIGIN,DISTRIBUTION.PORT,CARRIER.CODE,CARRIER.NAME,CARRIER.ADDRESS,CARRIER.CITY,CARRIER.STATE,CARRIER.ZIP
0,1,,,-NOT AVAILABLE-,,0,,,CMDUSZ2359136,09/20/2012,...,PALLET SKIRTPALLET SKIRT FOR ITEM 948734PURCHA...,"DISTRIBUTED BY SAM-S WEST,INC. BENTONVILLE, AR...",China,,CMDU,COMPAGNIE MARITIME D-AFFRETEMENT,5701 LAKE WRIGHT DR,NORFOLK,VA,23502
1,3,,,-NOT AVAILABLE-,,0,,,CMDUSZ2359139,09/20/2012,...,AIR HOGS HELI CAGE ASTAIR HOGS HELI CAGE ASTP....,TO: WAL-MART CASE IDENTIFICATI ON NUMBER US DE...,China,,CMDU,COMPAGNIE MARITIME D-AFFRETEMENT,5701 LAKE WRIGHT DR,NORFOLK,VA,23502
2,6,,,-NOT AVAILABLE-,,0,,,CMDUSZ2359147,09/20/2012,...,BLACK FLAT PANEL TV MOUNTBLACK FLAT PANEL TV M...,TO WAL-MART USA DEPT 00072 PO 9352743173 ITEM ...,China,,CMDU,COMPAGNIE MARITIME D-AFFRETEMENT,5701 LAKE WRIGHT DR,NORFOLK,VA,23502
3,7,,,-NOT AVAILABLE-,,0,,,CMDUSZ2359149,09/20/2012,...,ON-WALL GLASS SHELFON-WALL GLASS SHELFP.O.NO.:...,TO WAL-MART USA DEPT 00072 PO 9352742880 ITEM ...,China,,CMDU,COMPAGNIE MARITIME D-AFFRETEMENT,5701 LAKE WRIGHT DR,NORFOLK,VA,23502
4,8,,,-NOT AVAILABLE-,,0,,,CMDUSZ2359150,09/20/2012,...,8PCS CONDIMENT SET WITH MDFLAZY SUSAN8PCS COND...,SWC P.O. 4895633105 CATEGORY 00014 SWC ITEM 63...,China,,CMDU,COMPAGNIE MARITIME D-AFFRETEMENT,5701 LAKE WRIGHT DR,NORFOLK,VA,23502


In [205]:
data.columns

Index(['Unnamed: 0', 'SHIPPER', 'SHIPPER.ADDRESS', 'CONSIGNEE',
       'CONSIGNEE.ADDRESS', 'ZIPCODE', 'NOTIFY', 'NOTIFY.ADDRESS',
       'BILL.OF.LADING', 'ARRIVAL.DATE', 'WEIGHT..LB.', 'WEIGHT..KG.',
       'US.PORT', 'QUANTITY', 'Q.UNIT', 'MEASUREMENT', 'M.UNIT',
       'SHIP.REGISTERED.IN', 'VESSEL.NAME', 'CONTAINER.COUNT',
       'PRODUCT.DETAILS', 'MARKS.AND.NUMBERS', 'COUNTRY.OF.ORIGIN',
       'DISTRIBUTION.PORT', 'CARRIER.CODE', 'CARRIER.NAME', 'CARRIER.ADDRESS',
       'CARRIER.CITY', 'CARRIER.STATE', 'CARRIER.ZIP'],
      dtype='object')

In [206]:
#rename columns as it contains dot
data = data.rename(columns={'SHIPPER.ADDRESS': 'SHIPPER_ADDRESS', 'CONSIGNEE.ADDRESS': 'CONSIGNEE_ADDRESS'})
data = data.rename(columns={'NOTIFY.ADDRESS': 'NOTIFY_ADDRESS', 'BILL.OF.LADING': 'BILL_OF_LADING','ARRIVAL.DATE':'ARRIVAL_DATE'})
data = data.rename(columns={'WEIGHT..LB.': 'WEIGHT_LB', 'WEIGHT..KG.': 'WEIGHT_KG','US.PORT':'US_PORT','Q.UNIT':'Q_UNIT'})
data = data.rename(columns={'M.UNIT': 'M_UNIT', 'SHIP.REGISTERED.IN': 'SHIP_REGISTERED_IN','VESSEL.NAME':'VESSEL_NAME'})


data = data.rename(columns={'CONTAINER.COUNT': 'CONTAINER_COUNT', 'PRODUCT.DETAILS': 'PRODUCT_DETAILS','MARKS.AND.NUMBERS':'MARKS_AND_NUMBERS'})
data = data.rename(columns={'COUNTRY.OF.ORIGIN': 'COUNTRY_OF_ORIGIN', 'DISTRIBUTION.PORT': 'DISTRIBUTION_PORT','CARRIER.CODE':'CARRIER_CODE'})
data = data.rename(columns={'CARRIER.NAME': 'CARRIER_NAME', 'CARRIER.ADDRESS': 'CARRIER_ADDRESS','CARRIER.CITY':'CARRIER_CITY'})
data = data.rename(columns={'CARRIER.STATE': 'CARRIER_STATE', 'CARRIER.ZIP': 'CARRIER_ZIP'})
data = data.rename(columns={'PRODUCT.DETAILS': 'PRODUCT_DETAILS'})


In [209]:
#print(data['ENTITY'])

In [210]:
data.COUNTRY_OF_ORIGIN.unique()

array(['China', 'Hong Kong', 'Egypt', 'India', 'Malaysia', 'Pakistan',
       'Thailand', 'Singapore', 'South Korea', 'Panama', 'Vietnam',
       'China Taiwan', 'Brazil', 'Chile', 'Oman', 'Italy', 'Sri Lanka',
       'Bahamas', 'Jamaica', 'Spain', 'Portugal', nan, 'Honduras',
       'South Africa', 'United Kingdom', 'United Arab Em',
       'Dominican Republic', 'Japan', 'Mexico', 'Netherlands',
       'Costa Rica', 'Guatemala', 'Belgium', 'France', 'Israel',
       'Unknown', 'Canada', 'Indonesia', 'Jordan', 'Germany', 'Colombia',
       'New Zealand', 'Philippines', 'Turkey', 'Saudi Arabia', 'Iceland',
       'Argentina', 'Sweden', 'Ecuador', 'US Virgin Is', 'Taiwan',
       'Australia', 'Bermuda', 'Cayman Isl', 'Belize', 'Malta', 'Peru',
       'Morocco', 'Greece', 'Guam', 'Romania', 'American Samoa',
       'Cambodia', 'Haiti', 'Guadeloupe', 'Trinidad', 'Venezuela',
       'Federal Republic of Germany', 'Neth Antilles', 'Denmark',
       'Kuwait'], dtype=object)

In [211]:
#remove row which has country of origin is nan and Unknown
data.drop(data[data['COUNTRY_OF_ORIGIN'] == 'nan'].index, inplace = True)
data.drop(data[data['COUNTRY_OF_ORIGIN'] == 'Unknown'].index, inplace = True)
data.drop(data[data['COUNTRY_OF_ORIGIN'] == 'NaN'].index, inplace = True)

In [212]:
data.loc[data['COUNTRY_OF_ORIGIN']=='nan']
data = data.dropna()
data.COUNTRY_OF_ORIGIN.unique()

array(['China', 'Honduras', 'Pakistan', 'United Arab Em', 'India',
       'Spain', 'Oman', 'Malaysia', 'Hong Kong', 'South Africa',
       'Vietnam', 'Thailand', 'Singapore', 'Brazil', 'South Korea',
       'Belgium', 'Sweden', 'Egypt', 'United Kingdom', 'Italy', 'Panama',
       'China Taiwan', 'Japan', 'Jordan', 'Germany', 'Australia'],
      dtype=object)

In [213]:
data.shape

(1453, 30)

In [214]:
nlp = spacy.load('en_core_web_lg')

data['ENTITY'] = data['PRODUCT_DETAILS'].apply(lambda x: list(nlp(x).ents))
#table = []
#for ent in doc.ents:
#    table.append([ent.text,ent.label_,spacy.explain(ent.label_)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [215]:
feature_cols=['SHIPPER','SHIPPER_ADDRESS','CONSIGNEE','ZIPCODE','WEIGHT_LB',
             'US_PORT','SHIP_REGISTERED_IN','VESSEL_NAME', 'CONTAINER_COUNT',
              'DISTRIBUTION_PORT', 'CARRIER_CODE','COUNTRY_OF_ORIGIN']
df = data[feature_cols]

In [216]:
df.head()

Unnamed: 0,SHIPPER,SHIPPER_ADDRESS,CONSIGNEE,ZIPCODE,WEIGHT_LB,US_PORT,SHIP_REGISTERED_IN,VESSEL_NAME,CONTAINER_COUNT,DISTRIBUTION_PORT,CARRIER_CODE,COUNTRY_OF_ORIGIN
9308,ACE INTERNATIONAL ELECTRONICS LTD,"RILEY HOUSE, 88 LEI MUK ROAD KWAI CHUNG, NT UN...","WAL-MART STORES, INC.",72716,65962.0,"Seattle, Washington",DENMARK,CLEMENTINE MAERSK,3.0,"Chicago, Illinois",MAEU,China
9623,"AGROPECUARIA MONTELIBANO,S.A. DE C.","BARRIO LA CRUZ, CALLE PRINCIPAL SAN LORENZO,VA...",ORCHARD HOUSE FOODS LTD,63872,126715.0,"Norfolk, Virginia",GERMANY,ISLANDIA,3.0,"Norfolk, Virginia",MAEU,Honduras
9629,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,85948.0,"Los Angeles, California",DENMARK,METTE MAERSK,2.0,"Dallas/Ft. Worth Airport, Texas",MAEU,China
9630,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,115413.0,"Los Angeles, California",SINGAPORE,MAERSK ALGOL,3.0,"Dallas/Ft. Worth Airport, Texas",MAEU,China
9650,AL KARAM TOWEL INDUSTRIES (PVT) L,D-11 D-11 SITE SUPER HIGHWAY SCHEME 33 KARACHI...,WAL-MART CANADA CORP.,0,134337.0,"New York/Newark Area, Newark, New Jersey",UNITED STATES,MAERSK CAROLINA,6.0,"Buffalo-Niagara Falls, New York",MAEU,Pakistan


In [217]:
DUMMY_PORT = pd.get_dummies(df['US_PORT'])
df = pd.merge(
    left=df,
    right=DUMMY_PORT,
    left_index=True,
    right_index=True,
)
df.drop('US_PORT', axis=1, inplace=True)
df.head()

Unnamed: 0,SHIPPER,SHIPPER_ADDRESS,CONSIGNEE,ZIPCODE,WEIGHT_LB,SHIP_REGISTERED_IN,VESSEL_NAME,CONTAINER_COUNT,DISTRIBUTION_PORT,CARRIER_CODE,...,"Houston, Texas","Long Beach, California","Los Angeles, California","Miami, Florida","New York/Newark Area, Newark, New Jersey","Norfolk, Virginia","Oakland, California","Philadelphia, Pennsylvania","Savannah, Georgia","Seattle, Washington"
9308,ACE INTERNATIONAL ELECTRONICS LTD,"RILEY HOUSE, 88 LEI MUK ROAD KWAI CHUNG, NT UN...","WAL-MART STORES, INC.",72716,65962.0,DENMARK,CLEMENTINE MAERSK,3.0,"Chicago, Illinois",MAEU,...,0,0,0,0,0,0,0,0,0,1
9623,"AGROPECUARIA MONTELIBANO,S.A. DE C.","BARRIO LA CRUZ, CALLE PRINCIPAL SAN LORENZO,VA...",ORCHARD HOUSE FOODS LTD,63872,126715.0,GERMANY,ISLANDIA,3.0,"Norfolk, Virginia",MAEU,...,0,0,0,0,0,1,0,0,0,0
9629,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,85948.0,DENMARK,METTE MAERSK,2.0,"Dallas/Ft. Worth Airport, Texas",MAEU,...,0,0,1,0,0,0,0,0,0,0
9630,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,115413.0,SINGAPORE,MAERSK ALGOL,3.0,"Dallas/Ft. Worth Airport, Texas",MAEU,...,0,0,1,0,0,0,0,0,0,0
9650,AL KARAM TOWEL INDUSTRIES (PVT) L,D-11 D-11 SITE SUPER HIGHWAY SCHEME 33 KARACHI...,WAL-MART CANADA CORP.,0,134337.0,UNITED STATES,MAERSK CAROLINA,6.0,"Buffalo-Niagara Falls, New York",MAEU,...,0,0,0,0,1,0,0,0,0,0


In [218]:
df['ZIPCODE'].unique()

array(['72716', '63872', '60007', '0', '21224', '10314', '1-209836',
       '1-479273', '95376', '33014', '1-479277', '78044', '38672',
       '19201', '72712', '30354', '1-978879', '1-7146801-978879', '1930',
       '60191', '22210', '60056', '1-847228', '71330', '30297', '90220',
       '41018', '77031', '92647', '1-212840', '1-416391', '1-503288',
       '1-800308', '1-630810', '1-514685', '60433', '1-905673',
       '1-514956', '1-859538', '1-416766', '1-514905', '1-310216',
       '1-630307', '1-215922', '1-626281', '1-678553', '1-732640',
       '1-757626', '90703', '1-786336', '33182', '90012', '90230',
       '85282', '90670', '1-417823', '77042', '1-732283', '1-843554',
       '1-206448', '1-516280', '11530', '68138', '60106', '38671',
       '1-310469', '45241', '1-514420', '1-908704', '28273', '7102',
       '1-416255', '30022', '37214', '1-404675', '42303', '40004',
       '27101', '1-832295', '1-956727', '1-281582', '13124',
       '1-2815821-281582', '91748', '1-626810', 

In [219]:
df['WEIGHT_LB'] = df['WEIGHT_LB'].astype(float)
df['CONTAINER_COUNT'] = df['CONTAINER_COUNT'].astype(float)
#df['ZIPCODE']=df['ZIPCODE'].str.split('1-').str[0]
#df['ZIPCODE']=pd.to_numeric(df['ZIPCODE'])
#ZIP COde is a categorical

In [220]:
df.head()

Unnamed: 0,SHIPPER,SHIPPER_ADDRESS,CONSIGNEE,ZIPCODE,WEIGHT_LB,SHIP_REGISTERED_IN,VESSEL_NAME,CONTAINER_COUNT,DISTRIBUTION_PORT,CARRIER_CODE,...,"Houston, Texas","Long Beach, California","Los Angeles, California","Miami, Florida","New York/Newark Area, Newark, New Jersey","Norfolk, Virginia","Oakland, California","Philadelphia, Pennsylvania","Savannah, Georgia","Seattle, Washington"
9308,ACE INTERNATIONAL ELECTRONICS LTD,"RILEY HOUSE, 88 LEI MUK ROAD KWAI CHUNG, NT UN...","WAL-MART STORES, INC.",72716,65962.0,DENMARK,CLEMENTINE MAERSK,3.0,"Chicago, Illinois",MAEU,...,0,0,0,0,0,0,0,0,0,1
9623,"AGROPECUARIA MONTELIBANO,S.A. DE C.","BARRIO LA CRUZ, CALLE PRINCIPAL SAN LORENZO,VA...",ORCHARD HOUSE FOODS LTD,63872,126715.0,GERMANY,ISLANDIA,3.0,"Norfolk, Virginia",MAEU,...,0,0,0,0,0,1,0,0,0,0
9629,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,85948.0,DENMARK,METTE MAERSK,2.0,"Dallas/Ft. Worth Airport, Texas",MAEU,...,0,0,1,0,0,0,0,0,0,0
9630,AIR SEA TRANSPORT INC,"3/F, GOLDSLAND BUILDING, NO.22-26 MINDEN AVENU...",AIR SEA TRANSPORT (CHICAGO) INC,60007,115413.0,SINGAPORE,MAERSK ALGOL,3.0,"Dallas/Ft. Worth Airport, Texas",MAEU,...,0,0,1,0,0,0,0,0,0,0
9650,AL KARAM TOWEL INDUSTRIES (PVT) L,D-11 D-11 SITE SUPER HIGHWAY SCHEME 33 KARACHI...,WAL-MART CANADA CORP.,0,134337.0,UNITED STATES,MAERSK CAROLINA,6.0,"Buffalo-Niagara Falls, New York",MAEU,...,0,0,0,0,1,0,0,0,0,0


In [221]:
categorical=['SHIPPER','SHIPPER_ADDRESS','CONSIGNEE','SHIP_REGISTERED_IN','VESSEL_NAME','DISTRIBUTION_PORT','CARRIER_CODE','ZIPCODE','COUNTRY_OF_ORIGIN']

In [222]:
# Label encoding for categoricals
le = preprocessing.LabelEncoder()
for cat in categorical:
    df[cat] = le.fit_transform(df[cat])


In [223]:
df.head()

Unnamed: 0,SHIPPER,SHIPPER_ADDRESS,CONSIGNEE,ZIPCODE,WEIGHT_LB,SHIP_REGISTERED_IN,VESSEL_NAME,CONTAINER_COUNT,DISTRIBUTION_PORT,CARRIER_CODE,...,"Houston, Texas","Long Beach, California","Los Angeles, California","Miami, Florida","New York/Newark Area, Newark, New Jersey","Norfolk, Virginia","Oakland, California","Philadelphia, Pennsylvania","Savannah, Georgia","Seattle, Washington"
9308,1,403,135,88,65962.0,4,19,3.0,7,4,...,0,0,0,0,0,0,0,0,0,1
9623,3,145,77,82,126715.0,6,54,3.0,24,4,...,0,0,0,0,0,1,0,0,0,0
9629,4,85,3,76,85948.0,4,100,2.0,10,4,...,0,0,1,0,0,0,0,0,0,0
9630,4,85,3,76,115413.0,13,61,3.0,10,4,...,0,0,1,0,0,0,0,0,0,0
9650,5,166,131,0,134337.0,16,65,6.0,3,4,...,0,0,0,0,1,0,0,0,0,0


In [224]:
df['WEIGHT_LB'] = df['WEIGHT_LB'].astype(float)

In [225]:
X = df.copy()
y = X.pop('COUNTRY_OF_ORIGIN')

In [226]:
X.head()

Unnamed: 0,SHIPPER,SHIPPER_ADDRESS,CONSIGNEE,ZIPCODE,WEIGHT_LB,SHIP_REGISTERED_IN,VESSEL_NAME,CONTAINER_COUNT,DISTRIBUTION_PORT,CARRIER_CODE,...,"Houston, Texas","Long Beach, California","Los Angeles, California","Miami, Florida","New York/Newark Area, Newark, New Jersey","Norfolk, Virginia","Oakland, California","Philadelphia, Pennsylvania","Savannah, Georgia","Seattle, Washington"
9308,1,403,135,88,65962.0,4,19,3.0,7,4,...,0,0,0,0,0,0,0,0,0,1
9623,3,145,77,82,126715.0,6,54,3.0,24,4,...,0,0,0,0,0,1,0,0,0,0
9629,4,85,3,76,85948.0,4,100,2.0,10,4,...,0,0,1,0,0,0,0,0,0,0
9630,4,85,3,76,115413.0,13,61,3.0,10,4,...,0,0,1,0,0,0,0,0,0,0
9650,5,166,131,0,134337.0,16,65,6.0,3,4,...,0,0,0,0,1,0,0,0,0,0


In [227]:
svm = OneVsRestClassifier(LinearSVC(random_state=42))
X_train, X_test, y_train, y_test = train_test_split(
     X, y, train_size=0.80, test_size=0.20, random_state=42)
model = svm.fit(X_train,y_train)



In [228]:
y_pred = model.predict(X_test)

In [229]:
accuracy_score(y_test, y_pred, normalize=False)

36

In [230]:
print(classification_report(y_test, y_pred))
       

              precision    recall  f1-score   support

           3       0.93      0.14      0.25        97
           6       0.00      0.00      0.00         0
           8       0.00      0.00      0.00        13
           9       0.00      0.00      0.00        16
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00        68
          14       1.00      0.40      0.57        55
          15       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         3
          20       0.00      0.00      0.00        11
          21       0.00      0.00      0.00         5
          22       0.00      0.00      0.00         3
          23       0.00      0.00      0.00         0
          25       0.00      0.00      0.00        13

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [231]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print(accuracy_score(y_test, y_pred))


0.9209621993127147


In [None]:
X = df.copy()
y = X.pop('COUNTRY_OF_ORIGIN')
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
for train_index, test_index in kfold.split(X):
    print(train_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgb.XGBClassifier(objective="multi:softprob",random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred = xgb_model.predict(X_test)

    print(accuracy_score(y_test, y_pred))


In [None]:
X = df.copy()
y = X.pop('COUNTRY_OF_ORIGIN')

# if more than one evaluation metric are given the last one is used for early stopping
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42, eval_metric="auc")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])

y_pred = xgb_model.predict(X_test)

accuracy_score(y_test, y_pred)