## Train an XGBoost Model on Features Obtained from Test-Time Augmentation (TTA) of ResNet-50 Predictions (Model Stacking)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pickle

In [13]:
# Column groups used by the cells below.
# Categorical columns that get a label encoder / one-hot encoder.
cat_vars = ['Brand', 'Gender', 'Sub_category']
# Continuous feature columns are named by the strings '0' through '23'.
con_vars = [str(idx) for idx in range(24)]
# NOTE(review): drop_cols is not referenced by any cell visible in this notebook.
drop_cols = ['Category', 'Link_to_the_image', 'filename', 'prediction']

In [3]:
# Load the combined train/validation frame and the test frame from CSV.
train_csv_path = 'data/myntra_train_valid_aug_combined.csv'
test_csv_path = 'data/myntra_test_augmented.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Brand,Category,Gender,Color,Link_to_the_image,Sub_category,filename,prediction,0,1,...,14,15,16,17,18,19,20,21,22,23
0,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501068910736-Roadster-Men-Black-Printed-Roun...,Biker,0.034868,0.415532,...,0.000305,0.002058,0.004667,0.000403,0.000342,0.000919,0.000324,0.000952,0.155101,0.00176
1,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501754749483-Roadster-Men-Black-Printed-Roun...,Graphic,0.029623,0.225534,...,0.000455,0.000942,0.00642,0.001491,0.00449,0.000503,0.000196,0.000613,0.203651,0.001086
2,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Biker,11504765958740-Roadster-Men-Black-Printed-Henl...,Biker,0.00338,0.910267,...,2.4e-05,0.000183,0.000124,7e-06,7.7e-05,4.1e-05,0.000207,7.6e-05,0.009575,2.2e-05
3,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501067533832-Roadster-Men-Black-Printed-Roun...,Abstract,0.501471,0.118449,...,0.000709,0.003532,0.001348,0.000128,0.001443,0.004189,0.000876,0.001844,0.028074,0.000283
4,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501067504556-Roadster-Men-Black-Printed-Roun...,Graphic,0.098687,0.142579,...,0.000211,0.002586,0.007648,0.000116,0.00043,0.000477,0.000367,0.000626,0.10769,0.000909


In [6]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,Brand,Category,Gender,Color,Link_to_the_image,Sub_category,filename,prediction,0,1,...,14,15,16,17,18,19,20,21,22,23
0,Allen Solly,Tshirts,Men,Red,http://myntra.myntassets.com/assets/images/182...,,11490680940727-Allen-Solly-Men-Red-Solid-Round...,Solid,0.002838,0.000157,...,0.000379,0.973995,7.1e-05,4e-05,0.003027,0.000266,0.000614,7.1e-05,0.007357,5.4e-05
1,Celio,Tshirts,Men,Yellow,http://myntra.myntassets.com/assets/images/109...,,11475223023888-Celio-Men-Yellow-Solid-V-Neck-T...,Solid,0.000676,0.00012,...,0.000634,0.990198,3e-05,7.3e-05,0.001922,0.000142,0.000676,3.7e-05,0.001524,6e-05
2,CULT FICTION,Tshirts,Men,Rust,http://myntra.myntassets.com/assets/images/159...,,11480419683787-CULT-FICTION-Men-Rust-Solid-Rou...,Solid,0.010216,0.000287,...,0.003746,0.64906,0.001455,0.000369,0.01145,0.000271,0.000676,0.00108,0.186619,0.001524
3,Antigravity,Tshirts,Women,Blue,http://myntra.myntassets.com/assets/images/185...,,11493204669821-Antigravity-Women-Blue-Printed-...,Typography,0.12766,0.001227,...,0.00132,0.002337,0.003552,0.000402,0.004096,0.001114,0.000322,0.007528,0.387069,0.008698
4,Being Human,Tshirts,Men,Charcoal,http://myntra.myntassets.com/assets/images/100...,,11468221198445-Being-Human-Clothing-Charcoal-G...,Typography,0.150608,0.008593,...,0.000916,0.005162,0.007324,0.000806,0.001979,0.003907,0.000785,0.00108,0.538232,0.011181


In [7]:
# Tag every row with its origin so the two sets can be separated again after
# they are concatenated.
train = train.assign(type='train')
# The test rows also get a constant 'Sub_category' value of 'Graphic'.
# NOTE(review): presumably a placeholder so the label encoder can transform
# the column for test rows — confirm it is never used as a real label.
test = test.assign(type='test', Sub_category='Graphic')

In [21]:
# Stack the train and test rows into one frame; ignore_index=True renumbers
# the rows 0..n-1 instead of keeping each frame's own index.
combined = pd.concat([train, test], ignore_index=True)

In [22]:
# Preview the first rows of the combined frame.
combined.head()

Unnamed: 0,Brand,Category,Gender,Color,Link_to_the_image,Sub_category,filename,prediction,0,1,...,15,16,17,18,19,20,21,22,23,type
0,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501068910736-Roadster-Men-Black-Printed-Roun...,Biker,0.034868,0.415532,...,0.002058,0.004667,0.000403,0.000342,0.000919,0.000324,0.000952,0.155101,0.00176,train
1,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501754749483-Roadster-Men-Black-Printed-Roun...,Graphic,0.029623,0.225534,...,0.000942,0.00642,0.001491,0.00449,0.000503,0.000196,0.000613,0.203651,0.001086,train
2,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Biker,11504765958740-Roadster-Men-Black-Printed-Henl...,Biker,0.00338,0.910267,...,0.000183,0.000124,7e-06,7.7e-05,4.1e-05,0.000207,7.6e-05,0.009575,2.2e-05,train
3,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501067533832-Roadster-Men-Black-Printed-Roun...,Abstract,0.501471,0.118449,...,0.003532,0.001348,0.000128,0.001443,0.004189,0.000876,0.001844,0.028074,0.000283,train
4,Roadster,Tshirts,Men,Black,http://myntra.myntassets.com/assets/images/182...,Graphic,11501067504556-Roadster-Men-Black-Printed-Roun...,Graphic,0.098687,0.142579,...,0.002586,0.007648,0.000116,0.00043,0.000477,0.000367,0.000626,0.10769,0.000909,train


In [23]:
# Empty registries, keyed by column name, for the fitted encoders.
label_encoders, onehot_encoders = {}, {}

In [47]:
# Fit one LabelEncoder (string -> integer code) and one OneHotEncoder
# (integer code -> indicator vector) per categorical column. Both are fitted
# on the combined frame, so they see the values of the train and test rows.
label_encoders, onehot_encoders = {}, {}
for col in cat_vars:
    print(col)
    fitted_le = LabelEncoder().fit(combined[col])
    # OneHotEncoder expects a 2-D input, hence the single-column reshape.
    int_codes = fitted_le.transform(combined[col]).reshape(-1, 1)
    label_encoders[col] = fitted_le
    onehot_encoders[col] = OneHotEncoder().fit(int_codes)

Brand
Gender
Sub_category


In [25]:
def get_onehot_labels(int_labels, onehot_encoder):
    """Return the dense one-hot encoding of integer labels.

    Args:
        int_labels: input passed straight to ``onehot_encoder.transform``.
        onehot_encoder: object whose ``transform`` result exposes ``toarray()``.

    Returns:
        Whatever ``toarray()`` returns for the transformed labels.
    """
    encoded = onehot_encoder.transform(int_labels)
    return encoded.toarray()

In [76]:
def get_onehot_df(dataframe, target, cat_columns, cont_columns, label_encoders, onehot_encoders):
    """Build a model-ready frame from ``dataframe``.

    The result holds, left to right: the integer-encoded target (if any), the
    one-hot encoding of each categorical column, and the pass-through columns.

    Args:
        dataframe: source frame.
        target: name of the label column, or None to omit it. It is
            integer-encoded with ``label_encoders[target]``.
        cat_columns: columns to one-hot encode with ``label_encoders[col]``
            followed by ``onehot_encoders[col]``.
        cont_columns: columns copied through unchanged.
        label_encoders: mapping of column name to fitted label encoder.
        onehot_encoders: mapping of column name to fitted one-hot encoder.

    Returns:
        A DataFrame that shares ``dataframe``'s index. One-hot columns are
        labelled with the default integer positions 0..k-1 for each
        categorical column, so these labels repeat across categorical columns.
        An empty DataFrame is returned when nothing is requested.
    """
    # Every piece is built on the source index. Encoded arrays would otherwise
    # get a fresh 0..n-1 index and be misaligned against the pass-through
    # columns whenever `dataframe` has been filtered or re-indexed.
    row_index = dataframe.index
    pieces = []
    if target is not None:
        encoded_target = label_encoders[target].transform(dataframe[target])
        pieces.append(pd.Series(encoded_target, index=row_index, name=target))
    for col in cat_columns:
        print(col)
        int_labels = label_encoders[col].transform(dataframe[col]).reshape(-1, 1)
        pieces.append(pd.DataFrame(get_onehot_labels(int_labels, onehot_encoders[col]),
                                   index=row_index))
    for col in cont_columns:
        print(col)
        pieces.append(dataframe[col])
    if not pieces:
        # pd.concat raises on an empty list; keep the "nothing requested" result.
        return pd.DataFrame()
    # A single concat instead of re-concatenating the growing frame per column.
    return pd.concat(pieces, axis=1)

In [63]:
# Encode the combined frame: 'Sub_category' becomes the integer target,
# 'Brand' and 'Gender' are one-hot encoded, and the continuous features plus
# the 'type' tag are passed through ('type' ends up as the last column).
onehot_df = get_onehot_df(
    dataframe=combined,
    target='Sub_category',
    cat_columns=['Brand', 'Gender'],
    cont_columns=con_vars + ['type'],
    label_encoders=label_encoders,
    onehot_encoders=onehot_encoders,
)

Brand
Gender
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
type


In [64]:
# Preview the first rows of the encoded frame.
onehot_df.head()

Unnamed: 0,Sub_category,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,type
0,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002058,0.004667,0.000403,0.000342,0.000919,0.000324,0.000952,0.155101,0.00176,train
1,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000942,0.00642,0.001491,0.00449,0.000503,0.000196,0.000613,0.203651,0.001086,train
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000183,0.000124,7e-06,7.7e-05,4.1e-05,0.000207,7.6e-05,0.009575,2.2e-05,train
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003532,0.001348,0.000128,0.001443,0.004189,0.000876,0.001844,0.028074,0.000283,train
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002586,0.007648,0.000116,0.00043,0.000477,0.000367,0.000626,0.10769,0.000909,train


In [68]:
# Ordered column labels of the encoded frame: the target first, then the
# one-hot columns, then the continuous columns, with 'type' last.
all_cols = list(onehot_df.columns)

In [66]:
# Keep only the rows that came from the training set.
is_train_row = onehot_df['type'] == 'train'
train_data = onehot_df[is_train_row]

In [67]:
# (rows, columns) of the training subset.
train_data.shape

(68759, 603)

In [69]:
# Feature matrix: every column label except the first (the 'Sub_category'
# target) and the last ('type').
# NOTE(review): the one-hot columns of 'Brand' and 'Gender' share integer
# labels, so this label-based selection appears to return the columns of a
# repeated label more than once (duplicated features). The test-side selection
# is label-based too, so the two look consistent — confirm before changing
# either one on its own.
xs = train_data[all_cols[1:-1]]

In [70]:
# Preview the first rows of the training features.
xs.head()

Unnamed: 0,0,0.1,1,1.1,2,2.1,3,3.1,4,4.1,...,14,15,16,17,18,19,20,21,22,23
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000305,0.002058,0.004667,0.000403,0.000342,0.000919,0.000324,0.000952,0.155101,0.00176
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000455,0.000942,0.00642,0.001491,0.00449,0.000503,0.000196,0.000613,0.203651,0.001086
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.4e-05,0.000183,0.000124,7e-06,7.7e-05,4.1e-05,0.000207,7.6e-05,0.009575,2.2e-05
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000709,0.003532,0.001348,0.000128,0.001443,0.004189,0.000876,0.001844,0.028074,0.000283
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.000211,0.002586,0.007648,0.000116,0.00043,0.000477,0.000367,0.000626,0.10769,0.000909


In [71]:
# Training labels: the integer-encoded 'Sub_category' column.
ys = train_data['Sub_category']

In [72]:
# XGBoost classifier constructed with its default hyperparameters.
xgbc = XGBClassifier()

In [73]:
# Fit on the features as a plain NumPy array. `.values` replaces
# `DataFrame.as_matrix()`, which is deprecated and absent from pandas >= 1.0.
xgbc.fit(xs.values, ys)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [74]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed once the dump finishes. `pickle` is imported in the
# notebook's first cell.
with open('xgbc-aug.pkl', 'wb') as model_file:
    pickle.dump(xgbc, model_file)

In [179]:
# Re-read the test CSV into `test`, replacing the frame that earlier cells
# modified (added 'type', overwrote 'Sub_category').
test = pd.read_csv('data/myntra_test_augmented.csv')

In [180]:
# Preview the first rows of the reloaded test frame.
test.head()

Unnamed: 0,id,Brand,Category,Gender,Color,Link_to_the_image,Sub_category,filename,prediction,0,...,14,15,16,17,18,19,20,21,22,23
0,1,Allen Solly,Tshirts,Men,Red,http://myntra.myntassets.com/assets/images/182...,,11490680940727-Allen-Solly-Men-Red-Solid-Round...,Solid,0.002838,...,0.000379,0.973995,7.1e-05,4e-05,0.003027,0.000266,0.000614,7.1e-05,0.007357,5.4e-05
1,2,Celio,Tshirts,Men,Yellow,http://myntra.myntassets.com/assets/images/109...,,11475223023888-Celio-Men-Yellow-Solid-V-Neck-T...,Solid,0.000676,...,0.000634,0.990198,3e-05,7.3e-05,0.001922,0.000142,0.000676,3.7e-05,0.001524,6e-05
2,3,CULT FICTION,Tshirts,Men,Rust,http://myntra.myntassets.com/assets/images/159...,,11480419683787-CULT-FICTION-Men-Rust-Solid-Rou...,Solid,0.010216,...,0.003746,0.64906,0.001455,0.000369,0.01145,0.000271,0.000676,0.00108,0.186619,0.001524
3,4,Antigravity,Tshirts,Women,Blue,http://myntra.myntassets.com/assets/images/185...,,11493204669821-Antigravity-Women-Blue-Printed-...,Typography,0.12766,...,0.00132,0.002337,0.003552,0.000402,0.004096,0.001114,0.000322,0.007528,0.387069,0.008698
4,5,Being Human,Tshirts,Men,Charcoal,http://myntra.myntassets.com/assets/images/100...,,11468221198445-Being-Human-Clothing-Charcoal-G...,Typography,0.150608,...,0.000916,0.005162,0.007324,0.000806,0.001979,0.003907,0.000785,0.00108,0.538232,0.011181


In [181]:
# Columns carried through unchanged, after the model features, so each
# prediction can be traced back to its source row.
keep = [
    'id', 'Brand', 'Category', 'Gender',
    'Color', 'Link_to_the_image', 'filename',
]
# No target for the test rows: only the one-hot 'Brand'/'Gender' columns, the
# continuous features, and the pass-through columns are produced.
onehot_df_test = get_onehot_df(
    dataframe=test,
    target=None,
    cat_columns=['Brand', 'Gender'],
    cont_columns=con_vars + keep,
    label_encoders=label_encoders,
    onehot_encoders=onehot_encoders,
)

Brand
Gender
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
id
Brand
Category
Gender
Color
Link_to_the_image
filename


In [182]:
# Print every column label of the encoded test frame.
print(list(onehot_df_test.columns))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [183]:
# Ordered column labels of the encoded test frame.
feats = list(onehot_df_test.columns)

In [184]:
# Print the feature labels only: the trailing 7 labels are the pass-through
# columns listed in `keep` (which has 7 entries).
print(feats[:-7])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [185]:
# Keep only the test rows that have an image link.
has_image_link = onehot_df_test['Link_to_the_image'].notna()
onehot_df_test_nna = onehot_df_test[has_image_link]

In [186]:
# Renumber the rows 0..n-1 after the filter. `reindex()` with no arguments
# leaves the index untouched, so the gaps left by dropped rows would remain and
# any default-indexed Series assigned as a column would align with the wrong
# rows. `reset_index` returns a new frame, so no extra copy is needed.
onehot_df_test_nna = onehot_df_test_nna.reset_index(drop=True)

In [187]:
# Number of test rows that have an image link.
len(onehot_df_test_nna)

14770

In [188]:
# Test features: every column label except the trailing pass-through columns.
# The cut-off is derived from `keep` rather than a hard-coded 7, so it stays
# correct if the list of pass-through columns changes.
# NOTE(review): the selection is label-based and the one-hot labels repeat
# across 'Brand' and 'Gender', so repeated labels appear to be selected more
# than once — the training features are selected the same way; confirm.
xs_test = onehot_df_test_nna[feats[:len(feats) - len(keep)]]

In [189]:
# Predict integer class codes for the test features. `.values` replaces
# `DataFrame.as_matrix()`, which is deprecated and absent from pandas >= 1.0.
predictions = xgbc.predict(xs_test.values)

  if diff:


In [190]:
# Map the predicted integer codes back to 'Sub_category' strings with the
# label encoder fitted for that column.
pred_cats = label_encoders['Sub_category'].inverse_transform(predictions)

  if diff:


In [191]:
# Assign the prediction array positionally (row i gets prediction i).
# Wrapping it in pd.Series would align on index labels instead, and those may
# not run 0..n-1 once rows have been filtered out of the frame.
onehot_df_test_nna['predictions'] = pred_cats

In [192]:
# Preview the test rows together with their predicted category.
onehot_df_test_nna.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,id,Brand,Category,Gender,Color,Link_to_the_image,filename,predictions
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007357,5.4e-05,1,Allen Solly,Tshirts,Men,Red,http://myntra.myntassets.com/assets/images/182...,11490680940727-Allen-Solly-Men-Red-Solid-Round...,Solid
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001524,6e-05,2,Celio,Tshirts,Men,Yellow,http://myntra.myntassets.com/assets/images/109...,11475223023888-Celio-Men-Yellow-Solid-V-Neck-T...,Solid
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186619,0.001524,3,CULT FICTION,Tshirts,Men,Rust,http://myntra.myntassets.com/assets/images/159...,11480419683787-CULT-FICTION-Men-Rust-Solid-Rou...,Solid
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.387069,0.008698,4,Antigravity,Tshirts,Women,Blue,http://myntra.myntassets.com/assets/images/185...,11493204669821-Antigravity-Women-Blue-Printed-...,Typography
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.538232,0.011181,5,Being Human,Tshirts,Men,Charcoal,http://myntra.myntassets.com/assets/images/100...,11468221198445-Being-Human-Clothing-Charcoal-G...,Typography


In [164]:
# Filenames as a NumPy array. `.values` replaces `Series.as_matrix()`, which
# is deprecated and absent from pandas >= 1.0.
filenames = onehot_df_test_nna['filename'].values

In [167]:
# Pair each filename with its predicted category. np.dstack on two 1-D arrays
# of length n yields an array of shape (1, n, 2).
stacked = np.dstack([filenames, pred_cats])

In [170]:
# Drop the leading length-1 axis to get an (n, 2) array of
# (filename, predicted category) rows. The array is rebuilt from its inputs
# rather than from `stacked` itself, so re-running this cell is idempotent.
stacked = np.dstack([filenames, pred_cats])[0]

In [128]:
# List the column labels of the test frame.
test.columns

Index(['Brand', 'Category', 'Gender', 'Color', 'Link_to_the_image',
       'Sub_category', 'filename', 'prediction', '0', '1', '2', '3', '4', '5',
       '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '20', '21', '22', '23'],
      dtype='object')

In [171]:
# Display the (filename, predicted category) pairs.
stacked

array([['11490680940727-Allen-Solly-Men-Red-Solid-Round-Neck-T-Shirt-2781490680940583-1.jpg',
        'Solid'],
       ['11475223023888-Celio-Men-Yellow-Solid-V-Neck-T-Shirt-8221475223023601-1.jpg',
        'Solid'],
       ['11480419683787-CULT-FICTION-Men-Rust-Solid-Round-Neck-T-Shirt-3791480419683584-1.jpg',
        'Solid'],
       ...,
       ['Basics-Men-Red-&-Grey-Melange-Striped-Muscle-Fit-Polo-T-shirt_55908b12600354329463cd8dbe60003e_images.jpg',
        'Striped'],
       ['11505468025600-Duke-Men-Tshirts-2051505468025398-1.jpg',
        'Striped'],
       ['612-league-Boys-Navy-Printed-T-shirt_1_431bc7ab42685c19511b994a56e62029.jpg',
        'Typography']], dtype=object)

In [193]:
# Keep only the columns that are merged into the submission file: the row id
# and the predicted category.
matrix_df = onehot_df_test_nna[['id', 'predictions']]

In [194]:
# Number of rows that have a prediction.
len(matrix_df)

14770

In [195]:
# Preview the id / prediction pairs.
matrix_df.head()

Unnamed: 0,id,predictions
0,1,Solid
1,2,Solid
2,3,Solid
3,4,Typography
4,5,Typography


In [196]:
# Build the submission file: left-join the predictions onto the submission
# template by 'id', so every template row is kept, then write it to CSV.
output_filename = 'augmented_features_predictions.csv'
test_df = pd.read_csv('data/Submission_online_fnames.csv')
# Template rows without a matching prediction get the fallback class 'Solid'.
final = (
    pd.merge(test_df, matrix_df, how='left', on='id')
    .fillna({'predictions': 'Solid'})
)
final.to_csv(output_filename, index=False)
print('Saved to {}'.format(output_filename))

Saved to augmented_features_predictions.csv


In [197]:
# Number of rows in the submission frame.
len(final)

15000

In [176]:
# Distinct filenames in the submission template and in the merged output.
# NOTE(review): presumably built to compare the two — no comparison is
# visible in this part of the notebook.
seta = set(test_df['filename'])
setb = set(final['filename'])