In [18]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

#### Importing Data

In [2]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

#### Inspecting the Dataset

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,109809700.0,20040720,,20040812,B,,,0.02496
1,PF00001003,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,176008400.0,20040723,,20040812,C,,,0.05496
2,PF00001005,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,56379530.0,20040723,,20040817,A,,,0.02496
3,PF00001006,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,164508800.0,20040723,,20040713,A,,,0.02496
4,PF00001007,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,140540200.0,20040723,,20040713,B,,,0.02496


In [5]:
# Report (rows, columns) for both frames.
for caption, frame in (('Shape of train data:', train), ('Shape of testing Data:', test)):
    print(caption, frame.shape)

Shape of train data: (9366, 18)
Shape of testing Data: (4801, 17)


In [6]:
# Names of the training columns that contain at least one missing value,
# in their original column order.
# Series.iteritems() was removed in pandas 2.0, so the column index is
# selected with the boolean mask directly instead of iterating key/value pairs.
null_cols = train.columns[train.isnull().any()].tolist()

In [9]:
# Count Null Values
# Print, for every column flagged in `null_cols`, how many training rows are missing.
null_list = train.isnull()
print('Number of Null values:')
for col in null_cols:
    missing = null_list[col].sum()
    print('{} = {}'.format(col, missing))

Number of Null values:
desk_id = 3665
sold = 2
libor_rate = 474
bought = 2
indicator_code = 5699
hedge_value = 5701
status = 3084


#### Extract Features

In [10]:
#### Start Date
# Split the YYYYMMDD integer in `start_date` into year / month / day features.
# `start_date` itself is overwritten with the day-of-month, so the raw value is
# gone after the first run. The guard skips a frame that has already been
# split: re-running the unguarded cell fed the two-digit day back into the
# slicing and crashed on int('').
for frame in (train, test):
    if 'start_year' in frame.columns:
        continue
    start_text = frame['start_date'].apply(str)
    frame['start_year'] = start_text.apply(lambda text: int(text[0:4]))
    frame['start_month'] = start_text.apply(lambda text: int(text[4:6]))
    frame['start_date'] = start_text.apply(lambda text: int(text[6:8]))

In [11]:
#### Creation Date
# Derive year / month / day-of-month features from the YYYYMMDD integer in
# `creation_date`. Unlike the start and sell dates, the source column is left
# untouched and the day goes into a separate `create_date` column.
for frame in (train, test):
    created_text = frame['creation_date'].apply(str)
    frame['create_year'] = created_text.apply(lambda text: int(text[0:4]))
    frame['create_month'] = created_text.apply(lambda text: int(text[4:6]))
    frame['create_date'] = created_text.apply(lambda text: int(text[6:8]))

In [12]:
#### Sell date
# Split the YYYYMMDD integer in `sell_date` into year / month / day features.
# `sell_date` itself is overwritten with the day-of-month, so the raw value is
# gone after the first run. The guard skips a frame that has already been
# split: re-running the unguarded cell fed the two-digit day back into the
# slicing and crashed on int('').
for frame in (train, test):
    if 'sell_year' in frame.columns:
        continue
    sell_text = frame['sell_date'].apply(str)
    frame['sell_year'] = sell_text.apply(lambda text: int(text[0:4]))
    frame['sell_month'] = sell_text.apply(lambda text: int(text[4:6]))
    frame['sell_date'] = sell_text.apply(lambda text: int(text[6:8]))

In [13]:
# Inspect the column dtypes after the date-part extraction.
train.dtypes

portfolio_id       object
desk_id            object
office_id          object
pf_category        object
start_date          int64
sold              float64
country_code       object
euribor_rate      float64
currency           object
libor_rate        float64
bought            float64
creation_date       int64
indicator_code     object
sell_date           int64
type               object
hedge_value        object
status             object
return            float64
start_year          int64
start_month         int64
create_year         int64
create_month        int64
create_date         int64
sell_year           int64
sell_month          int64
dtype: object

#### Preprocessing Data

In [14]:
# Replace every missing value in both frames with the sentinel -999.
train = train.fillna(-999)
test = test.fillna(-999)

In [15]:
# Show the full column list before choosing which columns to encode.
train.columns

Index(['portfolio_id', 'desk_id', 'office_id', 'pf_category', 'start_date',
       'sold', 'country_code', 'euribor_rate', 'currency', 'libor_rate',
       'bought', 'creation_date', 'indicator_code', 'sell_date', 'type',
       'hedge_value', 'status', 'return', 'start_year', 'start_month',
       'create_year', 'create_month', 'create_date', 'sell_year',
       'sell_month'],
      dtype='object')

In [16]:
# Candidate feature columns: everything except the target (`return`) and the
# row identifier (`portfolio_id`).
cols = train.columns.tolist()
for excluded in ('return', 'portfolio_id'):
    cols.remove(excluded)

In [19]:
# Label-encode every object-typed feature column. Values are stringified first
# (the -999 fill sentinel shares these columns with text values), and the
# encoder is fitted on the union of train and test values so both frames map
# to the same integer codes.
for col in tqdm(cols):
    if train[col].dtype != 'object':
        continue

    train[col] = train[col].map(str)
    test[col] = test[col].map(str)

    seen_values = list(train[col].unique()) + list(test[col].unique())
    encoder = LabelEncoder().fit(seen_values)
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

100%|██████████| 23/23 [00:00<00:00, 9014.11it/s]


#### Creating Train, Validation and Test Sets

In [21]:
import lightgbm as lgb
import xgboost as xgb

In [22]:
# Feature matrix: every column except the target and the row identifier.
X = train.drop(columns=['return', 'portfolio_id'])
# Target vector as a plain NumPy array.
y = train['return'].to_numpy()

In [23]:
# Select the test features by the training feature names so the columns handed
# to the model are guaranteed to be the same set, in the same order, as X.
# A missing column now fails loudly with a KeyError instead of silently
# shifting features. `portfolio_id` is not in X.columns, so it is dropped here
# and kept separately for the submission file.
X_test = test[X.columns]
ids = test.portfolio_id

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=47,
)

In [44]:
# Wrap the splits as LightGBM Datasets. The validation Dataset is created with
# `reference=d_train`, as the LightGBM documentation prescribes for validation
# data, so it is discretised with the training set's feature bins.
d_train = lgb.Dataset(data=X_train, label=y_train)
d_valid = lgb.Dataset(data=X_valid, label=y_valid, reference=d_train)

In [45]:
# Echo the training Dataset handle (shows only the object repr).
d_train

<lightgbm.basic.Dataset at 0x7ff05ba0d2b0>

#### Training the model

In [46]:
from sklearn.metrics import r2_score
def r2_score_lgb(pred, dtrain):
    """Custom LightGBM evaluation metric that reports the R^2 score.

    Parameters
    ----------
    pred
        Predictions passed in by LightGBM for the dataset being evaluated.
    dtrain
        Dataset object exposing ``get_label()`` with the true targets.

    Returns
    -------
    tuple
        ``('r2-score', value, True)``: the metric name, the score, and a flag
        telling LightGBM that higher values are better.
    """
    labels = dtrain.get_label()
    return 'r2-score', r2_score(y_true=labels, y_pred=pred), True

In [47]:
# Datasets passed to lgb.train as valid_sets; the training set is included so
# its score is logged next to the validation score.
watchlist = [d_train, d_valid]

In [52]:
# LightGBM hyper-parameters for the regression model.
params = dict(
    # Small step size; compensated by the large boosting-round budget.
    learning_rate=0.001,
    # `application` is LightGBM's alias for the objective: absolute-error (L1) regression.
    application='regression_l1',
    # Tree shape limits.
    max_depth=25,
    num_leaves=1 << 10,
    verbosity=0,
    boosting='gbdt',
    max_bin=256,
    # Row subsampling, re-drawn every iteration with a fixed seed.
    bagging_fraction=0.90,
    bagging_freq=1,
    bagging_seed=1,
    # Column subsampling with a fixed seed.
    feature_fraction=0.9,
    feature_fraction_seed=1,
)

In [53]:
# Train for up to 5000 rounds, logging every 10 rounds and stopping once the
# evaluation scores have not improved for 50 rounds.
# The `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0, so the equivalent callbacks are used instead.
# `log_evaluation` replaced `print_evaluation` in newer releases; fall back to
# the old name so the cell also runs on older installations.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=5000,
    feval=r2_score_lgb,
    valid_sets=watchlist,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        _log_evaluation(period=10),
    ],
)

Training until validation scores don't improve for 50 rounds.
[10]	training's r2-score: 0.0396094	valid_1's r2-score: 0.0481417
[20]	training's r2-score: 0.0780759	valid_1's r2-score: 0.0952803
[30]	training's r2-score: 0.115416	valid_1's r2-score: 0.14104
[40]	training's r2-score: 0.151675	valid_1's r2-score: 0.185464
[50]	training's r2-score: 0.186798	valid_1's r2-score: 0.228464
[60]	training's r2-score: 0.220925	valid_1's r2-score: 0.270193
[70]	training's r2-score: 0.254002	valid_1's r2-score: 0.310628
[80]	training's r2-score: 0.286029	valid_1's r2-score: 0.349778
[90]	training's r2-score: 0.317038	valid_1's r2-score: 0.387608
[100]	training's r2-score: 0.347042	valid_1's r2-score: 0.42427
[110]	training's r2-score: 0.376034	valid_1's r2-score: 0.459687
[120]	training's r2-score: 0.40407	valid_1's r2-score: 0.493849
[130]	training's r2-score: 0.431084	valid_1's r2-score: 0.526747
[140]	training's r2-score: 0.457088	valid_1's r2-score: 0.558362
[150]	training's r2-score: 0.482067	

[1280]	training's r2-score: 0.798909	valid_1's r2-score: 0.968228
[1290]	training's r2-score: 0.798961	valid_1's r2-score: 0.968243
[1300]	training's r2-score: 0.799008	valid_1's r2-score: 0.968261
[1310]	training's r2-score: 0.799064	valid_1's r2-score: 0.968305
[1320]	training's r2-score: 0.799115	valid_1's r2-score: 0.968363
[1330]	training's r2-score: 0.799151	valid_1's r2-score: 0.96837
[1340]	training's r2-score: 0.799209	valid_1's r2-score: 0.968431
[1350]	training's r2-score: 0.799252	valid_1's r2-score: 0.968443
[1360]	training's r2-score: 0.799302	valid_1's r2-score: 0.9685
[1370]	training's r2-score: 0.79935	valid_1's r2-score: 0.968512
[1380]	training's r2-score: 0.799412	valid_1's r2-score: 0.968546
[1390]	training's r2-score: 0.799442	valid_1's r2-score: 0.968552
[1400]	training's r2-score: 0.799471	valid_1's r2-score: 0.968576
[1410]	training's r2-score: 0.799501	valid_1's r2-score: 0.968577
[1420]	training's r2-score: 0.79955	valid_1's r2-score: 0.968579
[1430]	training

[2540]	training's r2-score: 0.803627	valid_1's r2-score: 0.970491
[2550]	training's r2-score: 0.803647	valid_1's r2-score: 0.970487
[2560]	training's r2-score: 0.803686	valid_1's r2-score: 0.970475
[2570]	training's r2-score: 0.80371	valid_1's r2-score: 0.970481
[2580]	training's r2-score: 0.803753	valid_1's r2-score: 0.970472
Early stopping, best iteration is:
[2536]	training's r2-score: 0.803614	valid_1's r2-score: 0.970511


#### Predict Returns

In [54]:
# Predict with the iteration selected by early stopping rather than relying on
# the library default, which differs between LightGBM versions. A non-positive
# best_iteration (no early stop recorded) makes LightGBM use every tree.
p_test = model.predict(X_test, num_iteration=model.best_iteration)

#### Creating submission files

In [55]:
# Pair each portfolio id with its predicted return.
subm = pd.DataFrame({'portfolio_id': ids, 'return': p_test})

#### Saving submission file

In [56]:
# Write the submission to submit.csv without the DataFrame index column.
subm.to_csv('submit.csv', index=False)