In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

#### Importing Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#### Visualizing Dataset

In [3]:
train.head()

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,109809700.0,20040720,,20040812,B,,,0.02496
1,PF00001003,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,176008400.0,20040723,,20040812,C,,,0.05496
2,PF00001005,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,56379530.0,20040723,,20040817,A,,,0.02496
3,PF00001006,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,164508800.0,20040723,,20040713,A,,,0.02496
4,PF00001007,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,140540200.0,20040723,,20040713,B,,,0.02496


In [4]:
print('Shape of train data:', train.shape)
print('Shape of testing Data:', test.shape)

Shape of train data: (9366, 18)
Shape of testing Data: (4801, 17)


In [5]:
null_cols = [key for key, value in train.isnull().any().iteritems() if value==True]

In [6]:
# Count Null Values
null_list = train.isnull()
print ('Number of Null values:')
for col in null_cols:
    print ('{} = {}'.format(col, sum(null_list[col])))

Number of Null values:
desk_id = 3665
sold = 2
libor_rate = 474
bought = 2
indicator_code = 5699
hedge_value = 5701
status = 3084


#### Extract Features

In [7]:
#### Start Date
train['start_year'] = train['start_date'].apply(lambda x: int(str(x)[0:4]))
train['start_month'] = train['start_date'].apply(lambda x: int(str(x)[4:6]))
train['start_date'] = train['start_date'].apply(lambda x: int(str(x)[6:8]))

test['start_year'] = test['start_date'].apply(lambda x: int(str(x)[0:4]))
test['start_month'] = test['start_date'].apply(lambda x: int(str(x)[4:6]))
test['start_date'] = test['start_date'].apply(lambda x: int(str(x)[6:8]))

In [8]:
#### Creation Date
train['create_year'] = train['creation_date'].apply(lambda x: int(str(x)[0:4]))
train['create_month'] = train['creation_date'].apply(lambda x: int(str(x)[4:6]))
train['create_date'] = train['creation_date'].apply(lambda x: int(str(x)[6:8]))

test['create_year'] = test['creation_date'].apply(lambda x: int(str(x)[0:4]))
test['create_month'] = test['creation_date'].apply(lambda x: int(str(x)[4:6]))
test['create_date'] = test['creation_date'].apply(lambda x: int(str(x)[6:8]))

In [9]:
#### Sell date
train['sell_year'] = train['sell_date'].apply(lambda x: int(str(x)[0:4]))
train['sell_month'] = train['sell_date'].apply(lambda x: int(str(x)[4:6]))
train['sell_date'] = train['sell_date'].apply(lambda x: int(str(x)[6:8]))

test['sell_year'] = test['sell_date'].apply(lambda x: int(str(x)[0:4]))
test['sell_month'] = test['sell_date'].apply(lambda x: int(str(x)[4:6]))
test['sell_date'] = test['sell_date'].apply(lambda x: int(str(x)[6:8]))

In [10]:
train.dtypes

portfolio_id       object
desk_id            object
office_id          object
pf_category        object
start_date          int64
sold              float64
country_code       object
euribor_rate      float64
currency           object
libor_rate        float64
bought            float64
creation_date       int64
indicator_code     object
sell_date           int64
type               object
hedge_value        object
status             object
return            float64
start_year          int64
start_month         int64
create_year         int64
create_month        int64
create_date         int64
sell_year           int64
sell_month          int64
dtype: object

#### Preprocessing Data

In [11]:
train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

In [12]:
train.columns

Index(['portfolio_id', 'desk_id', 'office_id', 'pf_category', 'start_date',
       'sold', 'country_code', 'euribor_rate', 'currency', 'libor_rate',
       'bought', 'creation_date', 'indicator_code', 'sell_date', 'type',
       'hedge_value', 'status', 'return', 'start_year', 'start_month',
       'create_year', 'create_month', 'create_date', 'sell_year',
       'sell_month'],
      dtype='object')

In [13]:
cols = list(train.columns)
cols.remove('return')
cols.remove('portfolio_id')

In [14]:
for col in tqdm(cols):
    if train[col].dtype == 'object':
        train[col] = train[col].apply(str)
        test[col] = test[col].apply(str)

        le = LabelEncoder()
        train_vals = list(train[col].unique())
        test_vals = list(test[col].unique())
        le.fit(train_vals + test_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

100%|██████████| 23/23 [00:00<00:00, 102.66it/s]


#### Creating Train and Test Files

In [16]:
import lightgbm as lgb
import xgboost as xgb

In [17]:
X = train.drop(['return', 'portfolio_id'], axis=1)
y = train['return']

In [18]:
X_test = test.drop(['portfolio_id'], axis=1)
ids = test.portfolio_id

In [19]:
from sklearn.metrics import r2_score
def r2_score_lgb(pred, dtrain):
    y = dtrain.get_label()
    score = r2_score(y_true=y, y_pred=pred)
    return 'r2-score', score

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=47)

In [26]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

#### Training the model

In [40]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.01
params['max_depth'] = 15
params['silent'] = 0
params['max_bin'] = 256
params['max_leaf_nodes'] = 2**10
params['subsample'] = 0.80
params['colsample_bytree'] = 0.80
params['scale_pos_weight'] = 1.6
params['gamma'] = 10
params['alpha'] = 8
params['reg_lambda'] = 1.3

In [43]:
eval_set = [(dtrain, 'train'), (dvalid, 'valid')]

In [42]:
model = xgb.train(params, dtrain=dtrain, evals=eval_set, num_boost_round=5000, verbose_eval=20, early_stopping_rounds=80, feval=r2_score_lgb, maximize=True)

[0]	train-r2-score:-515.645	valid-r2-score:-675.659
Multiple eval metrics have been passed: 'valid-r2-score' will be used for early stopping.

Will train until valid-r2-score hasn't improved in 80 rounds.
[20]	train-r2-score:-345.431	valid-r2-score:-452.848
[40]	train-r2-score:-231.475	valid-r2-score:-303.638
[60]	train-r2-score:-155.17	valid-r2-score:-203.694
[80]	train-r2-score:-104.058	valid-r2-score:-136.72
[100]	train-r2-score:-69.821	valid-r2-score:-91.8372
[120]	train-r2-score:-46.8834	valid-r2-score:-61.7488
[140]	train-r2-score:-31.5042	valid-r2-score:-41.5603
[160]	train-r2-score:-21.1882	valid-r2-score:-28.0064
[180]	train-r2-score:-14.2666	valid-r2-score:-18.9026
[200]	train-r2-score:-9.62162	valid-r2-score:-12.7851
[220]	train-r2-score:-6.49987	valid-r2-score:-8.66718
[240]	train-r2-score:-4.39994	valid-r2-score:-5.89188
[260]	train-r2-score:-2.98633	valid-r2-score:-4.0193
[280]	train-r2-score:-2.03275	valid-r2-score:-2.75262
[300]	train-r2-score:-1.38855	valid-r2-score:-1

[2880]	train-r2-score:-0.003268	valid-r2-score:-0.008881
[2900]	train-r2-score:-0.003268	valid-r2-score:-0.008881
[2920]	train-r2-score:-0.003264	valid-r2-score:-0.008874
[2940]	train-r2-score:-0.003263	valid-r2-score:-0.008872
[2960]	train-r2-score:-0.003263	valid-r2-score:-0.008872
[2980]	train-r2-score:-0.003263	valid-r2-score:-0.008872
[3000]	train-r2-score:-0.003263	valid-r2-score:-0.008872
Stopping. Best iteration:
[2936]	train-r2-score:-0.003263	valid-r2-score:-0.008872



In [None]:
from sklearn.svm import SVR

svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

In [None]:
pred = svr.predict(X_valid)
print('R2_score:', r2_score(y_valid, pred))

#### Predict Labels

In [None]:
dtest= xgb.DMatrix(X_test)
p_test = model.predict(dtest)

#### Creating submission files

In [None]:
subm = pd.DataFrame()
subm['portfolio_id'] = ids
subm['return'] = p_test

#### Saving submission file

In [None]:
subm.to_csv('submit.csv', index=False)