In [57]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import  r2_score
from scipy import sparse as ssp


import lightgbm as lgb
from subprocess import check_output
print(check_output(['ls','input']).decode('utf8'))

sample_submission.csv
test.csv
train.csv



In [70]:
def lgb_r2(pred, drain):
    y = drain.get_label()
    score = r2_score(y, pred)
    return 'r2_score', score, True

cv_only = True
save_cv = True
full_train = False
NFOLDS = 5
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=13)

In [3]:
train = pd.read_csv('input/train.csv', parse_dates=['start_date','creation_date','sell_date'])
test = pd.read_csv('input/test.csv', parse_dates=['start_date','creation_date','sell_date'])

print('Train data size: {} Test data size: {}'.format(train.shape, test.shape))

Train data size: (9366, 18) Test data size: (4801, 17)


In [4]:
train['missing'] = train.isnull().sum(axis=1).astype(float)
test['missing'] = test.isnull().sum(axis=1).astype(float)

In [5]:
train['desk_id'].fillna('DSK00000099', inplace=True)
test['desk_id'].fillna('DSK00000099', inplace=True)

train['desk_id']=train.desk_id.apply(lambda x : x[-4:]).astype(np.int16)
test['desk_id']=test.desk_id.apply(lambda x : x[-4:]).astype(np.int16)

In [6]:
cat_features = []
le = LabelEncoder()
le.fit(train['office_id'])
train['office_id'] = le.transform(train['office_id'])
test['office_id'] = le.transform(test['office_id'])
cat_features.append('office_id')

In [7]:
le.fit(train['pf_category'])
train['pf_category'] = le.transform(train['pf_category'])
test['pf_category'] = le.transform(test['pf_category'])
cat_features.append('pf_category')

In [8]:
le.fit(train['country_code'])
train['country_code'] = le.transform(train['country_code'])
test['country_code'] = le.transform(test['country_code'])
cat_features.append('country_code')

In [9]:
le.fit(train['currency'])
train['currency'] = le.transform(train['currency'])
test['currency'] = le.transform(test['currency'])
cat_features.append('currency')

In [10]:
train['indicator_code'] = train['indicator_code'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
test['indicator_code']= test['indicator_code'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
cat_features.append('indicator_code')

In [11]:
le.fit(train['type'])
train['type'] = le.transform(train['type'])
test['type'] = le.transform(test['type'])
cat_features.append('type')

In [12]:
train['hedge_value'] = train['hedge_value'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
test['hedge_value'] = test['hedge_value'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
cat_features.append('hedge_value')

In [17]:
train['status'] = train['status'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
test['status'] = test['status'].apply(lambda x : 0 if x==False else 1 if x== True else 2)
cat_features.append('status')

In [19]:
enc = OneHotEncoder()
enc.fit(train[cat_features])
train_cat = enc.transform(train[cat_features])
test_cat = enc.transform(test[cat_features])

In [23]:
cat_count_features = []
for c in cat_features:
    d = pd.concat([train[c],test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

In [25]:
data_all =[train, test]

In [27]:
for data in data_all:
    data['start_day'] = data.start_date.dt.day
    data['start_month'] = data.start_date.dt.month
    data['start_year'] = data.start_date.dt.year
    
    data['creation_day'] = data.creation_date.dt.day
    data['creation_month'] = data.creation_date.dt.month
    data['creation_year'] = data.creation_date.dt.year
    
    data['sell_day'] = data.sell_date.dt.day
    data['sell_month'] = data.sell_date.dt.month
    data['sell_year'] = data.start_date.dt.year
    
    data['start_creation'] = (data.start_date - data.creation_date).dt.days
    data['start_sell'] = (data.start_date - data.sell_date).dt.days
    data['creation_sell'] = (data.creation_date - data.sell_date).dt.days
    
    data['sold'].fillna(68552000.0, inplace= True)
    data['dff_sold_bought'] = (data.sold - data.bought)
    data['gain_loss'] = data.dff_sold_bought * data.euribor_rate 
    data['libor_rate'].fillna(0.0, inplace= True)
    data.drop(['start_date','creation_date', 'sell_date'], inplace=True, axis=1)

In [35]:
date_features = ['start_day','start_month','start_year','creation_day','creation_month','creation_year',
                'sell_day', 'sell_month']
amt_features = ['sold', 'bought', 'euribor_rate', 'libor_rate','dff_sold_bought','gain_loss' ]

In [38]:
train_label = train['return']
test_portfolio_id = test.portfolio_id.values
train_portfolio_id = train.portfolio_id.values

train.drop(['portfolio_id', 'return'], inplace=True, axis=1)
test.drop(['portfolio_id'], inplace=True, axis=1)

In [43]:
train_list = [train[cat_count_features + date_features+amt_featues].values, train_cat]
test_list = [test[cat_count_features + date_features+amt_featues].values, test_cat]

X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

In [73]:
params = {'learning_rate': 0.04, 
          'num_leaves':78,
          'min_data_in_leaf': 160,
          'max_depth': 6, 
          'max_bin': 50,
          'colsample_bytree': 0.522,
          'boosting': 'gbdt', 
          'application': 'regression', 
          'min_child_samples': 10,
          #'min_child_weight': 150,
          'min_split_gain': 0,
          'subsample': 0.9,
          #'metric': 'mae',  
          'verbosity': 0,
          }
num_boost_round = 10000

In [74]:
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_portfolio_id))

In [76]:
for s in range(16):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_portfolio_id))

    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)

        best_trees = []
        fold_scores = []

        for i, (train_fold, valid_fold) in enumerate(kf):
            X_train, X_valid, y_train, y_valid = \
                X[train_fold, :], X[valid_fold, :], train_label[train_fold], train_label[valid_fold]
                
            dtrain = lgb.Dataset(X_train, y_train)
            dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)
            
            bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=lgb_r2, verbose_eval=100,
                             early_stopping_rounds=100)            
            best_trees.append(bst.best_iteration)
            
            model = lgb.train(params,lgb.Dataset(X, train_label), num_boost_round=bst.best_iteration)
            cv_pred += model.predict(X_test)
            cv_train[valid_fold] += model.predict(X_valid)
            
            #cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            #cv_train[valid_fold] += bst.predict(X_valid)

            score = r2_score(y_valid, cv_train[valid_fold])
            print(score)
            fold_scores.append(score)

        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print(r2_score(train_label, cv_train))
        print ("current score: {}, {}".format(r2_score(train_label, final_cv_train / (s + 1.)), s+1))
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(r2_score(train_label, cv_train))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.957915
[200]	valid_0's r2_score: 0.966054
[300]	valid_0's r2_score: 0.969574
[400]	valid_0's r2_score: 0.970262
[500]	valid_0's r2_score: 0.971126
[600]	valid_0's r2_score: 0.971372
[700]	valid_0's r2_score: 0.971604
[800]	valid_0's r2_score: 0.971653
Early stopping, best iteration is:
[749]	valid_0's r2_score: 0.971886
0.983839411654
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.905298
[200]	valid_0's r2_score: 0.912348
[300]	valid_0's r2_score: 0.915459
[400]	valid_0's r2_score: 0.916788
[500]	valid_0's r2_score: 0.917395
[600]	valid_0's r2_score: 0.917836
[700]	valid_0's r2_score: 0.918138
[800]	valid_0's r2_score: 0.918233
[900]	valid_0's r2_score: 0.918372
[1000]	valid_0's r2_score: 0.918555
[1100]	valid_0's r2_score: 0.918606
[1200]	valid_0's r2_score: 0.918707
[1300]	valid_0's r2_score: 0.918667
[1400]	valid_0's r2_score: 0.918831
Early stopping

[500]	valid_0's r2_score: 0.917372
[600]	valid_0's r2_score: 0.917813
[700]	valid_0's r2_score: 0.918039
Early stopping, best iteration is:
[673]	valid_0's r2_score: 0.918061
0.937850124219
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.508584
[200]	valid_0's r2_score: 0.513336
[300]	valid_0's r2_score: 0.515406
[400]	valid_0's r2_score: 0.517869
[500]	valid_0's r2_score: 0.519021
[600]	valid_0's r2_score: 0.519703
[700]	valid_0's r2_score: 0.520407
[800]	valid_0's r2_score: 0.521125
[900]	valid_0's r2_score: 0.521435
[1000]	valid_0's r2_score: 0.521791
[1100]	valid_0's r2_score: 0.522368
[1200]	valid_0's r2_score: 0.523033
[1300]	valid_0's r2_score: 0.523178
[1400]	valid_0's r2_score: 0.523544
[1500]	valid_0's r2_score: 0.523728
[1600]	valid_0's r2_score: 0.523709
[1700]	valid_0's r2_score: 0.523812
[1800]	valid_0's r2_score: 0.524043
[1900]	valid_0's r2_score: 0.523949
Early stopping, best iteration is:
[1856]	valid_0's r2_score: 0.524145
0

[1400]	valid_0's r2_score: 0.521564
[1500]	valid_0's r2_score: 0.522339
[1600]	valid_0's r2_score: 0.522728
Early stopping, best iteration is:
[1574]	valid_0's r2_score: 0.522735
0.681911987589
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.94575
[200]	valid_0's r2_score: 0.952236
[300]	valid_0's r2_score: 0.953707
[400]	valid_0's r2_score: 0.954535
[500]	valid_0's r2_score: 0.954667
Early stopping, best iteration is:
[463]	valid_0's r2_score: 0.954943
0.968558490527
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.911864
[200]	valid_0's r2_score: 0.914203
Early stopping, best iteration is:
[167]	valid_0's r2_score: 0.914466
0.927991676008
cv score:
0.867726255002
current score: 0.8698181334322167, 7
[0.98509028500678797, 0.94302061293911188, 0.68191198758892546, 0.96855849052695964, 0.92799167600787158]
[809, 909, 1574, 463, 167] 784.4
Training until validation scores don't improve for 100 rounds.
[1

[200]	valid_0's r2_score: 0.967
[300]	valid_0's r2_score: 0.96921
[400]	valid_0's r2_score: 0.970173
[500]	valid_0's r2_score: 0.970964
[600]	valid_0's r2_score: 0.971607
[700]	valid_0's r2_score: 0.971667
Early stopping, best iteration is:
[645]	valid_0's r2_score: 0.971816
0.98398658392
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.907306
[200]	valid_0's r2_score: 0.913669
[300]	valid_0's r2_score: 0.916287
[400]	valid_0's r2_score: 0.917413
[500]	valid_0's r2_score: 0.917937
[600]	valid_0's r2_score: 0.918465
[700]	valid_0's r2_score: 0.918623
[800]	valid_0's r2_score: 0.918883
[900]	valid_0's r2_score: 0.918731
Early stopping, best iteration is:
[800]	valid_0's r2_score: 0.918883
0.941244396165
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.50835
[200]	valid_0's r2_score: 0.512968
[300]	valid_0's r2_score: 0.514835
[400]	valid_0's r2_score: 0.516365
[500]	valid_0's r2_score: 0.517471
[600]	vali

[1100]	valid_0's r2_score: 0.522601
[1200]	valid_0's r2_score: 0.522785
Early stopping, best iteration is:
[1134]	valid_0's r2_score: 0.522848
0.650637892183
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.943088
[200]	valid_0's r2_score: 0.949978
[300]	valid_0's r2_score: 0.951996
[400]	valid_0's r2_score: 0.953379
[500]	valid_0's r2_score: 0.954009
[600]	valid_0's r2_score: 0.953976
Early stopping, best iteration is:
[521]	valid_0's r2_score: 0.954159
0.96804988009
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.910686
[200]	valid_0's r2_score: 0.914352
[300]	valid_0's r2_score: 0.914029
Early stopping, best iteration is:
[203]	valid_0's r2_score: 0.914396
0.92709978289
cv score:
0.857033758794
current score: 0.8679273579255344, 14
[0.98450651757089314, 0.94093443023501089, 0.65063789218292767, 0.96804988008991022, 0.92709978289007622]
[803, 862, 1134, 521, 203] 704.6
Training until validation score

[0.80349530513470635, 0.8039473042563734, 0.80448919111181172, 0.80419915095861993, 0.80474068402778332, 0.80494204174302086, 0.8044695242240546, 0.80473218724268836, 0.80358720454025645, 0.80397121837732444, 0.80354318122148816, 0.80491646374082437, 0.80499737250244863, 0.80338203847812861, 0.80328379005776462, 0.80434391743912637, 0.80302947906255673, 0.80276325594428921, 0.80391398219772203, 0.80239363556266319, 0.80270574785605187, 0.80360201259231645, 0.80355928958237455, 0.80403982458980572, 0.80182961351662185, 0.80349180553133459, 0.80146816792541775, 0.80373744027418226, 0.80406613905910662, 0.80163064723346, 0.80221684558999262, 0.80333583341480674, 0.80645523439353473, 0.80691430711194989, 0.8069162011230655, 0.80661860497438143, 0.80753879751094337, 0.80735145097928518, 0.80684798617597431, 0.80784676186762683, 0.80673650717496659, 0.8067748469182715, 0.80726284452994912, 0.80669085359730075, 0.80740652597121021, 0.8062596546181845, 0.80738052309138575, 0.80705636748911413]

In [77]:
sub = pd.DataFrame({'portfolio_id':test_portfolio_id, 'return':final_cv_pred / 16.})
sub.head(5)

Unnamed: 0,portfolio_id,return
0,PF00001001,0.027708
1,PF00001004,0.028431
2,PF00001009,0.022581
3,PF00001013,0.028177
4,PF00001014,0.023978


In [78]:
sub.to_csv('lgb_cv_v2.csv', index=False)