In [147]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import  r2_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso

from scipy import sparse as ssp
from tqdm import tqdm_notebook

import lightgbm as lgb
from subprocess import check_output
# Print the contents of the local `input` directory so the notebook records
# which data files were present when it ran.
input_listing = check_output(['ls', 'input'])
print(input_listing.decode('utf8'))

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

sample_submission.csv
test.csv
train.csv



In [3]:
# The three date columns are parsed to datetimes on load so the `.dt`
# accessors used further down work directly.
DATE_COLUMNS = ['start_date', 'creation_date', 'sell_date']

train = pd.read_csv('input/train.csv', parse_dates=DATE_COLUMNS)
test = pd.read_csv('input/test.csv', parse_dates=DATE_COLUMNS)

print('Train data size: {} Test data size: {}'.format(train.shape, test.shape))

Train data size: (9366, 18) Test data size: (4801, 17)


In [4]:
# Per-row count of null cells, stored as float.  It is taken here, on the
# frames as loaded, before the following cells fill any gaps.
for frame in (train, test):
    frame['missing'] = frame.isnull().sum(axis=1).astype(float)

In [5]:
# Rows without a desk get a sentinel id so the numeric suffix can still be
# parsed.  The filled column is assigned back rather than using
# `frame['desk_id'].fillna(..., inplace=True)`: that form mutates a selected
# column in place, which pandas deprecates and which leaves the frame
# unchanged under Copy-on-Write.
MISSING_DESK_ID = 'DSK00000099'

for frame in (train, test):
    desk = frame['desk_id'].fillna(MISSING_DESK_ID)
    # Keep only the last four characters of the id and store them as int16.
    frame['desk_id'] = desk.str[-4:].astype(np.int16)

In [6]:
# Names of the integer-coded categorical columns; the following cells append
# to this list as each column is encoded.
cat_features = []

# A single encoder object is created here and re-fitted for each string column
# in later cells.  Its vocabulary is learned from train only.
# NOTE(review): confirm test contains no office_id value absent from train.
le = LabelEncoder().fit(train['office_id'])
for frame in (train, test):
    frame['office_id'] = le.transform(frame['office_id'])
cat_features.append('office_id')

In [7]:
# Re-fit the shared encoder on train's pf_category values, then replace the
# column in both frames with its integer codes.
le.fit(train['pf_category'])
for frame in (train, test):
    frame['pf_category'] = le.transform(frame['pf_category'])
cat_features.append('pf_category')

In [8]:
# Re-fit the shared encoder on train's country_code values, then replace the
# column in both frames with its integer codes.
le.fit(train['country_code'])
for frame in (train, test):
    frame['country_code'] = le.transform(frame['country_code'])
cat_features.append('country_code')

In [9]:
# Hard-coded factor per currency code; any code not listed here maps to 1.
# NOTE(review): these look like units of the currency per one euro, while the
# feature cell multiplies amounts by the factor — confirm the intended
# direction of the conversion.
_euro_factor = {'USD': 1.18, 'CHF': 1.17, 'JPY': 133.28}

for frame in (train, test):
    frame['curr_to_euro'] = frame['currency'].apply(lambda code: _euro_factor.get(code, 1))

In [10]:
# Integer-code the currency column.  This has to come after `curr_to_euro` is
# derived, because that step compares the raw string codes.
le.fit(train['currency'])
for frame in (train, test):
    frame['currency'] = le.transform(frame['currency'])
cat_features.append('currency')

In [11]:
def _indicator_to_int(flag):
    """Map a flag to 0 (equals False), 1 (equals True) or 2 (anything else, e.g. missing)."""
    if flag == False:
        return 0
    if flag == True:
        return 1
    return 2


for frame in (train, test):
    frame['indicator_code'] = frame['indicator_code'].apply(_indicator_to_int)
cat_features.append('indicator_code')

In [12]:
# Re-fit the shared encoder on train's `type` values, then replace the column
# in both frames with its integer codes.
le.fit(train['type'])
for frame in (train, test):
    frame['type'] = le.transform(frame['type'])
cat_features.append('type')

In [13]:
def _hedge_to_int(flag):
    """Map a flag to 0 (equals False), 1 (equals True) or 2 (anything else, e.g. missing)."""
    if flag == False:
        return 0
    if flag == True:
        return 1
    return 2


for frame in (train, test):
    frame['hedge_value'] = frame['hedge_value'].apply(_hedge_to_int)
cat_features.append('hedge_value')

In [14]:
def _status_to_int(flag):
    """Map a flag to 0 (equals False), 1 (equals True) or 2 (anything else, e.g. missing)."""
    if flag == False:
        return 0
    if flag == True:
        return 1
    return 2


for frame in (train, test):
    frame['status'] = frame['status'].apply(_status_to_int)
cat_features.append('status')

In [15]:
# Learn the one-hot layout from train's integer-coded categoricals, then apply
# the same layout to both frames.
enc = OneHotEncoder().fit(train[cat_features])
train_cat, test_cat = (enc.transform(frame[cat_features]) for frame in (train, test))

In [16]:
# Frequency encoding: for every categorical column add `<col>_count`, the
# number of rows (train and test combined) that share the row's category.
cat_count_features = []
for col in cat_features:
    count_col = '{}_count'.format(col)
    freq = pd.concat([train[col], test[col]]).value_counts().to_dict()
    for frame in (train, test):
        # Categories absent from the table fall back to a count of 0.
        frame[count_col] = frame[col].apply(lambda value: freq.get(value, 0))
    cat_count_features.append(count_col)

In [17]:
# Both frames in one list so the feature-engineering loop below can apply the
# same in-place transformations to each.
data_all =[train, test]

In [20]:
# Value substituted for a missing `sold` amount.
# NOTE(review): the origin of this number is not visible in this notebook
# (possibly a column statistic) — confirm and document it.
SOLD_FILL_VALUE = 68552000.0

for data in data_all:
    # Day / month / year of each date column.  Column names are built from the
    # prefix so every part is read from its own date (sell_year comes from
    # sell_date, not start_date).
    for prefix in ('start', 'creation', 'sell'):
        stamp = data[prefix + '_date'].dt
        data[prefix + '_day'] = stamp.day
        data[prefix + '_month'] = stamp.month
        data[prefix + '_year'] = stamp.year

    # Signed gaps between the dates, in whole days.
    data['start_creation'] = (data.start_date - data.creation_date).dt.days
    data['start_sell'] = (data.start_date - data.sell_date).dt.days
    data['creation_sell'] = (data.creation_date - data.sell_date).dt.days

    # Fill gaps by assigning back to the frame.  `data[col].fillna(...,
    # inplace=True)` mutates a selected column, which pandas deprecates and
    # which does nothing under Copy-on-Write.
    data['sold'] = data['sold'].fillna(SOLD_FILL_VALUE)
    data['libor_rate'] = data['libor_rate'].fillna(0.0)

    # Amount-based features.
    # NOTE(review): the *_in_euro columns and gain_loss_per_m multiply by
    # curr_to_euro; if that factor means "units of currency per euro" the
    # conversion direction is inverted — confirm before relying on them.
    data['dff_sold_bought'] = data.sold - data.bought
    data['gain_loss'] = data.dff_sold_bought * data.euribor_rate
    data['gain_loss_per_m'] = data.dff_sold_bought * data.euribor_rate \
                                * data['curr_to_euro'] * data['start_sell'] / 30.
    data['sold_libor'] = data['sold'] * data['libor_rate']
    data['sold_euribor'] = data['sold'] * data['euribor_rate']
    data['sold_in_euro'] = data['sold'] * data['curr_to_euro']
    data['bought_in_euro'] = data['bought'] * data['curr_to_euro']

    # The raw datetime columns are no longer needed.  The drop must stay
    # in place: rebinding `data` would not change train/test themselves.
    data.drop(['start_date', 'creation_date', 'sell_date'], inplace=True, axis=1)

In [31]:
# Calendar-part columns fed to the model.  `sell_year` is not in this list.
date_features = [
    'start_day', 'start_month', 'start_year',
    'creation_day', 'creation_month', 'creation_year',
    'sell_day', 'sell_month',
]

# Numeric amount / rate columns fed to the model.
amt_features = [
    'curr_to_euro',
    'sold',
    'bought',
    'euribor_rate',
    'libor_rate',
    'dff_sold_bought',
    'gain_loss',
    'gain_loss_per_m',
    'sold_libor',
    'sold_euribor',
    'sold_in_euro',
    'bought_in_euro',
]

In [22]:
# Pull the target and the id columns out of the frames: `pop` returns the
# column and removes it from the frame in one step.
train_label = train.pop('return')
test_portfolio_id = test.pop('portfolio_id').values
train_portfolio_id = train.pop('portfolio_id').values

In [194]:
# Dense block: count, date and amount columns.  It is stacked side by side
# with the one-hot block and converted to CSR.
dense_columns = cat_count_features + date_features + amt_features

train_list = [train[dense_columns].values, train_cat]
test_list = [test[dense_columns].values, test_cat]

X, X_test = (ssp.hstack(parts).tocsr() for parts in (train_list, test_list))

In [195]:
def lgb_r2(pred, drain):
    """Custom evaluation function passed to LightGBM as `feval`.

    Parameters
    ----------
    pred : predictions for the dataset being evaluated.
    drain : dataset object exposing `get_label()` for the true targets.

    Returns
    -------
    tuple of (metric name, R^2 value, True); the trailing True is the
    "higher is better" flag of LightGBM's feval convention.
    """
    return 'r2_score', r2_score(drain.get_label(), pred), True


# Number of cross-validation folds; also the divisor used to average the
# per-fold test predictions in the training loop.
NFOLDS = 6
# Shuffled K-fold splitter with a fixed integer seed.
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=13)

In [203]:
# LightGBM training parameters.  The CV loop adds `seed` per bagging round.
params = {
    'learning_rate': 0.1,
    'num_leaves': 70,
    # `min_child_samples` is an alias of `min_data_in_leaf`.  The dict used to
    # carry both (150 and 10); only the primary name is kept so the setting is
    # unambiguous.
    'min_data_in_leaf': 150,
    'max_depth': -1,          # no depth limit; tree size is bounded by num_leaves
    'max_bin': 50,
    'colsample_bytree': 0.8,  # 0.522 was also tried
    'boosting': 'gbdt',
    'application': 'regression',
    'min_split_gain': 0,
    'verbosity': 1,
}
# Upper bound on boosting rounds; early stopping in the CV loop ends training sooner.
num_boost_round = 10000

In [206]:
# Seed-bagged K-fold training.
# For each of `no_round` LightGBM seeds, one model is trained per fold with
# early stopping on the held-out fold.  Out-of-fold predictions fill
# `cv_train`; test predictions are averaged over the folds into `cv_pred`.
# Each seed contributes one column to cv_train_df / cv_test_df.
# NOTE(review): `kfold` is built with a fixed random_state, so every round
# presumably reuses the same splits and only the LightGBM seed varies — confirm
# that this is intended.
x_score = []
cv_train_df = pd.DataFrame()
cv_test_df = pd.DataFrame()
no_round = 4

# The fold indices from KFold are positions, so slice a plain array.  Indexing
# the Series with `[]` is label-based and only matches while its index is
# exactly 0..n-1.
y_all = np.asarray(train_label)

for s in range(no_round):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_portfolio_id))

    params['seed'] = s

    best_trees = []
    fold_scores = []

    for train_fold, valid_fold in kfold.split(X, train_label):
        X_train, X_valid = X[train_fold, :], X[valid_fold, :]
        y_train, y_valid = y_all[train_fold], y_all[valid_fold]

        dtrain = lgb.Dataset(X_train, y_train)
        dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

        bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=lgb_r2, verbose_eval=100,
                        early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)

        # Predict with the early-stopped number of trees.
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        cv_train[valid_fold] += bst.predict(X_valid, num_iteration=bst.best_iteration)

        score = r2_score(y_valid, cv_train[valid_fold])
        print(score)
        fold_scores.append(score)

    # Average the test predictions accumulated over the folds.
    cv_pred /= NFOLDS
    cv_train_df['cv_train_%s'%str(s)] = cv_train
    cv_test_df['cv_pred_%s'%str(s)] = cv_pred

    print("cv score: {}".format(r2_score(train_label, cv_train)))
    # Score of the plain average of all seeds' out-of-fold predictions so far.
    print ("current score: {}, {}".format(r2_score(train_label, cv_train_df.mean(axis=1)), s+1))
    print(fold_scores)
    print("************************************************")
    print()
    x_score.append(r2_score(train_label, cv_train))

print(x_score)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.972646
[200]	valid_0's r2_score: 0.974845
[300]	valid_0's r2_score: 0.974906
Early stopping, best iteration is:
[267]	valid_0's r2_score: 0.975341
0.975340604684
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.918752
[200]	valid_0's r2_score: 0.920519
Early stopping, best iteration is:
[176]	valid_0's r2_score: 0.920773
0.920773463444
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.955859
[200]	valid_0's r2_score: 0.960268
[300]	valid_0's r2_score: 0.961241
[400]	valid_0's r2_score: 0.962226
Early stopping, best iteration is:
[399]	valid_0's r2_score: 0.962245
0.962244771
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's r2_score: 0.469135
[200]	valid_0's r2_score: 0.476399
[300]	valid_0's r2_score: 0.47884
Early stopping, best iteration is:
[285]	valid_0's r2_score: 0.47907
0.479070

In [198]:
# Second-stage model; the next cell fits it on the per-seed out-of-fold
# prediction columns.
lr = LinearRegression()
# Alternative blender, currently disabled:
#lr = Ridge(alpha=0.001)

In [199]:
# Fit the blender: inputs are the per-seed out-of-fold predictions, target is
# the training label.
model = lr.fit(cv_train_df, train_label)

In [200]:
# R^2 of the blender on the same out-of-fold columns it was fitted on.
stacked_oof = model.predict(cv_train_df)
r2_score(train_label, stacked_oof)

0.81315852117176635

In [113]:
# Blend the per-seed test predictions into the final prediction.
# NOTE(review): this cell's execution count (113) is lower than that of the
# cell fitting `model` (199), so the saved output may come from an earlier
# `model`; re-run the notebook top to bottom before submitting.
pred =  model.predict(cv_test_df)

In [114]:
# Submission frame: one row per test portfolio with its predicted return.
sub = pd.DataFrame({
    'portfolio_id': test_portfolio_id,
    'return': pred,
})
sub.head()

Unnamed: 0,portfolio_id,return
0,PF00001001,0.027476
1,PF00001004,0.033373
2,PF00001009,0.023194
3,PF00001013,0.026436
4,PF00001014,0.026352


In [115]:
# Write the submission file without the row index.
submission_path = 'lgbcv_bag_en_v1.csv'
sub.to_csv(submission_path, index=False)