In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from tqdm import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.model_selection import KFold, StratifiedKFold

In [4]:
import xgboost as xgb
import lightgbm as lgb
import time

#### Importing Files

In [5]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [6]:
# Report the (rows, columns) shape of each table.
for shape_label, frame in (('Shape of train data:', train), ('Shape of testing Data:', test)):
    print(shape_label, frame.shape)

Shape of train data: (9366, 18)
Shape of testing Data: (4801, 17)


In [7]:
# Names of the `train` columns that contain at least one missing value.
# `Series.items()` is used instead of `Series.iteritems()`, which recent pandas
# releases no longer provide; `.items()` yields the same (label, value) pairs.
null_cols = [col_name for col_name, has_null in train.isnull().any().items() if has_null]

In [8]:
# Count Null Values
# `null_list` is a boolean frame (True where a value is missing); summing a
# boolean column gives the number of missing entries in that column.
null_list = train.isnull()
print('Number of Null values:')
null_counts = null_list[null_cols].sum()
for col, n_missing in null_counts.items():
    print('{} = {}'.format(col, n_missing))

Number of Null values:
desk_id = 3665
sold = 2
libor_rate = 474
bought = 2
indicator_code = 5699
hedge_value = 5701
status = 3084


#### Extract Data

In [9]:
#### Start Date
# Split `start_date` into three integer parts using fixed character positions:
# chars 0-3 -> year, 4-5 -> month, 6-7 -> day (presumably a YYYYMMDD value — confirm).
# NOTE: `start_date` itself is overwritten with the day part, so the string form
# is captured first and this cell must not be re-run without reloading the data.
for frame in (train, test):
    start_text = frame['start_date'].map(str)
    frame['start_year'] = start_text.str[0:4].map(int)
    frame['start_month'] = start_text.str[4:6].map(int)
    frame['start_date'] = start_text.str[6:8].map(int)

In [10]:
#### Creation Date
# Derive integer year / month / day columns from `creation_date` using fixed
# character positions (0-3, 4-5, 6-7; presumably a YYYYMMDD value — confirm).
# The original `creation_date` column is left in place.
for frame in (train, test):
    creation_text = frame['creation_date'].map(str)
    frame['create_year'] = creation_text.str[0:4].map(int)
    frame['create_month'] = creation_text.str[4:6].map(int)
    frame['create_date'] = creation_text.str[6:8].map(int)

In [11]:
#### Sell date
# Split `sell_date` into integer year / month / day parts by character position
# (0-3, 4-5, 6-7; presumably a YYYYMMDD value — confirm).
# NOTE: `sell_date` is overwritten with the day part, so the string form is
# captured first and this cell must not be re-run without reloading the data.
for frame in (train, test):
    sell_text = frame['sell_date'].map(str)
    frame['sell_year'] = sell_text.str[0:4].map(int)
    frame['sell_month'] = sell_text.str[4:6].map(int)
    frame['sell_date'] = sell_text.str[6:8].map(int)

#### Preprocessing Features

In [12]:
# Candidate feature columns: every column of `train` except the row identifier
# and the target. `remove` raises if either name is missing, as before.
cols = train.columns.tolist()
for excluded in ('portfolio_id', 'return'):
    cols.remove(excluded)

In [13]:
# Collect the int64/float64 feature columns and mean-impute their missing values.
num_cols = []
for col in tqdm(cols):
    if train[col].dtype == 'int64' or train[col].dtype == 'float64':
        num_cols.append(col)
        # Use the training mean for BOTH tables so train and test are imputed
        # with the same statistic (the test set contributes nothing to it).
        train_mean = train[col].mean()
        # Assign the result back instead of `fillna(..., inplace=True)` on a
        # column selection: the in-place form acts on a temporary object and is
        # silently a no-op when pandas copy-on-write is enabled.
        train[col] = train[col].fillna(train_mean)
        test[col] = test[col].fillna(train_mean)

100%|██████████| 23/23 [00:00<00:00, 1038.66it/s]


In [14]:
# Min-max scale the numeric features. The scaler is fitted on the training data
# only, and those same per-column min/max values are applied to the test data.
# Re-fitting on test (the previous `fit_transform`) mapped identical raw values
# to different scaled values in the two tables.
scalar = MinMaxScaler()
train[num_cols] = scalar.fit_transform(train[num_cols])
test[num_cols] = scalar.transform(test[num_cols])

In [15]:
from sklearn.preprocessing import LabelEncoder
for col in tqdm(cols):
    # Only object-typed columns are label-encoded; everything else is skipped.
    if train[col].dtype != 'object':
        continue

    # Cast every value to str first (a missing value becomes the string 'nan').
    train[col] = train[col].map(str)
    test[col] = test[col].map(str)

    # Fit on the union of train and test labels so that every value occurring
    # in either table has an integer code.
    le = LabelEncoder()
    le.fit(sorted(set(train[col]) | set(test[col])))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

100%|██████████| 23/23 [00:00<00:00, 102.02it/s]


#### K-fold CV with Out-of-Fold Prediction

In [16]:
from sklearn.metrics import r2_score
def r2_score_lgb(pred, dtrain):
    """Compute the R2 score of `pred` against the labels stored in `dtrain`.

    Parameters
    ----------
    pred : array-like
        Predicted values for the rows of `dtrain`.
    dtrain : object exposing ``get_label()``
        Dataset holding the true targets (presumably a ``lgb.Dataset`` passed
        by LightGBM's custom-metric hook — confirm against the caller).

    Returns
    -------
    tuple
        ``('r2-score', score, True)``. The trailing ``True`` is presumably the
        "higher is better" flag expected by LightGBM — confirm.
    """
    true_values = dtrain.get_label()
    return 'r2-score', r2_score(y_true=true_values, y_pred=pred), True

In [17]:
def cross_validate_sklearn(clf, x_train, y_train, x_test, kf, scale=False, verbose=True):
    """Run K-fold cross-validation for a scikit-learn style regressor.

    For every fold the estimator is refitted on the training part, predicts the
    held-out part (collected as out-of-fold predictions) and predicts the full
    test set; the test predictions are averaged over the folds.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted in place
        on every fold.
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, aligned row-for-row with `x_train`.
    x_test : pandas.DataFrame
        Test features.
    kf : splitter
        Object with ``split(X, y)`` yielding positional (train, validation)
        index arrays and an ``n_splits`` attribute.
    scale : bool, optional
        Accepted for interface compatibility; currently not used.
    verbose : bool, optional
        Print the per-fold and overall R2 scores.

    Returns
    -------
    tuple
        ``(cv_r2_score, train_pred, test_pred)``: the R2 of the out-of-fold
        predictions, the out-of-fold predictions themselves, and the
        fold-averaged test predictions.
    """
    # Initialise the out-of-fold train predictions and the averaged test predictions
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # The test matrix is the same for every fold, so extract it once.
    x_test_values = x_test.values

    # Use the k-fold object to generate the required folds
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # The splitter yields POSITIONS, so select with .iloc; label-based
        # selection would only be correct for a default 0..n-1 index.
        x_train_kf, x_val_kf = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit on the training fold, then predict the validation fold and the
        # test set. `predict` returns one value per row for a regressor, so no
        # column is sliced out of its result.
        clf.fit(x_train_kf.values, y_train_kf.values)
        val_pred = clf.predict(x_val_kf.values)
        train_pred[val_index] += val_pred

        test_pred += clf.predict(x_test_values)

        fold_r2_score = r2_score(y_val_kf.values, val_pred)

        if verbose:
            print('fold cv {} R2_score is{:.6f}'.format(i, fold_r2_score))

    # Average the accumulated test predictions over the folds.
    test_pred /= kf.n_splits

    cv_r2_score = r2_score(y_train, train_pred)
    if verbose:
        print('cv R2_score is {:.6f}'.format(cv_r2_score))

    return cv_r2_score, train_pred, test_pred

#### XGBoost K-Fold & OOF function

In [18]:
def probability_to_rank(prediction, scaler = 1):
    """Convert predictions into normalised ranks.

    Each value is replaced by its rank among all values (pandas' default
    ranking, so ties share their average rank), divided by the number of
    predictions and multiplied by `scaler`.

    Parameters
    ----------
    prediction : sequence of numbers
        Values to rank.
    scaler : number, optional
        Multiplier applied to the normalised rank (default 1).

    Returns
    -------
    numpy.ndarray
        Scaled ranks, in the same order as `prediction`.
    """
    ranks = pd.Series(prediction, name='probability').rank()
    return (ranks / len(prediction) * scaler).values

In [19]:
def cross_validate_xgb(params, x_train, y_train, x_test, kf, cat_cols=[], verbose=True, verbose_eval=50, num_boost_round=4000):
    """Run K-fold cross-validation with XGBoost and collect out-of-fold predictions.

    For every fold a booster is trained with early stopping on the validation
    part, predicts the validation part (out-of-fold predictions) and the full
    test set; test predictions are averaged over the folds.

    Parameters
    ----------
    params : dict
        Parameters forwarded to ``xgb.train``.
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, aligned row-for-row with `x_train`.
    x_test : pandas.DataFrame
        Test features.
    kf : splitter
        Object with ``split(X, y)`` yielding positional (train, validation)
        index arrays and an ``n_splits`` attribute.
    cat_cols : list, optional
        Accepted for signature parity with ``cross_validate_lgb``; not used
        here (and never mutated).
    verbose : bool, optional
        Print the per-fold and overall R2 scores.
    verbose_eval : int or bool, optional
        Forwarded to ``xgb.train``.
    num_boost_round : int, optional
        Maximum number of boosting rounds per fold.

    Returns
    -------
    tuple
        ``(cv_r2_score, train_pred, test_pred)``.
    """
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # The test matrix is identical for every fold, so build it once.
    d_test = xgb.DMatrix(x_test)

    # Use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # The splitter yields POSITIONS, hence .iloc. The validation target is
        # taken from `y_train`; the earlier `y_train_kf[val_index]` read a name
        # that was not bound yet and was the wrong source regardless.
        x_train_kf, x_val_kf = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]

        d_train_kf = xgb.DMatrix(x_train_kf, label=y_train_kf)
        d_val_kf = xgb.DMatrix(x_val_kf, label=y_val_kf)

        bst = xgb.train(params, d_train_kf, num_boost_round=num_boost_round, evals=[(d_train_kf, 'train'),(d_val_kf, 'val')], verbose_eval=verbose_eval, early_stopping_rounds=50)

        # NOTE(review): `ntree_limit` / `best_ntree_limit` belong to the older
        # XGBoost API; newer releases may require `iteration_range` instead —
        # confirm against the installed version.
        best_limit = bst.best_ntree_limit
        val_pred = bst.predict(d_val_kf, ntree_limit=best_limit)

        train_pred[val_index] += val_pred
        # Predict the test set with the same early-stopped tree count as the
        # validation fold, so both come from the same model.
        test_pred += bst.predict(d_test, ntree_limit=best_limit)

        fold_r2_score = r2_score(y_val_kf.values, val_pred)

        if verbose:
            print('fold cv {} R2_score is{:.6f}'.format(i, fold_r2_score))

    # Average the accumulated test predictions over the folds.
    test_pred /= kf.n_splits

    cv_r2_score = r2_score(y_train, train_pred)
    if verbose:
        print('cv R2_score is {:.6f}'.format(cv_r2_score))

    return cv_r2_score, train_pred, test_pred

#### LightGBM K-Fold & OOF Function

In [24]:
def cross_validate_lgb(params, x_train, y_train, x_test, kf, cat_cols=[], verbose=True, verbose_eval=50, num_boost_round=4000):
    """Run K-fold cross-validation with LightGBM and collect out-of-fold predictions.

    For every fold a booster is trained with early stopping on the validation
    part, predicts the validation part (out-of-fold predictions) and the full
    test set; test predictions are averaged over the folds.

    Parameters
    ----------
    params : dict
        Parameters forwarded to ``lgb.train``.
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, aligned row-for-row with `x_train`.
    x_test : pandas.DataFrame
        Test features.
    kf : splitter
        Object with ``split(X, y)`` yielding positional (train, validation)
        index arrays and an ``n_splits`` attribute.
    cat_cols : list, optional
        Columns passed as ``categorical_feature`` to the datasets; when empty
        the argument is not passed at all. The list is never mutated.
    verbose : bool, optional
        Print the per-fold and overall R2 scores.
    verbose_eval : int or bool, optional
        Forwarded to ``lgb.train``.
    num_boost_round : int, optional
        Maximum number of boosting rounds per fold.

    Returns
    -------
    tuple
        ``(cv_r2_score, train_pred, test_pred)``.
    """
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # Always bind the flag: it used to be assigned only when `cat_cols` was
    # empty, so passing categorical columns raised an unbound-name error.
    use_cat = len(cat_cols) > 0
    dataset_kwargs = {'categorical_feature': cat_cols} if use_cat else {}

    # Use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        # The splitter yields POSITIONS, hence .iloc. The validation target is
        # taken from `y_train`; the earlier `y_train_kf[val_index]` read a name
        # that was not bound yet and was the wrong source regardless.
        x_train_kf, x_val_kf = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]

        lgb_train = lgb.Dataset(x_train_kf, label=y_train_kf, **dataset_kwargs)
        lgb_val = lgb.Dataset(x_val_kf, label=y_val_kf, reference=lgb_train, **dataset_kwargs)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` as keyword
        # arguments belong to the older LightGBM API; newer releases may expect
        # callbacks instead — confirm against the installed version.
        bst = lgb.train(params, lgb_train, num_boost_round=num_boost_round, valid_sets=lgb_val, verbose_eval=verbose_eval, early_stopping_rounds=50)

        val_pred = bst.predict(x_val_kf)

        train_pred[val_index] += val_pred
        test_pred += bst.predict(x_test)

        fold_r2_score = r2_score(y_val_kf.values, val_pred)

        if verbose:
            print('fold cv {} R2_score is{:.6f}'.format(i, fold_r2_score))

    # Average the accumulated test predictions over the folds.
    test_pred /= kf.n_splits

    cv_r2_score = r2_score(y_train, train_pred)
    if verbose:
        print('cv R2_score is {:.6f}'.format(cv_r2_score))

    return cv_r2_score, train_pred, test_pred

### Generate Level 1 OOF predictions

In [22]:
# Separate the target (`return`) from the features. `portfolio_id` is dropped
# from both feature tables; `test` has no `return` column to drop.
drop_cols = ['portfolio_id', 'return']
x_train = train.drop(columns=drop_cols)
x_test = test.drop(columns=['portfolio_id'])
y_train = train['return']

In [23]:
# Shuffled 5-fold splitter with a fixed seed for reproducible folds.
# The target is scored with R2 (a regression target), and StratifiedKFold
# expects discrete class labels to stratify on, so plain KFold is used instead.
kf = KFold(n_splits=5, shuffle=True, random_state=47)

#### Random Forest