In [36]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the training and test CSV files from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Summarise the raw training frame: per-column dtype and non-null count
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9366 entries, 0 to 9365
Data columns (total 18 columns):
portfolio_id      9366 non-null object
desk_id           5701 non-null object
office_id         9366 non-null object
pf_category       9366 non-null object
start_date        9366 non-null int64
sold              9364 non-null float64
country_code      9366 non-null object
euribor_rate      9366 non-null float64
currency          9366 non-null object
libor_rate        8892 non-null float64
bought            9364 non-null float64
creation_date     9366 non-null int64
indicator_code    3667 non-null object
sell_date         9366 non-null int64
type              9366 non-null object
hedge_value       3665 non-null object
status            6282 non-null object
return            9366 non-null float64
dtypes: float64(5), int64(3), object(10)
memory usage: 1.3+ MB


In [5]:
# portfolio_id is treated as a pure row identifier that should not affect the
# return, so it is removed from the features. The test ids are kept aside for
# the submission frames, and the target column is split off from train.
sub_ids = test['portfolio_id']
y = train['return']
test = test.drop('portfolio_id', axis=1)
train = train.drop(['portfolio_id', 'return'], axis=1)

In [6]:
# Preview the feature frame now that portfolio_id and return have been dropped
train.head()

Unnamed: 0,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status
0,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,109809700.0,20040720,,20040812,B,,
1,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,176008400.0,20040723,,20040812,C,,
2,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,56379530.0,20040723,,20040817,A,,
3,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,164508800.0,20040723,,20040713,A,,
4,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,140540200.0,20040723,,20040713,B,,


In [7]:
# Columns left out of this simplified feature set; the same list is dropped
# from both frames so train and test keep identical columns.
unused_cols = ['start_date', 'creation_date', 'sell_date', 'indicator_code', 'status', 'desk_id']
train = train.drop(unused_cols, axis=1)
test = test.drop(unused_cols, axis=1)

In [8]:
# Preview the reduced feature frame (date, indicator, status and desk columns removed)
train.head()

Unnamed: 0,office_id,pf_category,sold,country_code,euribor_rate,currency,libor_rate,bought,type,hedge_value
0,OFF00001002,B,110000000.0,T,0.02074,USD,2.332216,109809700.0,B,
1,OFF00001001,A,176671000.0,N,0.02074,GBP,5.269617,176008400.0,C,
2,OFF00001001,A,56474000.0,T,0.02074,USD,2.332216,56379530.0,A,
3,OFF00001001,A,164813000.0,T,0.02074,USD,2.332216,164508800.0,A,
4,OFF00001002,B,140800000.0,T,0.02074,USD,2.332216,140540200.0,B,


In [9]:
# Handle missing data.
#
# The filled columns are assigned back to the frame rather than calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is not guaranteed to update the parent frame in recent pandas versions.

# hedge_value: a missing flag is treated as False in both frames.
train['hedge_value'] = train['hedge_value'].fillna(False)
test['hedge_value'] = test['hedge_value'].fillna(False)

# Numeric fields: impute with the median of the *training* column. The same
# training median is applied to test so both frames are imputed consistently
# and no NaN in test's sold/bought/libor_rate reaches the models.
for num_col in ['sold', 'bought', 'libor_rate']:
    train_median = train[num_col].median()
    train[num_col] = train[num_col].fillna(train_median)
    test[num_col] = test[num_col].fillna(train_median)

In [10]:
# Preview the frame after imputation (hedge_value gaps now show False)
train.head()

Unnamed: 0,office_id,pf_category,sold,country_code,euribor_rate,currency,libor_rate,bought,type,hedge_value
0,OFF00001002,B,110000000.0,T,0.02074,USD,2.332216,109809700.0,B,False
1,OFF00001001,A,176671000.0,N,0.02074,GBP,5.269617,176008400.0,C,False
2,OFF00001001,A,56474000.0,T,0.02074,USD,2.332216,56379530.0,A,False
3,OFF00001001,A,164813000.0,T,0.02074,USD,2.332216,164508800.0,A,False
4,OFF00001002,B,140800000.0,T,0.02074,USD,2.332216,140540200.0,B,False


In [11]:
# Encode categorical (object-dtype) fields as integer codes.
obj_cols = [x for x in train.columns if train[x].dtype == 'object']
for x in obj_cols:
    # Fit on the labels of BOTH frames: an encoder fit on train alone raises a
    # ValueError in transform() for any category that occurs only in test.
    # When test has no unseen labels the resulting codes are the same as when
    # fitting on train only, because the sorted set of classes is identical.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[x], test[x]]))
    train[x] = encoder.transform(train[x])
    test[x] = encoder.transform(test[x])

In [12]:
# Preview the frame after label encoding (categorical columns are now integer codes)
train.head()

Unnamed: 0,office_id,pf_category,sold,country_code,euribor_rate,currency,libor_rate,bought,type,hedge_value
0,1,1,110000000.0,2,0.02074,4,2.332216,109809700.0,1,False
1,0,0,176671000.0,1,0.02074,2,5.269617,176008400.0,2,False
2,0,0,56474000.0,2,0.02074,4,2.332216,56379530.0,0,False
3,0,0,164813000.0,2,0.02074,4,2.332216,164508800.0,0,False
4,1,1,140800000.0,2,0.02074,4,2.332216,140540200.0,1,False


In [13]:
# Confirm every remaining column is numeric/bool and has no missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9366 entries, 0 to 9365
Data columns (total 10 columns):
office_id       9366 non-null int64
pf_category     9366 non-null int64
sold            9366 non-null float64
country_code    9366 non-null int64
euribor_rate    9366 non-null float64
currency        9366 non-null int64
libor_rate      9366 non-null float64
bought          9366 non-null float64
type            9366 non-null int64
hedge_value     9366 non-null bool
dtypes: bool(1), float64(4), int64(5)
memory usage: 667.8 KB


In [18]:
# Random forest baseline: 5-fold cross-validated R^2 on the training data
forest_reg = RandomForestRegressor(random_state=7)
scores = cross_val_score(estimator=forest_reg, X=train, y=y, cv=5, scoring='r2')
print(scores)
print('mean r2:', scores.mean())

[-0.99699912  0.97393782  0.89965033  0.38801503  0.81863494]
('mean r2:', 0.41664779848853933)


In [19]:
# Refit a fresh random forest on all training rows, then predict the test set
# (fit() returns the estimator itself, so forest_reg is the fitted model)
forest_reg = RandomForestRegressor(random_state=7).fit(train, y)
preds = forest_reg.predict(test)

In [20]:
# Submission frame: test portfolio ids next to the random-forest predictions
rf_columns = {'portfolio_id': sub_ids, 'return': preds}
resultRandomForest = pd.DataFrame(rf_columns)

In [21]:
# Preview the first rows of the random-forest submission frame
resultRandomForest.head()

Unnamed: 0,portfolio_id,return
0,PF00001001,0.024482
1,PF00001004,0.025172
2,PF00001009,0.025172
3,PF00001013,0.025172
4,PF00001014,0.025172


In [25]:
# XGBoost regressor: 5-fold cross-validated R^2, then a refit on all training rows
regr = xgb.XGBRegressor(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1.5,
    n_estimators=7200,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    seed=42,
    silent=1
)
scoreXGB = cross_val_score(estimator=regr, X=train, y=y, cv=5, scoring='r2')
print(scoreXGB)
print('mean r2:', scoreXGB.mean())

# Fit on the full training data and predict the test set for the submission frame
regr.fit(train, y)
y_pred = regr.predict(test)
xgb_columns = {'portfolio_id': sub_ids, 'return': y_pred}
resultXGBRegressor = pd.DataFrame(xgb_columns)

[-1.01441597  0.96508098  0.95183333  0.29100437 -0.08865144]
('mean r2:', 0.22097025145669241)


In [26]:
# Preview the first rows of the XGBoost submission frame
resultXGBRegressor.head()

Unnamed: 0,portfolio_id,return
0,PF00001001,0.027446
1,PF00001004,0.02354
2,PF00001009,0.027923
3,PF00001013,0.026664
4,PF00001014,0.02675


In [30]:
#Lasso Regression 
#best_alpha = 0.0099
#Lasso_regr = Lasso(alpha=best_alpha, max_iter=500000)
#scoreLasso = cross_val_score(Lasso_regr, train, y, scoring='r2', cv=5)
#print(scoreLasso)
#print('mean r2:',np.mean(scoreLasso))
#Lasso_regr.fit(train, y)
#y_pred = Lasso_regr.predict(test)
#lasso_ex = np.exp(y_pred)
#resultLasso = pd.DataFrame({'portfolio_id': sub_ids, 'return': y_pred})

In [31]:
#resultLasso.head()

In [35]:
#Neural Network
#np.random.seed(10)
#create Model
#define base model
#def base_model():
#    model = Sequential()
#    model.add(Dense(20, input_dim=train.shape[1], init='normal', activation='relu'))
#    model.add(Dense(10, init='normal', activation='relu'))
#    model.add(Dense(1, init='normal'))
#    model.compile(loss='mean_squared_error', optimizer = 'adam')
#    return model

#seed = 7
#np.random.seed(seed)
#scale = StandardScaler()
#
#X_train = scale.fit_transform(train)
#X_test = scale.transform(test)

#keras_label = y.as_matrix()
#clf = KerasRegressor(build_fn=base_model, nb_epoch=1000, batch_size=5,verbose=0)
#scoreClf = cross_val_score(clf, train, y, scoring='r2', cv=5)
#print(scoreClf)
#print('mean r2:',np.mean(scoreClf))

#clf.fit(X_train, keras_label)

#make predictions
#kpred = clf.predict(X_test) 
#kpred = np.exp(kpred)
#resultNN = pd.DataFrame({'portfolio_id': sub_ids, 'return': kpred})