In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Raise pandas' column display limit to 1000 so wide frames are rendered
# without eliding columns.
pd.set_option('display.max_columns', 1000)

In [2]:
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five rows of the training data.
train.head(n=5)

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,109809700.0,20040720,,20040812,B,,,0.02496
1,PF00001003,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,176008400.0,20040723,,20040812,C,,,0.05496
2,PF00001005,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,56379530.0,20040723,,20040817,A,,,0.02496
3,PF00001006,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,164508800.0,20040723,,20040713,A,,,0.02496
4,PF00001007,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,140540200.0,20040723,,20040713,B,,,0.02496


In [4]:
# Report the dimensions of both data sets; each `shape` is a (rows, columns) pair.
print('The train data has {} rows and {} columns'.format(*train.shape))
print('The test data has {} rows and {} columns'.format(*test.shape))

The train data has 9366 rows and 18 columns
The test data has 4801 rows and 17 columns


In [5]:
## Distribution of the target: the share of rows taken by each distinct
## `return` value. The target is numeric (float64), not a class label.
train['return'].value_counts(normalize=True)

0.005400    0.028721
0.006000    0.025198
0.001560    0.024984
0.003120    0.021140
0.003240    0.020820
0.002880    0.020286
0.003600    0.019859
0.003000    0.018044
0.009000    0.017830
0.002520    0.016976
0.003360    0.016656
0.002400    0.016122
0.002760    0.015802
0.003840    0.015695
0.003720    0.014841
0.003480    0.014627
0.003960    0.014200
0.001800    0.013560
0.004200    0.012065
0.002640    0.010784
0.004440    0.010570
0.002160    0.010570
0.002280    0.009930
0.001680    0.009716
0.004080    0.009609
0.009360    0.008862
0.004800    0.008221
0.006360    0.008008
0.007800    0.007474
0.004320    0.007047
              ...   
0.035160    0.000107
0.102000    0.000107
0.060720    0.000107
0.079200    0.000107
0.040488    0.000107
0.032053    0.000107
0.037008    0.000107
0.032160    0.000107
0.005340    0.000107
0.021200    0.000107
0.032040    0.000107
0.075480    0.000107
0.036720    0.000107
0.031716    0.000107
0.031692    0.000107
0.006766    0.000107
0.034560    0

In [6]:
## Count the missing values in every column of the training data.
train.isnull().sum()

portfolio_id         0
desk_id           3665
office_id            0
pf_category          0
start_date           0
sold                 2
country_code         0
euribor_rate         0
currency             0
libor_rate         474
bought               2
creation_date        0
indicator_code    5699
sell_date            0
type                 0
hedge_value       5701
status            3084
return               0
dtype: int64

In [7]:
# Replace every missing value in both data sets with the sentinel -999.
# Both frames are modified in place.
test.fillna(value=-999, inplace=True)
train.fillna(value=-999, inplace=True)

In [8]:
## Re-count missing values per column to confirm none remain after filling.
train.isnull().sum()

portfolio_id      0
desk_id           0
office_id         0
pf_category       0
start_date        0
sold              0
country_code      0
euribor_rate      0
currency          0
libor_rate        0
bought            0
creation_date     0
indicator_code    0
sell_date         0
type              0
hedge_value       0
status            0
return            0
dtype: int64

In [9]:
# Preview the first ten rows of the training data after missing-value filling.
# No NaN replacement is needed here: missing values were already filled with
# -999, and an unassigned `replace` result would not change `train` anyway.
# A bounded preview also avoids rendering the whole frame.
train.head(10)

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,1.098097e+08,20040720,-999,20040812,B,-999,-999,0.02496
1,PF00001003,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,1.760084e+08,20040723,-999,20040812,C,-999,-999,0.05496
2,PF00001005,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,5.637953e+07,20040723,-999,20040817,A,-999,-999,0.02496
3,PF00001006,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,1.645088e+08,20040723,-999,20040713,A,-999,-999,0.02496
4,PF00001007,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,1.405402e+08,20040723,-999,20040713,B,-999,-999,0.02496
5,PF00001008,DSK00001006,OFF00001001,A,20040707,48741000.0,T,0.02074,USD,2.332216,4.865127e+07,20040726,-999,20040810,A,-999,-999,0.02490
6,PF00001010,DSK00001009,OFF00001001,A,20040706,60593500.0,T,0.02074,USD,2.332216,6.048181e+07,20040726,-999,20040809,A,-999,-999,0.02493
7,PF00001011,DSK00001009,OFF00001002,B,20040706,134200000.0,T,0.02074,USD,2.332216,1.339526e+08,20040726,-999,20040809,B,-999,-999,0.02493
8,PF00001012,DSK00001010,OFF00001001,A,20040419,82929000.0,T,0.02074,USD,2.332216,8.250616e+07,20040726,-999,20040720,A,-999,-999,0.02460
9,PF00001016,DSK00001014,OFF00001001,A,20040414,212476000.0,T,0.02074,USD,2.332216,2.114140e+08,20040727,-999,20040713,A,-999,-999,0.02466


In [10]:
# Every column except the two identifier columns and the target is a model input.
feature_names = [
    column
    for column in train.columns
    if column not in ('portfolio_id', 'desk_id', 'return')
]
# Feature matrix and regression target taken from the training frame.
X_train = train[feature_names]
y_train = train['return']

In [11]:
# Inspect the dtype of each feature column.
X_train.dtypes

office_id          object
pf_category        object
start_date          int64
sold              float64
country_code       object
euribor_rate      float64
currency           object
libor_rate        float64
bought            float64
creation_date       int64
indicator_code     object
sell_date           int64
type               object
hedge_value        object
status             object
dtype: object

In [12]:
# The target is a single Series, so it carries exactly one dtype.
y_train.dtype

dtype('float64')

In [13]:
# Positional indices of every feature column whose dtype is not float64;
# these are handed to CatBoost as categorical features.
# `np.float64` is used instead of `np.float`: the latter was only an alias
# for the builtin `float`, deprecated in NumPy 1.20 and removed in 1.24,
# and both select exactly the same columns.
# NOTE(review): this also marks the int64 date columns (start_date,
# creation_date, sell_date) as categorical — confirm that is intended.
categorical_features_indices = np.where(X_train.dtypes != np.float64)[0]

In [14]:
# Hold out part of the labelled data for validation. A fixed `random_state`
# makes the shuffle, and therefore every reported score, reproducible
# across kernel restarts.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for fitting —
# confirm this is intended rather than a 70/30 train/validation split.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train, y_train, test_size=0.7, random_state=42
)

In [15]:
from catboost import CatBoostRegressor

In [16]:
# CatBoost regressor: 10 boosting iterations, trees of depth 10,
# learning rate 0.1, with R2 reported as the evaluation metric.
model = CatBoostRegressor(
    iterations=10,
    depth=10,
    learning_rate=0.1,
    eval_metric='R2',
)

In [17]:
# Fit on the training split and pass the held-out split as the evaluation set,
# with the precomputed column indices declared as categorical features.
model.fit(
    X_train1,
    y_train1,
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
)

0: learn: 0.10284	test: 0.08777548	bestTest: 0.08777548 (0)	total: 215ms	remaining: 1.94s
1: learn: 0.1999248	test: 0.1701258	bestTest: 0.1701258 (1)	total: 316ms	remaining: 1.26s
2: learn: 0.2940477	test: 0.2484367	bestTest: 0.2484367 (2)	total: 437ms	remaining: 1.02s
3: learn: 0.3804358	test: 0.3189856	bestTest: 0.3189856 (3)	total: 581ms	remaining: 871ms
4: learn: 0.4577088	test: 0.380928	bestTest: 0.380928 (4)	total: 713ms	remaining: 713ms
5: learn: 0.518872	test: 0.4296151	bestTest: 0.4296151 (5)	total: 1.03s	remaining: 689ms
6: learn: 0.5782737	test: 0.4763012	bestTest: 0.4763012 (6)	total: 1.23s	remaining: 528ms
7: learn: 0.6257637	test: 0.5126217	bestTest: 0.5126217 (7)	total: 1.33s	remaining: 333ms
8: learn: 0.6671966	test: 0.5445119	bestTest: 0.5445119 (8)	total: 1.53s	remaining: 170ms
9: learn: 0.7040559	test: 0.57252	bestTest: 0.57252 (9)	total: 1.66s	remaining: 0us

bestTest = 0.5725199588
bestIteration = 9



<catboost.core.CatBoostRegressor at 0x1a606049e10>

In [18]:
# Predict `return` for the test set using the same feature columns as in training.
# NOTE: despite the `_proba` suffix, the model is a CatBoostRegressor, so these
# are regression predictions rather than class probabilities.
preds_proba = model.predict(test[feature_names])

In [19]:
# Use the sample submission as a template, set its `return` column to the
# predictions, and write the result to disk without the index column.
sub = pd.read_csv('sample_submission.csv').assign(**{'return': preds_proba})
sub.to_csv('sub2.csv', index=False)