In [1]:
# Initial imports

import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

In [2]:
# Imports for better visualization

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [3]:
# Load training data
train_data = pd.read_csv('../data/train.csv')

In [4]:
# Load testing data
test_data = pd.read_csv('../data/test.csv')

In [5]:
# View train data head
train_data.head(10)

Unnamed: 0,ID,timestamp,Stock_ID,Volume,Three_Day_Moving_Average,Five_Day_Moving_Average,Ten_Day_Moving_Average,Twenty_Day_Moving_Average,True_Range,Average_True_Range,Positive_Directional_Movement,Negative_Directional_Movement,Outcome
0,1_1,1,1,-0.33308,-0.99681,-0.98736,-0.94728,-0.91977,-0.21129,-0.38778,-0.39903,-0.14321,1
1,3_1,1,3,-0.67096,-2.08895,-2.08251,-2.02183,-1.93779,-0.46476,0.97308,-0.31078,0.80286,0
2,4_1,1,4,-0.2204,1.07839,1.09143,1.18737,,-0.51638,-0.52368,-0.2414,-0.3191,0
3,6_1,1,6,-0.23118,-0.55106,-0.53976,-0.47569,-0.42449,-0.60816,-0.69311,-0.40673,-0.45965,0
4,10_1,1,10,0.15076,-1.40937,-1.39805,-1.36756,-1.31365,-0.33206,-0.4369,-0.33347,0.29217,0
5,11_1,1,11,-0.57249,0.61078,0.62465,0.6451,0.7629,1.21998,1.02812,1.44735,-0.4043,1
6,12_1,1,12,0.9195,0.69282,0.69202,0.72082,0.7039,0.05071,-0.79388,-0.4392,-0.4778,0
7,13_1,1,13,0.36579,-0.61906,-0.59099,-0.50427,-0.45347,-0.17573,-0.18656,-0.42729,-0.05086,0
8,14_1,1,14,-0.37675,-1.07755,-1.06405,-1.02723,-0.99213,-0.45472,-0.78394,-0.3944,-0.39275,0
9,15_1,1,15,0.19038,3.73614,3.76318,3.81298,3.82002,1.3463,1.54563,-0.36749,0.18381,0


In [6]:
# View test data head
test_data.head(10)

Unnamed: 0,ID,timestamp,Stock_ID,Volume,Three_Day_Moving_Average,Five_Day_Moving_Average,Ten_Day_Moving_Average,Twenty_Day_Moving_Average,True_Range,Average_True_Range,Positive_Directional_Movement,Negative_Directional_Movement
0,1_554,554,1,0.43959,-1.09885,-1.08146,-1.02367,-0.95527,0.11595,0.17058,-0.39903,-0.33365
1,2_554,554,2,-0.38494,-0.64131,-0.6434,-0.63477,-0.61062,-0.47816,-0.48696,-0.28545,-0.30166
2,3_554,554,3,1.38417,-0.53956,-0.46261,-0.22662,-0.15185,0.27538,0.86787,-0.31078,-0.4377
3,4_554,554,4,-0.2204,-1.64317,-1.66702,-1.66951,-1.58663,-0.51638,0.04259,-0.2414,-0.3191
4,6_554,554,6,-0.215,0.14539,0.12088,0.15728,0.13933,0.10491,-0.63238,-0.40673,-0.45965
5,7_554,554,7,0.3054,0.50061,0.42719,0.55069,0.38243,3.18863,1.57867,-0.35039,-0.50291
6,9_554,554,9,-0.24666,-1.42346,-1.41889,-1.3901,-1.40196,-0.22637,0.06181,-0.33927,-0.46579
7,10_554,554,10,2.67942,-0.3291,-0.30959,-0.19959,-0.0602,1.67859,3.06438,-0.33347,-0.48861
8,11_554,554,11,-0.10141,-0.99765,-0.98638,-0.96689,-0.96781,-0.81608,-0.14644,-0.39786,-0.4043
9,12_554,554,12,0.11787,-2.15749,-2.16635,-2.16211,-2.14986,-0.42887,-0.18222,-0.4392,-0.4778


In [7]:
train_data.dtypes

ID                                object
timestamp                          int64
Stock_ID                           int64
Volume                           float64
Three_Day_Moving_Average         float64
Five_Day_Moving_Average          float64
Ten_Day_Moving_Average           float64
Twenty_Day_Moving_Average        float64
True_Range                       float64
Average_True_Range               float64
Positive_Directional_Movement    float64
Negative_Directional_Movement    float64
Outcome                            int64
dtype: object

In [8]:
test_data.dtypes

ID                                object
timestamp                          int64
Stock_ID                           int64
Volume                           float64
Three_Day_Moving_Average         float64
Five_Day_Moving_Average          float64
Ten_Day_Moving_Average           float64
Twenty_Day_Moving_Average        float64
True_Range                       float64
Average_True_Range               float64
Positive_Directional_Movement    float64
Negative_Directional_Movement    float64
dtype: object

In [9]:
# View train data shape
train_data.shape

(702739, 13)

In [10]:
# Check for null values in train data
train_data.isnull().any()

ID                               False
timestamp                        False
Stock_ID                         False
Volume                           False
Three_Day_Moving_Average          True
Five_Day_Moving_Average           True
Ten_Day_Moving_Average            True
Twenty_Day_Moving_Average         True
True_Range                       False
Average_True_Range                True
Positive_Directional_Movement     True
Negative_Directional_Movement     True
Outcome                          False
dtype: bool

In [11]:
# View test data shape
test_data.shape

(101946, 12)

In [12]:
# Check for null values in test data
test_data.isnull().any()

ID                               False
timestamp                        False
Stock_ID                         False
Volume                           False
Three_Day_Moving_Average          True
Five_Day_Moving_Average           True
Ten_Day_Moving_Average            True
Twenty_Day_Moving_Average         True
True_Range                       False
Average_True_Range                True
Positive_Directional_Movement     True
Negative_Directional_Movement     True
dtype: bool

In [13]:
def num_missing(x):
  return sum(x.isnull())

In [14]:
print "Missing values per column:"
print train_data.apply(num_missing, axis=0) # axis=0 defines that function is to be applied on each column

Missing values per column:
ID                                  0
timestamp                           0
Stock_ID                            0
Volume                              0
Three_Day_Moving_Average          448
Five_Day_Moving_Average           902
Ten_Day_Moving_Average           2047
Twenty_Day_Moving_Average        4404
True_Range                          0
Average_True_Range               2985
Positive_Directional_Movement     224
Negative_Directional_Movement     224
Outcome                             0
dtype: int64


In [15]:
print "Missing values per column:"
print test_data.apply(num_missing, axis=0)

Missing values per column:
ID                                 0
timestamp                          0
Stock_ID                           0
Volume                             0
Three_Day_Moving_Average          54
Five_Day_Moving_Average          127
Ten_Day_Moving_Average           280
Twenty_Day_Moving_Average        587
True_Range                         0
Average_True_Range               401
Positive_Directional_Movement    100
Negative_Directional_Movement     61
dtype: int64


In [16]:
def fill_na(df):
    for col in train_data.columns[train_data.isnull().any()]:
        df[col] = df.groupby('Stock_ID').transform(lambda x: x.fillna(x.mean()))[col]
    return df

In [17]:
train_data = fill_na(train_data)
test_data = fill_na(test_data)

In [18]:
print train_data.apply(num_missing, axis=0)

ID                                0
timestamp                         0
Stock_ID                          0
Volume                            0
Three_Day_Moving_Average          2
Five_Day_Moving_Average           2
Ten_Day_Moving_Average            2
Twenty_Day_Moving_Average        30
True_Range                        0
Average_True_Range               14
Positive_Directional_Movement     2
Negative_Directional_Movement     2
Outcome                           0
dtype: int64


In [19]:
print test_data.apply(num_missing, axis=0)

ID                                0
timestamp                         0
Stock_ID                          0
Volume                            0
Three_Day_Moving_Average          3
Five_Day_Moving_Average           3
Ten_Day_Moving_Average            3
Twenty_Day_Moving_Average         9
True_Range                        0
Average_True_Range                0
Positive_Directional_Movement    76
Negative_Directional_Movement    37
dtype: int64


In [20]:
for col in train_data.columns[train_data.isnull().any()]:
    if train_data[col].isnull().any():
        train_data[col].fillna(np.mean(train_data[col].dropna()), inplace=True)

In [21]:
for col in test_data.columns[test_data.isnull().any()]:
    if test_data[col].isnull().any():
        test_data[col].fillna(np.mean(test_data[col].dropna()), inplace=True)

In [22]:
print train_data.apply(num_missing, axis=0)

ID                               0
timestamp                        0
Stock_ID                         0
Volume                           0
Three_Day_Moving_Average         0
Five_Day_Moving_Average          0
Ten_Day_Moving_Average           0
Twenty_Day_Moving_Average        0
True_Range                       0
Average_True_Range               0
Positive_Directional_Movement    0
Negative_Directional_Movement    0
Outcome                          0
dtype: int64


In [23]:
print test_data.apply(num_missing, axis=0)

ID                               0
timestamp                        0
Stock_ID                         0
Volume                           0
Three_Day_Moving_Average         0
Five_Day_Moving_Average          0
Ten_Day_Moving_Average           0
Twenty_Day_Moving_Average        0
True_Range                       0
Average_True_Range               0
Positive_Directional_Movement    0
Negative_Directional_Movement    0
dtype: int64


In [24]:
train_data['Stock_ID'].value_counts()

201     394
1535    380
159     380
1690    380
155     380
1179    380
668     380
1180    380
1692    380
669     380
1181    380
1693    380
158     380
1182    380
671     380
1689    380
672     380
1184    380
1696    380
673     380
1185    380
1697    380
162     380
674     380
163     380
1187    380
676     380
666     380
1177    380
1700    380
       ... 
1842     67
1950     67
1342     63
1910     63
154      63
1841     60
1833     59
46       59
1220     58
1770     58
834      57
2199     57
1436     52
112      52
1778     51
706      51
1446     41
589      40
807      39
2111     36
93       35
2169     32
593      32
202      27
1404     26
291      26
1753     16
1781     12
1442      1
1579      1
Name: Stock_ID, dtype: int64

In [25]:
# Import for Oversampling
from imblearn.over_sampling import RandomOverSampler

In [26]:
train_features = train_data.copy().drop(['ID', 'Stock_ID'], axis=1)
target = train_data.copy()['Stock_ID']

In [27]:
# 'Random over-sampling'
OS = RandomOverSampler(ratio='auto')
train_os_features, train_os_target = OS.fit_sample(train_features, target.values.ravel())


# Convert the generated numpy arrays into data frames
train_os_features = DataFrame(train_os_features, columns=train_features.columns)
train_os_target = DataFrame(train_os_target, columns=['Stock_ID'])

In [28]:
print train_os_target.shape
print train_os_target['Stock_ID'].value_counts()

(770270, 1)
1535    394
655     394
1679    394
144     394
2193    394
656     394
1168    394
2192    394
145     394
657     394
1169    394
1681    394
146     394
2195    394
658     394
1170    394
1682    394
2194    394
147     394
659     394
1171    394
148     394
2197    394
660     394
1684    394
2196    394
149     394
661     394
1173    394
1167    394
       ... 
842     394
1865    394
1861    394
1858    394
323     394
835     394
1347    394
1859    394
324     394
836     394
1348    394
1860    394
325     394
837     394
1349    394
326     394
1353    394
838     394
1350    394
1862    394
327     394
839     394
1351    394
328     394
840     394
1352    394
1864    394
329     394
841     394
2049    394
Name: Stock_ID, dtype: int64


In [29]:
train_os_features.columns

Index([u'timestamp', u'Volume', u'Three_Day_Moving_Average',
       u'Five_Day_Moving_Average', u'Ten_Day_Moving_Average',
       u'Twenty_Day_Moving_Average', u'True_Range', u'Average_True_Range',
       u'Positive_Directional_Movement', u'Negative_Directional_Movement',
       u'Outcome'],
      dtype='object')

In [30]:
train = train_os_features
train['Stock_ID'] = train_os_target

In [31]:
test = test_data.copy().drop(['ID'], axis=1)

In [32]:
train.head()

Unnamed: 0,timestamp,Volume,Three_Day_Moving_Average,Five_Day_Moving_Average,Ten_Day_Moving_Average,Twenty_Day_Moving_Average,True_Range,Average_True_Range,Positive_Directional_Movement,Negative_Directional_Movement,Outcome,Stock_ID
0,1.0,-0.28294,1.2905,1.29969,1.28249,1.3062,0.1226,0.74343,-0.36854,-0.33701,0.0,201
1,2.0,-0.22219,1.29899,1.29308,1.28696,1.2946,-0.11239,0.49819,-0.36854,0.06859,1.0,201
2,3.0,-0.26969,1.28303,1.29554,1.29302,1.29177,0.1696,0.34582,0.93883,-0.33701,0.0,201
3,4.0,-0.28515,1.31525,1.29474,1.28865,1.2841,0.43278,0.30597,-0.36854,-0.33701,0.0,201
4,5.0,-0.33707,1.24322,1.25112,1.27274,1.27269,0.63957,0.37504,-0.36854,-0.33701,1.0,201


In [33]:
# Imports for model
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBRegressor



In [34]:
# XGBoost Model

# Since this data is time-based, divide train data into two sets, one for training (till timestamp < 550), and other for testing (timestamp == 550)
# Then drop timestamp column (it is pure noise, since no year in train data matched that in test data)
# Fit XGBRegressor

xgbr = XGBRegressor(n_estimators=100, max_depth=8, silent=False)
x_train = train[train['timestamp'] < 549].drop(['Outcome'], axis=1)
y_train = train[train['timestamp'] < 549]['Outcome']
x_test = train[train['timestamp'] >= 549].drop(['Outcome'], axis=1)
y_test = train[train['timestamp'] >= 549]['Outcome']
x_train.drop(['timestamp'], axis=1, inplace=True)
x_test.drop(['timestamp'], axis=1, inplace=True)
xgbr.fit(x_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [35]:
# Predict values on test data
predicted_train = xgbr.predict(x_test)

In [36]:
# Print log-loss
print log_loss(y_test, predicted_train)

0.667832626633


In [37]:
# Use complete data and predict values on actual test data
xgbr.fit(train.drop(['timestamp', 'Outcome'], axis=1), train['Outcome'])
predicted_xgbr = xgbr.predict(test[x_test.columns])

In [38]:
# Create solution dataframe
solution_xgbr = pd.DataFrame(test_data['ID'])
solution_xgbr = pd.concat([solution_xgbr, pd.DataFrame(predicted_xgbr, columns=['Outcome'])], axis=1)
solution_xgbr

Unnamed: 0,ID,Outcome
0,1_554,0.457646
1,2_554,0.386978
2,3_554,0.466758
3,4_554,0.211573
4,6_554,0.439988
5,7_554,0.453570
6,9_554,0.409505
7,10_554,0.512541
8,11_554,0.440608
9,12_554,0.614689


In [39]:
# Export solution to csv file
solution_xgbr.to_csv('../submissions/solution_xgbr.csv', index=False)

In [40]:
import xgboost as xgb
params = {"objective": "binary:logistic",
          "booster" : "gbtree",
          "eta": 0.05,
          "max_depth": 8,
          "subsample": 0.75,
          "colsample_bytree": 0.5,
          'gamma':1.0,
          'min_child_weight':5,
          "silent": 1,
          "seed": 1301,
          "eval_metric": 'logloss'
          }
num_boost_round = 200

In [41]:
print("Train a XGBoost model")
dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_test, y_test)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, verbose_eval=True)

Train a XGBoost model
[0]	train-logloss:0.691776	eval-logloss:0.69171
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 50 rounds.
[1]	train-logloss:0.69051	eval-logloss:0.688849
[2]	train-logloss:0.689551	eval-logloss:0.687767
[3]	train-logloss:0.688643	eval-logloss:0.686589
[4]	train-logloss:0.687744	eval-logloss:0.68557
[5]	train-logloss:0.686808	eval-logloss:0.684415
[6]	train-logloss:0.685961	eval-logloss:0.683644
[7]	train-logloss:0.685202	eval-logloss:0.684121
[8]	train-logloss:0.68452	eval-logloss:0.682736
[9]	train-logloss:0.68404	eval-logloss:0.681791
[10]	train-logloss:0.68348	eval-logloss:0.680854
[11]	train-logloss:0.682914	eval-logloss:0.680067
[12]	train-logloss:0.682564	eval-logloss:0.679785
[13]	train-logloss:0.682156	eval-logloss:0.679078
[14]	train-logloss:0.681645	eval-logloss:0.67857
[15]	train-logloss:0.681337	eval-logloss:0.678116
[16]	train-logloss:0.680801	eval-logloss:0.677

In [42]:
# Use complete data and predict values on actual test data

print("Train a XGBoost model")
dtrain = xgb.DMatrix(train.drop(['timestamp', 'Outcome'], axis=1), train['Outcome'])

watchlist = [(dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, verbose_eval=True)

Train a XGBoost model
[0]	train-logloss:0.692
Will train until train-logloss hasn't improved in 50 rounds.
[1]	train-logloss:0.691187
[2]	train-logloss:0.690055
[3]	train-logloss:0.68887
[4]	train-logloss:0.687999
[5]	train-logloss:0.687253
[6]	train-logloss:0.686332
[7]	train-logloss:0.685477
[8]	train-logloss:0.684914
[9]	train-logloss:0.684151
[10]	train-logloss:0.683589
[11]	train-logloss:0.683155
[12]	train-logloss:0.682657
[13]	train-logloss:0.682091
[14]	train-logloss:0.681556
[15]	train-logloss:0.681122
[16]	train-logloss:0.680586
[17]	train-logloss:0.680098
[18]	train-logloss:0.679787
[19]	train-logloss:0.679535
[20]	train-logloss:0.679141
[21]	train-logloss:0.678694
[22]	train-logloss:0.678354
[23]	train-logloss:0.678031
[24]	train-logloss:0.677747
[25]	train-logloss:0.677495
[26]	train-logloss:0.677229
[27]	train-logloss:0.677023
[28]	train-logloss:0.676788
[29]	train-logloss:0.676568
[30]	train-logloss:0.676334
[31]	train-logloss:0.676125
[32]	train-logloss:0.675963
[33]	tr

In [43]:
predicted_xgb = gbm.predict(xgb.DMatrix(test[x_train.columns]))

In [44]:
# Create solution dataframe
solution_xgb = pd.DataFrame(test_data['ID'])
solution_xgb = pd.concat([solution_xgb, pd.DataFrame(predicted_xgb, columns=['Outcome'])], axis=1)
solution_xgb

Unnamed: 0,ID,Outcome
0,1_554,0.524737
1,2_554,0.399330
2,3_554,0.472769
3,4_554,0.207324
4,6_554,0.446963
5,7_554,0.486172
6,9_554,0.460272
7,10_554,0.576876
8,11_554,0.471126
9,12_554,0.650570


In [45]:
# Export solution to csv file
solution_xgb.to_csv('../submissions/solution_xgb.csv', index=False)