In [1]:
#Import all the Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb

In [2]:
#reading training and test set 
jainstreet_train_df=pd.read_csv('jainstreet_train.csv')
jainstreet_test_df =pd.read_csv('jainstreet_test.csv')

In [3]:
print("training  dataset shape:",jainstreet_train_df.shape)

print("testing  dataset shape:",jainstreet_test_df.shape)


training  dataset shape: (549044, 138)
testing  dataset shape: (15219, 133)


In [4]:
print("training  dataset columns:",jainstreet_train_df.columns)

print("\ntesting  dataset columns:",jainstreet_test_df.columns)

training  dataset columns: Index(['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp',
       'feature_0', 'feature_1', 'feature_2',
       ...
       'feature_121', 'feature_122', 'feature_123', 'feature_124',
       'feature_125', 'feature_126', 'feature_127', 'feature_128',
       'feature_129', 'ts_id'],
      dtype='object', length=138)

testing  dataset columns: Index(['weight', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8',
       ...
       'feature_122', 'feature_123', 'feature_124', 'feature_125',
       'feature_126', 'feature_127', 'feature_128', 'feature_129', 'date',
       'ts_id'],
      dtype='object', length=133)


In [5]:
jainstreet_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549044 entries, 0 to 549043
Columns: 138 entries, date to ts_id
dtypes: float64(136), int64(2)
memory usage: 578.1 MB


In [6]:
jainstreet_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15219 entries, 0 to 15218
Columns: 133 entries, weight to ts_id
dtypes: float64(130), int64(3)
memory usage: 15.4 MB


In [7]:
jainstreet_train_df.head(3)

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0.0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1.0
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2.0


In [8]:
jainstreet_test_df.head(3)

Unnamed: 0,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,date,ts_id
0,0.0,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,...,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0,0
1,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,...,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,0,1
2,0.0,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,0,2


In [9]:
jainstreet_train_df["action"]=((jainstreet_train_df["weight"].values * (jainstreet_train_df["resp_1"]+\
jainstreet_train_df["resp_2"]+jainstreet_train_df["resp_3"]+\
jainstreet_train_df["resp_4"]+jainstreet_train_df["resp"]).values)/4>0).astype('int')

In [10]:
jainstreet_train_df.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,action
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0.0,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1.0,0
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2.0,0
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3.0,0
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4.0,0


In [11]:
jainstreet_train_df.tail()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,action
549039,91,0.136693,-0.000433,-0.002033,-0.009515,-0.017946,-0.013704,-1,-1.735497,-1.273775,...,0.111983,-0.749697,0.10147,-0.894643,0.022253,-0.9599,0.068556,-0.805562,549039.0,0
549040,91,6.11259,2.5e-05,-0.00123,-0.007299,-0.024056,-0.016897,-1,-1.261235,-1.148765,...,-1.275057,-0.645166,-1.086997,-1.121311,-1.78508,-0.894832,-1.526469,-0.718331,549040.0,0
549041,91,0.44352,0.00051,0.000602,0.003118,0.005546,0.003296,-1,-2.488309,-2.42638,...,-0.939181,-0.735675,-0.797741,-1.032481,-1.334528,-0.973659,-1.128223,-0.792549,549041.0,1
549042,91,0.0,7.1e-05,0.000198,-0.008014,-0.015945,-0.008168,1,-1.446283,-0.875787,...,-0.870838,0.861704,-1.107797,-0.481247,-1.727085,-0.229617,-1.282271,0.234548,549042.0,0
549043,91,0.319476,0.005469,0.003734,0.004155,0.009226,0.006811,1,6.365317,6.080791,...,,,,,,,,,,1


In [12]:
cols=['resp_1','resp_2','resp_3','resp_4','resp']

jainstreet_train_df.drop(cols,axis=1 ,inplace=True)

In [14]:
jainstreet_train_df


Unnamed: 0,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,date,ts_id,action
0,0.000000,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,...,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0,0.0,0
1,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,...,1.777472,-0.915458,2.831612,-1.417010,2.297459,-1.304614,1.898684,0,1.0,0
2,0.000000,-1,0.812780,-0.256156,0.806463,0.400221,-0.614188,-0.354800,,,...,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,0,2.0,0
3,0.000000,-1,1.174378,0.344640,0.066872,0.009357,-1.006373,-0.676458,,,...,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,0,3.0,0
4,0.138531,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.143780,,,...,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549039,0.136693,-1,-1.735497,-1.273775,-2.191633,-3.329210,-2.646825,-4.095454,-1.018309,-1.610825,...,-0.749697,0.101470,-0.894643,0.022253,-0.959900,0.068556,-0.805562,91,549039.0,0
549040,6.112590,-1,-1.261235,-1.148765,-0.076187,-0.073584,-0.978200,-1.123366,0.435077,0.640957,...,-0.645166,-1.086997,-1.121311,-1.785080,-0.894832,-1.526469,-0.718331,91,549040.0,0
549041,0.443520,-1,-2.488309,-2.426380,-1.455696,-1.737868,-1.476611,-1.845737,0.334845,0.563210,...,-0.735675,-0.797741,-1.032481,-1.334528,-0.973659,-1.128223,-0.792549,91,549041.0,1
549042,0.000000,1,-1.446283,-0.875787,-0.041559,-0.029367,1.320933,2.105668,-0.880145,-1.369942,...,0.861704,-1.107797,-0.481247,-1.727085,-0.229617,-1.282271,0.234548,91,549042.0,0


In [18]:
jainstreet_train_df.isna().sum()

weight           0
feature_0        0
feature_1        0
feature_2        0
feature_3      325
              ... 
feature_128    418
feature_129    418
date             0
ts_id            1
action           0
Length: 134, dtype: int64

In [19]:
# Fill NaNs with mean of column:
jainstreet_train_df.fillna(jainstreet_train_df.median(), inplace = True)

In [20]:
jainstreet_test_df.isna().sum()

weight          0
feature_0       0
feature_1       0
feature_2       0
feature_3       0
               ..
feature_127    42
feature_128     5
feature_129     5
date            0
ts_id           0
Length: 133, dtype: int64

In [21]:
jainstreet_test_df.fillna(jainstreet_train_df.median(), inplace = True)

In [22]:
jainstreet_train_df = jainstreet_train_df[jainstreet_train_df['weight'] != 0]


jainstreet_train_df = jainstreet_train_df[jainstreet_train_df['date']>85]

x_train = jainstreet_train_df.loc[:, jainstreet_train_df.columns.str.contains('feature')]
y_train = jainstreet_train_df.loc[:, 'action']

In [23]:
#dividing the target and independent features  of the dataset 
X = jainstreet_train_df.drop('action', axis=1)
Y = jainstreet_train_df[['action']]

In [24]:
#Split the data into training and validation set 

x_train, x_cv, y_train, y_cv = train_test_split(X, Y, test_size=0.1, random_state=0)

In [25]:
# extreme gradient boosting Algorithm

xgbmodel1 = xgb.XGBClassifier(colsample_bylevel=0.9,
                              colsample_bytree=0.8,
                              gamma=0.99,
                              maxdepth=5,
                            min_child_weight=1,
                            n_estimators=10,
                            nthread=4,
                            random_state=2,
                            silent=True)
xgbmodel1.fit(x_train, y_train)
print("\n")
print("Extreme Gradient Boosting algorithms Score :",xgbmodel1.score(x_cv,y_cv))

Y_xgbpred = xgbmodel1.predict(x_cv)

# summarize the fit of the xgboost model
print(metrics.classification_report(y_cv, Y_xgbpred))

Parameters: { maxdepth, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Extreme Gradient Boosting algorithms Score : 0.6215297450424929
              precision    recall  f1-score   support

           0       0.63      0.57      0.60       871
           1       0.62      0.67      0.64       894

    accuracy                           0.62      1765
   macro avg       0.62      0.62      0.62      1765
weighted avg       0.62      0.62      0.62      1765



In [27]:
pred_xgb_test=xgbmodel1.predict(jainstreet_test_df)

In [29]:
final_submit_stockprediction = pd.DataFrame({'ts_id':jainstreet_test_df['ts_id'],'action':pred_xgb_test})
final_submit_stockprediction


Unnamed: 0,ts_id,action
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
15214,15214,0
15215,15215,1
15216,15216,0
15217,15217,1


In [31]:
final_submit_stockprediction.to_csv('submission_StockPrediction.csv',index=False)