In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import warnings # Ignores any warning
warnings.filterwarnings("ignore")

In [2]:
from sklearn import preprocessing 

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV

### Train Factor data

In [3]:
# Load the raw factor table for the training period.
data = pd.read_csv("train_factors-1573207730757.csv")

In [4]:
data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,alpha
0,1,21/08/18,$NTAP,-0.628652,0.988891,-0.055714,0.774379,0.551089,-1.329229,-0.995539,2
1,2,11/10/18,$WYNN,1.315786,1.438754,0.187327,0.608933,-1.15303,1.859441,0.730995,3
2,3,21/08/18,$DRI,-1.141388,-1.455016,0.332755,0.674502,0.111326,-0.478597,-1.488157,1
3,4,10/07/18,$ge,-0.054839,-1.454149,-0.162267,-0.68187,0.307869,-0.529987,0.404172,2
4,5,12/09/18,$FE,-0.686366,0.838865,0.07383,0.679024,0.329463,1.262782,-1.024042,2


In [5]:
data.describe()

Unnamed: 0,Id,SF1,SF2,SF3,SF4,SF5,SF6,SF7,alpha
count,27006.0,27006.0,27006.0,27006.0,27006.0,27006.0,27006.0,27006.0,27006.0
mean,13503.5,0.005429,-0.001005,-0.024421,-0.04242,0.041465,0.002597,0.036744,2.379582
std,7796.105021,0.941853,0.99613,0.539555,1.111309,0.976028,0.998839,1.204473,1.008681
min,1.0,-4.11467,-4.578587,-2.60915,-6.329798,-5.480268,-4.338751,-5.347869,1.0
25%,6752.25,-0.596792,-0.66631,-0.300187,-0.767247,-0.518641,-0.668027,-0.688198,2.0
50%,13503.5,0.012104,-0.013257,-0.040786,-0.058629,0.072127,0.007417,0.081374,2.0
75%,20254.75,0.611574,0.669094,0.241424,0.677433,0.616538,0.674592,0.807556,3.0
max,27006.0,4.415048,3.82227,3.452486,4.907407,4.779911,4.075428,5.182912,4.0


In [6]:
data.shape #checking the shape of the data

(27006, 11)

In [7]:
data.isnull().sum() #checking the null values

Id        0
date      0
ticker    0
SF1       0
SF2       0
SF3       0
SF4       0
SF5       0
SF6       0
SF7       0
alpha     0
dtype: int64

In [8]:
# The raw dates are day-first strings with a two-digit year (e.g. "21/08/18");
# parse them into proper datetime values.
_raw_train_dates = data['date'].astype(str)
data['date'] = pd.to_datetime(_raw_train_dates, format='%d/%m/%y')

In [9]:
# Derive calendar features from the date column; parse once and reuse the accessor.
_train_dt = pd.to_datetime(data['date']).dt
data['Date'] = _train_dt.day          # day of the month
data['Month'] = _train_dt.month       # month number
data['Year'] = _train_dt.year         # four-digit year
data['Weekday'] = _train_dt.weekday   # Monday=0 ... Sunday=6

In [10]:
data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,alpha,Date,Month,Year,Weekday
0,1,2018-08-21,$NTAP,-0.628652,0.988891,-0.055714,0.774379,0.551089,-1.329229,-0.995539,2,21,8,2018,1
1,2,2018-10-11,$WYNN,1.315786,1.438754,0.187327,0.608933,-1.15303,1.859441,0.730995,3,11,10,2018,3
2,3,2018-08-21,$DRI,-1.141388,-1.455016,0.332755,0.674502,0.111326,-0.478597,-1.488157,1,21,8,2018,1
3,4,2018-07-10,$ge,-0.054839,-1.454149,-0.162267,-0.68187,0.307869,-0.529987,0.404172,2,10,7,2018,1
4,5,2018-09-12,$FE,-0.686366,0.838865,0.07383,0.679024,0.329463,1.262782,-1.024042,2,12,9,2018,2


In [14]:
# Remove every literal '$' from the ticker symbols (handles both "$X" and "$$X");
# regex=False keeps '$' from being read as an end-of-string anchor.
data['ticker'] = data['ticker'].str.replace('$', '', regex=False)

In [15]:
# Normalise ticker symbols to upper case so mixed-case entries such as "ge"
# share one key with "GE" when joining against the tweet data.
data['ticker'] = data['ticker'].str.upper() 

In [16]:
data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,alpha,Date,Month,Year,Weekday
0,1,2018-08-21,NTAP,-0.628652,0.988891,-0.055714,0.774379,0.551089,-1.329229,-0.995539,2,21,8,2018,1
1,2,2018-10-11,WYNN,1.315786,1.438754,0.187327,0.608933,-1.15303,1.859441,0.730995,3,11,10,2018,3
2,3,2018-08-21,DRI,-1.141388,-1.455016,0.332755,0.674502,0.111326,-0.478597,-1.488157,1,21,8,2018,1
3,4,2018-07-10,GE,-0.054839,-1.454149,-0.162267,-0.68187,0.307869,-0.529987,0.404172,2,10,7,2018,1
4,5,2018-09-12,FE,-0.686366,0.838865,0.07383,0.679024,0.329463,1.262782,-1.024042,2,12,9,2018,2


In [17]:
data['ticker'].nunique()

872

In [18]:
data.dtypes

Id                  int64
date       datetime64[ns]
ticker             object
SF1               float64
SF2               float64
SF3               float64
SF4               float64
SF5               float64
SF6               float64
SF7               float64
alpha               int64
Date                int64
Month               int64
Year                int64
Weekday             int64
dtype: object

#### Load the cleaned JSON train data

In [19]:
final_cleaned  = pd.read_csv("Cleaned_Json_TrainData.csv")

In [20]:
final_cleaned.isnull().sum()

Tweet              0
Sentiment_score    0
Stock              0
date               0
time               0
Date               0
Month              0
Year               0
Weekday            0
word_count         0
char_count         0
numerics           0
dtype: int64

In [21]:
final_cleaned.shape

(1039131, 12)

In [22]:
final_cleaned.head()

Unnamed: 0,Tweet,Sentiment_score,Stock,date,time,Date,Month,Year,Weekday,word_count,char_count,numerics
0,amd going up but hesitating however chart is ...,3,AMD,2018-09-19,18:38:28+00:00,19,9,2018,2,13,74,0
1,despite china trade war cat held very well t...,3,CAT,2018-10-09,03:51:06+00:00,9,10,2018,1,11,69,0
2,avgo wtf,2,AVGO,2018-07-12,13:35:32+00:00,12,7,2018,3,2,10,0
3,ph new insider filing on muller klaus peter ...,2,PH,2018-07-19,03:32:50+00:00,19,7,2018,3,16,69,0
4,fb if it bounces tommorrow do the right thing...,3,FB,2018-08-23,19:07:54+00:00,23,8,2018,3,11,55,0


#### Group the JSON train data by date, stock and sentiment score

In [24]:
# Count tweets per (date, stock, sentiment score), pivot the sentiment score into
# columns, treat missing combinations as zero tweets, and persist the table.
_train_tweet_counts = (
    final_cleaned
    .groupby(by=["date", "Stock", "Sentiment_score"])
    .count()["Tweet"]
    .unstack()
    .fillna(0)
)
_train_tweet_counts.to_csv("GroupBy_Dummy_Count.csv")

In [25]:
# Read the pivot back from disk: the CSV round trip turns the (date, Stock) index
# levels into ordinary columns and the sentiment-score labels into the string
# column names '0'..'4'.
train_tweets_group = pd.read_csv("GroupBy_Dummy_Count.csv")

In [26]:
train_tweets_group.head()

Unnamed: 0,date,Stock,0,1,2,3,4
0,2018-07-01,AABA,0.0,0.0,1.0,0.0,0.0
1,2018-07-01,AAL,0.0,1.0,1.0,3.0,1.0
2,2018-07-01,AAP,0.0,1.0,0.0,0.0,0.0
3,2018-07-01,AAPL,1.0,2.0,31.0,5.0,2.0
4,2018-07-01,ABBV,2.0,0.0,0.0,0.0,1.0


In [27]:
# Cast the 'date' join key to plain strings on both sides so the merge compares
# like with like (the factor frame holds datetimes, the tweet counts hold text).
for _frame in (train_tweets_group, data):
    _frame['date'] = _frame['date'].astype(str)

In [28]:
train_tweets_group.dtypes

date      object
Stock     object
0        float64
1        float64
2        float64
3        float64
4        float64
dtype: object

In [29]:
data.dtypes

Id           int64
date        object
ticker      object
SF1        float64
SF2        float64
SF3        float64
SF4        float64
SF5        float64
SF6        float64
SF7        float64
alpha        int64
Date         int64
Month        int64
Year         int64
Weekday      int64
dtype: object

In [30]:
# Left-join the daily sentiment counts onto the factor rows; factor rows without
# any tweets for that (date, ticker) keep NaN in the count columns.
newdf = data.merge(
    train_tweets_group,
    how='left',
    left_on=['date', 'ticker'],
    right_on=['date', 'Stock'],
)
newdf.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,...,Date,Month,Year,Weekday,Stock,0,1,2,3,4
0,1,2018-08-21,NTAP,-0.628652,0.988891,-0.055714,0.774379,0.551089,-1.329229,-0.995539,...,21,8,2018,1,NTAP,0.0,3.0,7.0,2.0,1.0
1,2,2018-10-11,WYNN,1.315786,1.438754,0.187327,0.608933,-1.15303,1.859441,0.730995,...,11,10,2018,3,WYNN,1.0,2.0,10.0,3.0,1.0
2,3,2018-08-21,DRI,-1.141388,-1.455016,0.332755,0.674502,0.111326,-0.478597,-1.488157,...,21,8,2018,1,DRI,0.0,1.0,2.0,0.0,0.0
3,4,2018-07-10,GE,-0.054839,-1.454149,-0.162267,-0.68187,0.307869,-0.529987,0.404172,...,10,7,2018,1,GE,5.0,11.0,58.0,29.0,23.0
4,5,2018-09-12,FE,-0.686366,0.838865,0.07383,0.679024,0.329463,1.262782,-1.024042,...,12,9,2018,2,FE,1.0,0.0,0.0,0.0,0.0


In [31]:
newdf.columns

Index(['Id', 'date', 'ticker', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7',
       'alpha', 'Date', 'Month', 'Year', 'Weekday', 'Stock', '0', '1', '2',
       '3', '4'],
      dtype='object')

In [32]:
newdf.isnull().sum()

Id            0
date          0
ticker        0
SF1           0
SF2           0
SF3           0
SF4           0
SF5           0
SF6           0
SF7           0
alpha         0
Date          0
Month         0
Year          0
Weekday       0
Stock      2167
0          2167
1          2167
2          2167
3          2167
4          2167
dtype: int64

In [33]:
# 'Stock' duplicates 'ticker' after the join, so discard it.
newdf = newdf.drop(columns=['Stock'])

In [34]:
# Factor rows that found no matching tweets have NaN in the sentiment-count
# columns; treat those as zero tweets in every bucket.
newdf = newdf.fillna(0)

In [35]:
newdf.shape

(27006, 20)

In [None]:
# Persist the merged factor + sentiment table without the index.
# NOTE(review): the output name has no ".csv" extension and contains a space — confirm this is intended.
newdf.to_csv("Factor_Train_merged data",index=False)

### Validation on train data

In [58]:
# Baseline design matrix: the seven stock factors only, with alpha as the label.
_factor_cols = ['SF%d' % idx for idx in range(1, 8)]
x_t = data[_factor_cols]
y_t = data['alpha']

In [59]:
from sklearn.model_selection import train_test_split
# Hold out 30% for validation, keeping the alpha class proportions equal in both parts.
# Stratify on y_t — the labels actually being split here. The previous `stratify = y`
# referred to a variable that is only created further down the notebook, so this cell
# failed with a NameError on a fresh top-to-bottom run.
xTrain, xVal, yTrain, yVal = train_test_split(x_t, y_t, test_size = .3, stratify = y_t, random_state = 19)

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Random forest on the seven factors only. The forest is seeded so that the
# validation score is reproducible across kernel restarts (bootstrap sampling and
# feature sub-sampling are otherwise random on every run).
rfc = RandomForestClassifier(n_estimators = 80, random_state = 19)

rfc.fit(xTrain, yTrain)
y_pred_test_RF = rfc.predict(xVal)

# Macro-averaged F1: every alpha class contributes equally regardless of its frequency.
print(f1_score(yVal, y_pred_test_RF,average='macro'))

0.6570792023194278


##### Using Sentiment data

In [36]:
# Design matrix with sentiment: seven factors plus the five per-score tweet counts,
# which are stored under the string labels '0'..'4'.
_factor_cols = ['SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7']
_score_cols = [str(score) for score in range(5)]
x = newdf[_factor_cols + _score_cols]
y = newdf['alpha']

In [37]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the merged rows for validation with a fixed seed.
# Unlike the factor-only split above, this split is not stratified on alpha.
xTrain, xVal, yTrain, yVal = train_test_split(x, y, test_size = .3, random_state = 0)

#### Logistic Regression

In [38]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
# Linear baseline on factors + sentiment counts, fitted with the newton-cg solver.
log_reg = LogisticRegression(solver='newton-cg').fit(xTrain, yTrain)
test_pred = log_reg.predict(xVal)

In [39]:
print(f1_score(yVal,test_pred,average='macro'))

0.267257750711596


#### Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Random forest on factors + sentiment counts. The forest is seeded so that the
# comparison against the factor-only score does not move from run to run.
rfc = RandomForestClassifier(n_estimators = 80, random_state = 0)

rfc.fit(xTrain, yTrain)
y_pred_test_RF = rfc.predict(xVal)

# Macro-averaged F1 over the alpha classes on the held-out 30%.
print(f1_score(yVal, y_pred_test_RF,average='macro'))

0.6646709068257791


## Factor Test Data

In [42]:
# Load the raw factor table for the test period (no alpha column).
test_data = pd.read_csv("TestData/test_factors.csv")

In [43]:
test_data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7
0,270007,21/07/18,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449
1,270008,05/10/18,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353
2,270009,01/10/18,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811
3,270010,24/10/18,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803
4,270011,27/07/18,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111


In [44]:
test_data.describe()

Unnamed: 0,Id,SF1,SF2,SF3,SF4,SF5,SF6,SF7
count,11575.0,11575.0,11575.0,11575.0,11575.0,11575.0,11575.0,11575.0
mean,275794.0,0.001546,0.003995,-0.029754,-0.047606,0.053956,-0.000428,0.037914
std,3341.559017,0.924519,0.997903,0.537651,1.097842,0.965109,1.003503,1.192223
min,270007.0,-4.13041,-4.034445,-4.259227,-4.704038,-4.545787,-4.317883,-5.031257
25%,272900.5,-0.58047,-0.673206,-0.308838,-0.754051,-0.50228,-0.682896,-0.684029
50%,275794.0,0.011865,-0.001464,-0.04459,-0.055108,0.084598,0.008303,0.081955
75%,278687.5,0.600965,0.685805,0.228484,0.649046,0.620426,0.6808,0.807105
max,281581.0,4.519223,4.108285,2.763214,4.794511,5.646556,3.900014,6.005315


In [45]:
test_data.shape#checking the shape of test data

(11575, 10)

In [46]:
test_data.isnull().sum() #checking the null values in test data

Id        0
date      0
ticker    0
SF1       0
SF2       0
SF3       0
SF4       0
SF5       0
SF6       0
SF7       0
dtype: int64

In [47]:
# Same day-first, two-digit-year format as the train file (e.g. "21/07/18").
_raw_test_dates = test_data['date'].astype(str)
test_data['date'] = pd.to_datetime(_raw_test_dates, format='%d/%m/%y')

In [48]:
# Derive the same calendar features as for the train frame; parse once and reuse.
_test_dt = pd.to_datetime(test_data['date']).dt
test_data['Date'] = _test_dt.day          # day of the month
test_data['Month'] = _test_dt.month       # month number
test_data['Year'] = _test_dt.year         # four-digit year
test_data['Weekday'] = _test_dt.weekday   # Monday=0 ... Sunday=6

In [49]:
test_data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,Date,Month,Year,Weekday
0,270007,2018-07-21,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449,21,7,2018,5
1,270008,2018-10-05,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353,5,10,2018,4
2,270009,2018-10-01,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811,1,10,2018,0
3,270010,2018-10-24,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803,24,10,2018,2
4,270011,2018-07-27,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111,27,7,2018,4


In [50]:
# Remove every literal '$' from the test ticker symbols (handles "$X" and "$$X");
# regex=False keeps '$' from being read as an end-of-string anchor.
test_data['ticker'] = test_data['ticker'].str.replace('$', '', regex=False)

In [51]:
# Normalise ticker symbols to upper case so mixed-case entries such as "intc"
# share one key with "INTC" when joining against the tweet data.
test_data['ticker'] = test_data['ticker'].str.upper() 

In [52]:
test_data.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,Date,Month,Year,Weekday
0,270007,2018-07-21,INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449,21,7,2018,5
1,270008,2018-10-05,CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353,5,10,2018,4
2,270009,2018-10-01,CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811,1,10,2018,0
3,270010,2018-10-24,CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803,24,10,2018,2
4,270011,2018-07-27,INTC,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111,27,7,2018,4


### Load the cleaned JSON test data

In [51]:
json_log  = pd.read_csv("Cleaned_Json_TestData_WithLogScore.csv")

In [53]:
json_log.isnull().sum()

Tweet              176
Stock              176
date               176
time               176
Date               176
Month              176
Year               176
Weekday            176
Sentiment_score    176
dtype: int64

In [54]:
# Rows without a date, stock or sentiment score cannot be attributed to any
# (date, ticker) pair. Zero-filling them would fabricate a bogus group keyed on
# date=0 / Stock=0 and put integers into the otherwise textual key columns,
# so drop those rows first and only zero-fill what remains.
json_log = json_log.dropna(subset=['date', 'Stock', 'Sentiment_score']).fillna(0)

##### Group the JSON test data by date, stock and sentiment score

In [55]:
# Count tweets per (date, stock, sentiment score), pivot the sentiment score into
# columns, treat missing combinations as zero tweets, and persist the table.
_test_tweet_counts = (
    json_log
    .groupby(by=["date", "Stock", "Sentiment_score"])
    .count()["Tweet"]
    .unstack()
    .fillna(0)
)
_test_tweet_counts.to_csv("GroupBy_Dummy_Count_Log.csv")

In [56]:
log_tweets_group = pd.read_csv("GroupBy_Dummy_Count_Log.csv")

In [57]:
# Cast the 'date' join key to plain strings on BOTH sides of the upcoming merge.
# test_data['date'] was parsed to datetime earlier; without this cast the merge
# would try to join a datetime column against a string column.
log_tweets_group['date']= log_tweets_group['date'].astype(str)
test_data['date'] = test_data['date'].astype(str)
# Kept from the original cell: json_log is not part of the merge, but its date
# column is left as text as before.
json_log['date']=json_log['date'].astype(str)

In [59]:
# Left-join the daily sentiment counts onto the test factor rows; rows without
# any tweets for that (date, ticker) keep NaN in the count columns.
final_test_log = test_data.merge(
    log_tweets_group,
    how='left',
    left_on=['date', 'ticker'],
    right_on=['date', 'Stock'],
)
final_test_log.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,Date,Month,Year,Weekday,Stock,0.0,1.0,2.0,3.0,4.0
0,270007,2018-07-21,INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449,21,7,2018,5,INTC,0.0,1.0,1.0,0.0,0.0
1,270008,2018-10-05,CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353,5,10,2018,4,CTSH,0.0,0.0,2.0,0.0,0.0
2,270009,2018-10-01,CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811,1,10,2018,0,CB,0.0,0.0,1.0,0.0,0.0
3,270010,2018-10-24,CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803,24,10,2018,2,CTAS,0.0,0.0,2.0,0.0,0.0
4,270011,2018-07-27,INTC,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111,27,7,2018,4,INTC,10.0,2.0,83.0,14.0,17.0


In [60]:
final_test_log.isnull().sum()

Id         0
date       0
ticker     0
SF1        0
SF2        0
SF3        0
SF4        0
SF5        0
SF6        0
SF7        0
Date       0
Month      0
Year       0
Weekday    0
Stock      1
0.0        1
1.0        1
2.0        1
3.0        1
4.0        1
dtype: int64

In [61]:
# 'Stock' duplicates 'ticker' after the join, so discard it.
final_test_log = final_test_log.drop(columns=['Stock'])

In [62]:
# Test rows that found no matching tweets have NaN in the sentiment-count
# columns; treat those as zero tweets in every bucket.
final_test_log = final_test_log.fillna(0)

#### Prediction Using only 7 factors

In [53]:
# Factor-only matrices: fit on the full train set, predict on the test set.
_factor_cols = ['SF%d' % idx for idx in range(1, 8)]
xTrain_2 = data[_factor_cols]
yTrain_2 = data['alpha']
xtest_2 = test_data[_factor_cols]

In [54]:
# Random forest on the seven factors, trained on the complete train set.
# Seeded so that re-running the notebook regenerates the same submission file.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 60, random_state = 0)

rfc.fit(xTrain_2, yTrain_2)
y_pred_test_rf_2 = rfc.predict(xtest_2)

In [55]:
# Pair every test row id with its predicted alpha class.
# NOTE(review): this frame uses a lowercase 'id' header, whereas the sentiment
# submissions further down use 'Id' — confirm which one the scorer expects.
rf_new_pred_2 = pd.DataFrame({'id':test_data['Id'],'alpha':y_pred_test_rf_2})

In [56]:
rf_new_pred_2.head()

Unnamed: 0,id,alpha
0,270007,4
1,270008,3
2,270009,2
3,270010,1
4,270011,4


In [57]:
rf_new_pred_2.to_csv('Submission_2_RandomForestPred.csv',index=False)

In [None]:
## Score was 0.62

#### Prediction using sentiment Score

In [64]:
# Train on the merged factor + sentiment frame (`newdf`). The previous `train_data`
# name is never defined in this notebook, so the cell failed on a fresh run.
_factor_cols = ['SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7']
_train_score_cols = ['0', '1', '2', '3', '4']
_test_score_cols = ['0.0', '1.0', '2.0', '3.0', '4.0']

x_train = newdf[_factor_cols + _train_score_cols]
y_train = newdf['alpha']
# The test pivot labels its sentiment columns '0.0'..'4.0' while the train pivot
# uses '0'..'4'. Rename them so both matrices expose identical feature names;
# recent scikit-learn releases compare feature names between fit and predict.
x_test_1 = (
    final_test_log[_factor_cols + _test_score_cols]
    .rename(columns=dict(zip(_test_score_cols, _train_score_cols)))
)

###### Using Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# 80-tree forest on factors + sentiment counts, trained on the complete train set.
# Seeded so that re-running the notebook regenerates the same submission file.
rfc = RandomForestClassifier(n_estimators = 80, random_state = 0)

rfc.fit(x_train, y_train)
y_pred_test_rf = rfc.predict(x_test_1)

In [66]:
final_log = pd.DataFrame({"Id":final_test_log['Id'],"alpha":y_pred_test_rf})

In [67]:
final_log.to_csv('Submission_4_RandomForestPred_withSentiment_log.csv',index=False)

In [68]:
final_log.head()

Unnamed: 0,Id,alpha
0,270007,4
1,270008,4
2,270009,2
3,270010,1
4,270011,3


In [None]:
# Score was 0.66

###### With 100 estimators

In [71]:
# Train on the merged factor + sentiment frame (`newdf`). The previous `train_data`
# name is never defined in this notebook, so the cell failed on a fresh run.
_factor_cols = ['SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7']
_train_score_cols = ['0', '1', '2', '3', '4']
_test_score_cols = ['0.0', '1.0', '2.0', '3.0', '4.0']

x_train_2 = newdf[_factor_cols + _train_score_cols]
y_train_2 = newdf['alpha']
# The test pivot labels its sentiment columns '0.0'..'4.0' while the train pivot
# uses '0'..'4'. Rename them so both matrices expose identical feature names;
# recent scikit-learn releases compare feature names between fit and predict.
x_test_2 = (
    final_test_log[_factor_cols + _test_score_cols]
    .rename(columns=dict(zip(_test_score_cols, _train_score_cols)))
)

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Same model as the 80-tree run but with 100 trees. Seeded so that any score
# difference reflects the tree count rather than run-to-run randomness.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)

rfc.fit(x_train_2, y_train_2)
y_pred_test_rf_2 = rfc.predict(x_test_2)

In [74]:
final_log_2 = pd.DataFrame({"Id":final_test_log['Id'],"alpha":y_pred_test_rf_2})

In [75]:
final_log_2.to_csv('Submission_5_RandomForestPred_withSentiment_log.csv',index=False)

In [None]:
# Score was 0.66