In [39]:
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.regularizers import l2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [40]:
# Engineered (non-text) feature tables; each one carries a 'date' column.
# NOTE(review): 'nw' / 'tw' / 'FI' presumably stand for news, Twitter and
# financial indicators -- confirm against the data documentation.
featureDF_nw, featureDF_tw, featureFI = (
    pd.read_csv(csv_name)
    for csv_name in (
        'nw_feature_woText_MSFT.csv',
        'tw_feature_woText_MSFT.csv',
        'FIwithTargetwithFTT.csv',
    )
)

# BERT vector tables; these files have no 'date' column of their own, the
# dates are attached further down from the matching feature tables.
bert_featureDF_nw, bert_featureDF_tw = (
    pd.read_csv(csv_name)
    for csv_name in ('nw_BertVector.csv', 'tw_BertVector_MSFT.csv')
)

In [41]:
def getStartEndDateShape(dfz):
    """Print the shape of ``dfz`` and the first/last value of its 'date' column.

    Parameters
    ----------
    dfz : pandas.DataFrame
        Frame to describe. A 'date' column is optional.

    Returns
    -------
    None
        Everything is reported through ``print``.

    Notes
    -----
    Only the expected failure modes are handled (missing column, empty
    column, values that cannot be ordered); any other error propagates
    instead of being reported as "No Date column".
    """
    print('---------------------------')
    print('shape', dfz.shape)

    try:
        dates = dfz['date']
    except KeyError:
        print("No Date column")
        return

    # Missing dates would make min()/max() fail on string columns, so they
    # are left out of the range computation.
    dates = dates.dropna()
    if len(dates) == 0:
        print("Date column is empty")
        return

    try:
        startDate = str(min(dates))
        endDate = str(max(dates))
    except TypeError as err:
        # Mixed value types (e.g. strings and timestamps) cannot be ordered.
        print("Date column cannot be ordered:", err)
        return

    print('startDate', startDate)
    print('endDate', endDate)

In [42]:
def createComparableDataset(_df, start='2015/05/04', end='2019/03/28'):
    """Return the rows of ``_df`` whose 'date' lies in ``[start, end]``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a 'date' column (strings or datetimes).
    start, end : str or datetime-like, optional
        Inclusive bounds of the window. The defaults are the bounds this
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the window, with 'date' converted to
        datetime. The frame passed in is left unmodified.
    """
    dates = pd.to_datetime(_df['date'])
    in_window = (dates >= pd.to_datetime(start)) & (dates <= pd.to_datetime(end))
    # Work on a copy so the conversion below never writes into the caller's frame.
    comparable = _df.loc[in_window].copy()
    comparable['date'] = dates[in_window]
    return comparable
	
def createTrainingAndTestingSet(_df, split_date='2019/01/01'):
    """Split ``_df`` chronologically into a training and a testing frame.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a datetime 'date' column.
    split_date : str or datetime-like, optional
        First date that belongs to the test set. Defaults to the date this
        notebook has always used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``: rows strictly before ``split_date`` and rows on or
        after it. Rows whose date is missing satisfy neither comparison and
        therefore appear in neither frame.
    """
    cutoff = pd.to_datetime(split_date)
    train = _df[_df['date'] < cutoff]
    test = _df[_df['date'] >= cutoff]
    return train, test
	
def getFeaturesAndTargetRemoveDate(_df, target_col='Target', date_col='date'):
    """Separate the label column from the features and drop the date column.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame holding the features plus ``target_col`` and ``date_col``.
    target_col : str, optional
        Name of the label column. Defaults to 'Target'; pass 'target' for
        frames where the label was re-attached under the lowercase name.
    date_col : str, optional
        Name of the date column to remove. Defaults to 'date'.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``(features, labels)``. ``_df`` itself is not modified.
    """
    train_y = _df[target_col]
    train = _df.drop(columns=[date_col, target_col])
    return train, train_y

In [43]:
# Print shape and date range of every table that was loaded, in load order.
for _frame in (
    featureDF_nw,
    featureDF_tw,
    featureFI,
    bert_featureDF_nw,
    bert_featureDF_tw,
):
    getStartEndDateShape(_frame)

---------------------------
shape (4020, 24)
startDate 2004/07/21
endDate 2019/03/28
---------------------------
shape (1426, 25)
startDate 2015/05/04
endDate 2019/04/04
---------------------------
shape (4010, 32)
startDate 2003/04/30
endDate 2019/04/03
---------------------------
shape (4020, 768)
No Date column
---------------------------
shape (1426, 768)
No Date column


# Formulation of overall dataset

In [44]:
# Plain Python lists of the dates of the two feature tables; they are used
# to label the rows of the BERT tables, which have no date column.
nw_date = featureDF_nw['date'].values.tolist()
tw_date = featureDF_tw['date'].values.tolist()

In [45]:
# Attach the dates to the BERT tables purely by row position.
# NOTE(review): this assumes row i of each BERT file describes the same day
# as row i of the matching feature file -- only the row counts are seen to
# agree; confirm the row order upstream.
bert_featureDF_nw = bert_featureDF_nw.assign(date=nw_date)
bert_featureDF_tw = bert_featureDF_tw.assign(date=tw_date)

In [46]:
# Outer join keeps every date that occurs in either table; dates present in
# only one of them get NaN in the other table's columns.
result = featureFI.merge(featureDF_nw, how='outer', on='date')
getStartEndDateShape(result)

---------------------------
shape (4749, 55)
startDate 2003/04/30
endDate 2019/04/03


In [47]:
# Add the second engineered-feature table, again keeping the union of dates.
result = result.merge(featureDF_tw, how='outer', on='date')
getStartEndDateShape(result)

---------------------------
shape (5004, 79)
startDate 2003/04/30
endDate 2019/04/04


In [48]:
# --- start: variant in which only the engineered features are scaled (BERT vectors stay unscaled) ---

In [49]:
# Work on a copy so that `result` (engineered features, still unscaled and
# without the BERT columns at this point) stays available for the other variants.
resultScaled =result.copy()

In [50]:
# Set the two columns that must not be standardised aside, then remove them
# so that only feature columns remain in the frame.
tdate = resultScaled['date'].tolist()
ttarget = resultScaled['Target'].tolist()
resultScaled.drop(columns=['date', 'Target'], inplace=True)

In [51]:
# Standardise every engineered feature column; index and column labels are
# carried over unchanged.
# NOTE(review): the scaler is fitted on all rows, including dates that later
# end up in the test split (>= 2019/01/01), so test-period statistics leak
# into the scaling. Consider fitting on the training rows only.
scale = StandardScaler()
resultScaled = pd.DataFrame(
    data=scale.fit_transform(resultScaled.to_numpy()),
    index=resultScaled.index,
    columns=resultScaled.columns,
)

In [52]:
# Put the untouched date and label values back as the last two columns.
# NOTE(review): the label comes back as lowercase 'target', whereas the
# source column (and getFeaturesAndTargetRemoveDate's default) is 'Target'.
resultScaled = resultScaled.assign(date=tdate, target=ttarget)

In [53]:
# Join the first set of BERT vectors by date; they are added after scaling
# and therefore stay on their original scale.
resultScaled = resultScaled.merge(bert_featureDF_nw, how='outer', on='date')
getStartEndDateShape(resultScaled)

---------------------------
shape (5004, 847)
startDate 2003/04/30
endDate 2019/04/04


In [54]:
# Join the second set of BERT vectors by date, also left unscaled.
resultScaled = resultScaled.merge(bert_featureDF_tw, how='outer', on='date')
getStartEndDateShape(resultScaled)

---------------------------
shape (5004, 1615)
startDate 2003/04/30
endDate 2019/04/04


In [55]:
# Order rows by date, carry the last known value forward into gaps, and set
# whatever is still missing (rows before a column's first value) to 0.
# 'date' still holds 'YYYY/MM/DD' strings here, so the sort is lexicographic;
# it matches chronological order only because the format is zero-padded.
resultScaled = (
    resultScaled
    .sort_values(by='date')
    .ffill()
    .fillna(0)
)

In [56]:
# Convert the date strings to real timestamps for the date-range filters below.
resultScaled['date'] = pd.to_datetime(resultScaled['date'])

In [57]:
# --- end: variant in which only the engineered features are scaled ---

In [58]:
# Join the first set of BERT vectors onto the unscaled engineered features.
result = result.merge(bert_featureDF_nw, how='outer', on='date')
getStartEndDateShape(result)

---------------------------
shape (5004, 847)
startDate 2003/04/30
endDate 2019/04/04


In [59]:
# Join the second set of BERT vectors; `result` now holds every feature group.
result = result.merge(bert_featureDF_tw, how='outer', on='date')
getStartEndDateShape(result)

---------------------------
shape (5004, 1615)
startDate 2003/04/30
endDate 2019/04/04


In [60]:
# Order rows by date ('YYYY/MM/DD' strings at this point, so the sort is
# lexicographic) before the gaps are filled.
result = result.sort_values(by='date')

In [61]:
# Fill gaps: last known value forward first, then next known value backward
# for rows before a column's first observation, and 0 for anything left.
# NOTE(review): the backward fill copies later values into earlier rows;
# check that this look-ahead is acceptable for the affected date range.
result = result.ffill().bfill().fillna(0)

In [62]:
# Convert the date strings to real timestamps for the date-range filters below.
result['date'] = pd.to_datetime(result['date'])

In [63]:
# --- start: non-scaled data preparation ---

In [64]:
# Snapshot of the merged, gap-filled data taken before any scaling is applied
# to `result`; it still carries the 'date' and 'Target' columns.
resultNonScaledData = result.copy()

In [65]:
# --- end: non-scaled data preparation ---

In [66]:
# Set the two columns that must not be standardised aside, then remove them
# so that only feature columns remain in the frame.
date = result['date'].tolist()
target = result['Target'].tolist()
result.drop(columns=['date', 'Target'], inplace=True)

In [67]:
# Standardise every remaining column (engineered features and BERT vectors);
# index and column labels are carried over unchanged.
# NOTE(review): the scaler is fitted on all rows, including dates that later
# end up in the test split (>= 2019/01/01), so test-period statistics leak
# into the scaling. Consider fitting on the training rows only.
scale = StandardScaler()
result = pd.DataFrame(
    data=scale.fit_transform(result.to_numpy()),
    index=result.index,
    columns=result.columns,
)

In [68]:
# Put the untouched date and label values back as the last two columns.
# NOTE(review): the label comes back as lowercase 'target', whereas the
# source column (and getFeaturesAndTargetRemoveDate's default) is 'Target'.
result = result.assign(date=date, target=target)

In [69]:
# Save the fully scaled dataset (engineered features and BERT vectors are all
# standardised). This is written before the date-range filter, so the file
# covers every date in `result`.
result.to_csv('DFScaledNTFI_with_scaling.csv',index= False)

# Test and Train Split

In [70]:
# Restrict the fully scaled data to the comparison window, then split it
# chronologically into training and testing frames.
result =  createComparableDataset(result)
train , test = createTrainingAndTestingSet(result)

In [71]:
# Sanity check: shape and date range of the training and testing frames.
for _frame in (train, test):
    getStartEndDateShape(_frame)

---------------------------
shape (1336, 1615)
startDate 2015-05-04 00:00:00
endDate 2018-12-31 00:00:00
---------------------------
shape (87, 1615)
startDate 2019-01-01 00:00:00
endDate 2019-03-28 00:00:00


In [72]:
# Persist the fully scaled train/test split, without the index column.
train.to_csv(path_or_buf='train_bert_tnfi.csv', index=False)
test.to_csv(path_or_buf='test_bert_tnfi.csv', index=False)

In [73]:
# --- start: train/test split for the variant with only the engineered features scaled ---

In [74]:
# Restrict the engineered-features-scaled data to the comparison window and
# split it chronologically.
# NOTE(review): this rebinds `result`, replacing the fully scaled frame used
# for the earlier split; any later cell that reads `result` now sees this
# variant. A dedicated variable name would avoid mix-ups.
result= createComparableDataset(resultScaled)
train_s , test_s = createTrainingAndTestingSet(result)

In [75]:
# Sanity check: shape and date range of the training and testing frames.
for _frame in (train_s, test_s):
    getStartEndDateShape(_frame)

---------------------------
shape (1336, 1615)
startDate 2015-05-04 00:00:00
endDate 2018-12-31 00:00:00
---------------------------
shape (87, 1615)
startDate 2019-01-01 00:00:00
endDate 2019-03-28 00:00:00


In [None]:
# Preview only the first rows; rendering the whole frame (over a thousand
# rows and 1615 columns) bloats the notebook without adding information.
train_s.head()

In [38]:
# Persist the split in which only the engineered features are scaled,
# without the index column.
train_s.to_csv(path_or_buf='train_SEFnNSB_bert_tnfi.csv', index=False)
test_s.to_csv(path_or_buf='test_SEFnNSB_bert_tnfi.csv', index=False)

In [39]:
# --- end: train/test split for the variant with only the engineered features scaled ---

In [40]:
# --- start: train/test split for the non-scaled dataset ---

In [41]:
# Restrict the non-scaled data to the comparison window and split it
# chronologically. The split must be taken from `resultNonScaledData`:
# `result` holds a scaled variant at this point, so splitting `result`
# would write scaled values into the "non-scaled" files.
# NOTE(review): in this frame the label column is still named 'Target'
# (the scaled variants use lowercase 'target'); check what downstream
# readers of the non-scaled CSVs expect.
resultNonScaledData = createComparableDataset(resultNonScaledData)
train_ns, test_ns = createTrainingAndTestingSet(resultNonScaledData)

In [42]:
# Sanity check: shape and date range of the training and testing frames.
for _frame in (train_ns, test_ns):
    getStartEndDateShape(_frame)

---------------------------
shape (1336, 1615)
startDate 2015-05-04 00:00:00
endDate 2018-12-31 00:00:00
---------------------------
shape (87, 1615)
startDate 2019-01-01 00:00:00
endDate 2019-03-28 00:00:00


In [45]:
# Persist the train/test frames of the non-scaled variant, without the index column.
train_ns.to_csv(path_or_buf='train_nS_bert_tnfi.csv', index=False)
test_ns.to_csv(path_or_buf='test_nS_bert_tnfi.csv', index=False)

In [44]:
# --- end: train/test split for the non-scaled dataset ---