In [7]:
# NOTE(review): hard-coded, machine-specific working directory. Relative paths used
# later in the notebook (e.g. 'logging.conf') are resolved against it.
cd /home/prachi/Wikiled.Sentiment.Service

C:\Users\aman\Desktop\New folder\BtechProjSmallRes\src


In [9]:
import logging.config
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from os import path
import quandl
from keras.layers import LSTM, Dropout, Dense, Activation
from keras import Sequential, callbacks
from keras.layers import Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from DataLoader import DataLoader
from MarketData import QuandlMarketDataSource, RedditMarketDataSource, BloombergMarketDataSource

from learning.BasicLearning import RbfClassifier
from utilities import Constants
from utilities.Utilities import Utilities

# Configure logging from the project's logging.conf, leaving loggers that already
# exist enabled.
logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
# Read the Quandl key from the environment so a real credential never has to be
# written into the notebook; the original placeholder remains the fallback.
import os
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '__YOUR_KEY__')


def build_model(inputs, model_type):
    """Assemble and compile a two-class Keras sequence classifier.

    :param inputs: array whose ``shape[1]`` and ``shape[2]`` size the first
        recurrent layer (no other attribute of it is read).
    :param model_type: ``'Basic_LSTM'``, ``'Conv'`` or ``'LSTM'``.
    :return: compiled ``Sequential`` model ending in a 2-unit softmax, using
        categorical cross-entropy, RMSprop and an accuracy metric.
    :raises ValueError: for any other ``model_type``.
    """
    network = Sequential()
    if model_type not in ('Basic_LSTM', 'Conv', 'LSTM'):
        raise ValueError(model_type)

    sequence_shape = (inputs.shape[1], inputs.shape[2])
    if model_type == 'Basic_LSTM':
        # Single recurrent layer followed by heavy dropout.
        body = [
            LSTM(400, input_shape=sequence_shape, return_sequences=False),
            Dropout(0.5),
        ]
    elif model_type == 'Conv':
        # LSTM -> 1-D convolution + pooling -> LSTM.
        body = [
            LSTM(200, input_shape=sequence_shape, return_sequences=True),
            Dropout(0.2),
            Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
            MaxPooling1D(pool_size=2),
            LSTM(100, return_sequences=False),
            Dropout(0.2),
        ]
    else:
        # 'LSTM': two stacked recurrent layers.
        body = [
            LSTM(400, input_shape=sequence_shape, return_sequences=True),
            Dropout(0.4),
            LSTM(400, return_sequences=False),
            Dropout(0.3),
        ]

    # Shared classification head.
    for layer in body + [Dense(2), Activation('softmax')]:
        network.add(layer)

    network.compile(loss="categorical_crossentropy", optimizer=RMSprop(), metrics=["accuracy"])
    return network


def get_data(full_articles, sentiment_location, price_source, stock):
    """Load features and labels for ``stock`` and standardise the features.

    :param full_articles: forwarded to ``DataLoader.load_data`` as ``full_articles``.
    :param sentiment_location: forwarded as ``sentiment_location``.
    :param price_source: ``'quandl'`` or ``'reddit'``; any other value selects
        the Bloomberg source.
    :param stock: first positional argument of ``load_data``.
    :return: ``(x_data, y_data)`` with ``x_data`` passed through a
        ``StandardScaler`` fitted on that same data.
    """
    loader = DataLoader()
    source = (
        QuandlMarketDataSource() if price_source == 'quandl'
        else RedditMarketDataSource() if price_source == 'reddit'
        else BloombergMarketDataSource()
    )
    features, labels = loader.load_data(
        stock, 5,
        source=source,
        sentiment_location=sentiment_location,
        full_articles=full_articles,
        from_date='2011-04-01', to_date='2015-04-01',
    )
    # fit() returns the scaler itself, so fitting and transforming can be chained.
    features = preprocessing.StandardScaler().fit(features).transform(features)
    return features, labels


def lstm_prediction(model_type, x_train, x_test, y_train):
    """Train a Keras sequence model and predict the test split.

    Labels go through ``Utilities.make_dual(y_train, 2)`` and both feature
    matrices are reshaped to ``(rows, columns, 1)`` before the network returned
    by ``build_model`` is fitted.

    :param model_type: architecture name forwarded to ``build_model``.
    :return: ``(y_result, y_result_prob)`` where ``y_result_prob`` is the raw
        ``predict`` output and ``y_result`` is that output passed through
        ``Utilities.make_single_dimension``.
    """
    def as_sequences(matrix):
        # Append a trailing axis of length one: (rows, columns) -> (rows, columns, 1).
        return np.reshape(matrix, (matrix.shape[0], matrix.shape[1], 1))

    dual_labels = Utilities.make_dual(y_train, 2)
    train_sequences = as_sequences(x_train)
    test_sequences = as_sequences(x_test)

    network = build_model(train_sequences, model_type)
    network.summary()

    # Early stopping watches the loss on the 25% validation split.
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=25)
    network.fit(train_sequences, dual_labels, batch_size=1000, callbacks=[early_stop],
                epochs=50, validation_split=0.25, shuffle=True)

    probabilities = network.predict(test_sequences)
    return Utilities.make_single_dimension(probabilities), probabilities


def svm_prediction(x_train, x_test, y_train):
    """Fit the RBF classifier pipeline, predict the test split and dump the splits.

    :param x_train: training features passed to ``Pipeline.fit``.
    :param x_test: test features passed to ``predict`` / ``predict_proba``.
    :param y_train: training labels.
    :return: ``(y_result, y_result_prob)`` from ``predict`` and ``predict_proba``.

    Side effect: writes ``x_train``, ``x_test`` and ``y_train`` as ``.npy`` files
    into the current working directory.
    """
    pipeline = Pipeline([
        ['clf', RbfClassifier()]])
    pipeline.fit(x_train, y_train)
    y_result = pipeline.predict(x_test)
    y_result_prob = pipeline.predict_proba(x_test)

    # Plain relative names resolve to the working directory on every OS; the
    # previous r".\name.npy" form only means "current directory" on Windows and
    # yields a file literally named ".\name.npy" elsewhere.
    np.save("x_train_classifiers18.npy", x_train)
    np.save("x_test_classifiers18.npy", x_test)
    np.save("y_train_classifiers18.npy", y_train)

    return y_result, y_result_prob


def processing(price_source, stock, load_articles, full_articles, processing_type='SVM'):
    """Load one stock's data, fit the requested model and report test metrics.

    :param price_source: forwarded to ``get_data`` to choose the market data source.
    :param stock: ticker forwarded to ``get_data``.
    :param load_articles: sentiment file location forwarded to ``get_data``
        (the caller passes ``None`` for the technical-analysis-only run).
    :param full_articles: forwarded to ``get_data``.
    :param processing_type: ``'SVM'`` uses ``svm_prediction``; any other value is
        forwarded to ``lstm_prediction`` as the model type.

    Side effect: for ``'SVM'`` the test labels are saved to
    ``y_test_classifiers18.npy`` in the current working directory.
    """
    x_data, y_data = get_data(full_articles, load_articles, price_source, stock)
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, random_state=42)

    if processing_type == 'SVM':
        y_result, y_result_prob = svm_prediction(x_train, x_test, y_train)
        # Plain relative name instead of the Windows-only r".\..." form so the
        # file lands in the working directory on every OS.
        np.save("y_test_classifiers18.npy", y_test)
    else:
        y_result, y_result_prob = lstm_prediction(processing_type, x_train, x_test, y_train)

    Utilities.measure_performance(y_test, y_result)
    # NOTE(review): the predicted labels are passed twice and y_result_prob is
    # never used; if the third argument is meant to be class probabilities the
    # AUC is being computed from hard labels — confirm against Utilities.
    Utilities.measure_performance_auc(y_test, y_result, y_result)


if __name__ == '__main__':
    item = 'HPQ'
    price_source = 'quandl'
    processing_type = 'SVM'

    # The former 'Twitter/psenti.csv' assignment was overwritten before it was
    # ever passed to processing(), so it has been dropped.
    all_sentiment = path.join(Constants.DATASETS_MARKET, 'FinArticles/psenti/all.results.csv')
    reddit_sentiment = path.join(Constants.DATASETS_MARKET, 'FinArticles/psenti/reddit.results.csv')

    # (sentiment file, full_articles) for each experiment, in execution order.
    runs = [
        (None, False),              # technical analysis
        (all_sentiment, False),     # technical analysis + sentiment (all articles)
        (reddit_sentiment, False),  # technical analysis + sentiment (reddit)
        (reddit_sentiment, True),   # technical analysis + sentiment + mood
    ]
    for sentiment_location, full_articles in runs:
        processing(price_source, item, sentiment_location, full_articles, processing_type)

2020-10-31 14:29:59,535 - utilities - INFO - Found 0 type with 366 records
2020-10-31 14:29:59,539 - utilities - INFO - Found 1 type with 384 records
2020-10-31 14:29:59,540 - learning - INFO - Searching classifier best parameters




2020-10-31 14:30:00,560 - learning - INFO - Best parameters found:
2020-10-31 14:30:00,563 - learning - INFO - 0.676
2020-10-31 14:30:00,563 - learning - INFO - {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
2020-10-31 14:30:00,565 - learning - INFO - Creating calibrated...
2020-10-31 14:30:00,567 - learning - INFO - Training...




2020-10-31 14:30:01,044 - utilities - INFO - 
              precision    recall  f1-score   support

           0      0.682     0.608     0.643       120
           1      0.671     0.738     0.703       130

    accuracy                          0.676       250
   macro avg      0.677     0.673     0.673       250
weighted avg      0.677     0.676     0.674       250

2020-10-31 14:30:01,048 - utilities - INFO - Macro F1 0.673
2020-10-31 14:30:01,052 - utilities - INFO - Micro F1 0.676
2020-10-31 14:30:17,690 - utilities - INFO - Found 0 type with 366 records
2020-10-31 14:30:17,694 - utilities - INFO - Found 1 type with 384 records
2020-10-31 14:30:17,695 - learning - INFO - Searching classifier best parameters




2020-10-31 14:30:18,851 - learning - INFO - Best parameters found:
2020-10-31 14:30:18,853 - learning - INFO - 0.688
2020-10-31 14:30:18,854 - learning - INFO - {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
2020-10-31 14:30:18,857 - learning - INFO - Creating calibrated...
2020-10-31 14:30:18,858 - learning - INFO - Training...




2020-10-31 14:30:19,198 - utilities - INFO - 
              precision    recall  f1-score   support

           0      0.675     0.642     0.658       120
           1      0.684     0.715     0.699       130

    accuracy                          0.680       250
   macro avg      0.680     0.679     0.679       250
weighted avg      0.680     0.680     0.680       250

2020-10-31 14:30:19,202 - utilities - INFO - Macro F1 0.679
2020-10-31 14:30:19,205 - utilities - INFO - Micro F1 0.680
2020-10-31 14:30:36,264 - utilities - INFO - Found 0 type with 366 records
2020-10-31 14:30:36,268 - utilities - INFO - Found 1 type with 384 records
2020-10-31 14:30:36,269 - learning - INFO - Searching classifier best parameters




2020-10-31 14:30:37,323 - learning - INFO - Best parameters found:
2020-10-31 14:30:37,324 - learning - INFO - 0.6293333333333333
2020-10-31 14:30:37,325 - learning - INFO - {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
2020-10-31 14:30:37,327 - learning - INFO - Creating calibrated...
2020-10-31 14:30:37,328 - learning - INFO - Training...




2020-10-31 14:30:37,752 - utilities - INFO - 
              precision    recall  f1-score   support

           0      0.649     0.525     0.581       120
           1      0.627     0.738     0.678       130

    accuracy                          0.636       250
   macro avg      0.638     0.632     0.630       250
weighted avg      0.638     0.636     0.632       250

2020-10-31 14:30:37,755 - utilities - INFO - Macro F1 0.630
2020-10-31 14:30:37,757 - utilities - INFO - Micro F1 0.636
2020-10-31 14:30:54,462 - utilities - INFO - Found 0 type with 366 records
2020-10-31 14:30:54,465 - utilities - INFO - Found 1 type with 384 records
2020-10-31 14:30:54,466 - learning - INFO - Searching classifier best parameters




2020-10-31 14:30:55,646 - learning - INFO - Best parameters found:
2020-10-31 14:30:55,647 - learning - INFO - 0.5933333333333334
2020-10-31 14:30:55,649 - learning - INFO - {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
2020-10-31 14:30:55,651 - learning - INFO - Creating calibrated...
2020-10-31 14:30:55,653 - learning - INFO - Training...




2020-10-31 14:30:56,159 - utilities - INFO - 
              precision    recall  f1-score   support

           0      0.577     0.533     0.554       120
           1      0.597     0.638     0.617       130

    accuracy                          0.588       250
   macro avg      0.587     0.586     0.586       250
weighted avg      0.587     0.588     0.587       250

2020-10-31 14:30:56,161 - utilities - INFO - Macro F1 0.586
2020-10-31 14:30:56,164 - utilities - INFO - Micro F1 0.588


In [5]:
#Trying on LSTM

# Module-level result accumulators. processing() appends one entry to each list per
# evaluated configuration, taken from the tuple returned by
# Utilities.measure_performance_auc (unpacked there as accuracy, auc score, f1 score).
# Re-running this cell resets them.
acc = []
f1 = []
auc = []
import logging.config
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from os import path
import quandl
from keras.layers import LSTM, Dropout, Dense, Activation
from keras import Sequential, callbacks
from keras.layers import Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from DataLoader import DataLoader
from MarketData import QuandlMarketDataSource, RedditMarketDataSource, BloombergMarketDataSource
import tensorflow as tf
from learning.BasicLearning import RbfClassifier
from utilities import Constants
from utilities.Utilities import Utilities
# NOTE(review): build_model compiles with the string 'adam', so this optimizer
# instance is not referenced anywhere in the visible code.
optimizer = tf.keras.optimizers.Adam()
# Configure logging from the project's logging.conf, leaving loggers that already
# exist enabled.
logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
# Read the Quandl key from the environment so a real credential never has to be
# written into the notebook; the original placeholder remains the fallback.
import os
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY', '__YOUR_KEY__')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def data_for_lstm(look_back, dataset):
    """Stack overlapping windows of ``look_back`` consecutive rows.

    Window ``i`` holds rows ``i .. i + look_back - 1`` of ``dataset``.

    :param look_back: number of consecutive rows per window; must be >= 1.
    :param dataset: array-like indexed by row along its first axis.
    :return: array of shape ``(n_rows - look_back + 1, look_back, *row_shape)``.
        When ``dataset`` has fewer than ``look_back`` rows the result holds zero
        windows but keeps that dimensionality, so downstream shape lookups
        still work.
    :raises ValueError: if ``look_back`` is smaller than 1.
    """
    if look_back < 1:
        raise ValueError("look_back must be >= 1, got {!r}".format(look_back))

    rows = np.asarray(dataset)
    n_windows = len(rows) - look_back + 1
    if n_windows <= 0:
        # Not enough rows for a single window: return an empty, correctly shaped array.
        return np.empty((0, look_back) + rows.shape[1:], dtype=rows.dtype)

    return np.stack([rows[start:start + look_back] for start in range(n_windows)])

def model_select(model_type):
    """Return a fresh, unfitted estimator for the given model name.

    :param model_type: one of ``"RandomForestClassifier"``, ``"LogisticRegression"``,
        ``"ExtraTreesClassifier"``, ``"BaggingClassifier"``,
        ``"DecisionTreeClassifier"``, ``"AdaBoostClassifier"`` or ``'SVM'``.
    :return: a new estimator built with default arguments; ``'SVM'`` yields a
        one-step ``Pipeline`` wrapping ``RbfClassifier``.
    :raises ValueError: if ``model_type`` is not one of the names above.
    """
    # Factories are lazy so only the requested estimator is ever constructed.
    factories = {
        "RandomForestClassifier": lambda: RandomForestClassifier(),
        "LogisticRegression": lambda: LogisticRegression(),
        "ExtraTreesClassifier": lambda: ExtraTreesClassifier(),
        "BaggingClassifier": lambda: BaggingClassifier(),
        "DecisionTreeClassifier": lambda: DecisionTreeClassifier(),
        "AdaBoostClassifier": lambda: AdaBoostClassifier(),
        'SVM': lambda: Pipeline([['clf', RbfClassifier()]]),
    }
    try:
        make_estimator = factories.get(model_type)
    except TypeError:
        # An unhashable selector cannot match any name.
        make_estimator = None

    if make_estimator is None:
        raise ValueError(model_type)
    return make_estimator()


def build_model(inputs, model_type):
    """Build and compile a binary Keras sequence classifier.

    :param inputs: array whose ``shape[1]`` and ``shape[2]`` size the first
        recurrent layer (no other attribute of it is read).
    :param model_type: ``'Basic_LSTM'``, ``'Conv'`` or ``'LSTM'``.
    :return: compiled ``Sequential`` model with a single sigmoid output unit,
        binary cross-entropy loss, the Adam optimizer and an accuracy metric.
    :raises ValueError: for any other ``model_type``.
    """
    model = Sequential()
    if model_type == 'Basic_LSTM':
        model.add(LSTM(4,  input_shape=(inputs.shape[1], inputs.shape[2])))
    elif model_type == 'Conv':
        model.add(LSTM(200, input_shape=(inputs.shape[1], inputs.shape[2]), return_sequences=True, activation = 'relu'))
        model.add(Dropout(0.2))
        model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(100,
                       return_sequences=False))
        model.add(Dropout(0.2))

    elif model_type == 'LSTM':
        model.add(LSTM(400, input_shape=(inputs.shape[1], inputs.shape[2]), return_sequences=True))
        model.add(Dropout(0.4))
        model.add(LSTM(400, return_sequences=False))
        model.add(Dropout(0.3))
    else:
        raise ValueError(model_type)

    # binary_crossentropy (with its default from_logits=False) expects values in
    # [0, 1]. The head previously had no activation, so unbounded linear outputs
    # were fed to the loss; a sigmoid makes the output a probability.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return model


def get_data(full_articles, sentiment_location, price_source, stock,
             from_date='2011-04-01', to_date='2015-04-01'):
    """Load features and labels for ``stock`` and standardise the features.

    :param full_articles: forwarded to ``DataLoader.load_data`` as ``full_articles``.
    :param sentiment_location: forwarded as ``sentiment_location``.
    :param price_source: ``'quandl'`` or ``'reddit'``; any other value selects
        the Bloomberg source.
    :param stock: first positional argument of ``load_data``.
    :param from_date: start of the date range, forwarded as ``from_date``.
        Defaults to the previously hard-coded ``'2011-04-01'``.
    :param to_date: end of the date range, forwarded as ``to_date``.
        Defaults to the previously hard-coded ``'2015-04-01'``.
    :return: ``(x_data, y_data)`` with ``x_data`` standardised.
    """
    loader = DataLoader()
    if price_source == 'quandl':
        source = QuandlMarketDataSource()
    elif price_source == 'reddit':
        source = RedditMarketDataSource()
    else:
        source = BloombergMarketDataSource()

    # NOTE(review): the meaning of the positional 5 is defined by
    # DataLoader.load_data and is not visible here — confirm before changing it.
    x_data, y_data = loader.load_data(stock, 5,
                                      source=source,
                                      sentiment_location=sentiment_location,
                                      full_articles=full_articles,
                                      from_date=from_date, to_date=to_date)

    # NOTE(review): the scaler is fitted on the complete data set, and callers
    # split into train/test afterwards, so test-set statistics influence the
    # training features.
    x_data = preprocessing.StandardScaler().fit_transform(x_data)
    return x_data, y_data


def lstm_prediction(model_type, x_train, x_test, y_train, look_back):
    """Fit a Keras sequence model on pre-windowed data and predict the test windows.

    :param model_type: architecture name forwarded to ``build_model``.
    :param x_train: training array; ``build_model`` reads its ``shape[1]`` and
        ``shape[2]``.
    :param x_test: test array passed to ``predict``.
    :param y_train: training labels, passed to ``fit`` unchanged.
    :param look_back: not used by this function; kept so existing callers need
        no change.
    :return: ``(y_result, y_result_prob)`` where ``y_result_prob`` is the raw
        ``predict`` output and ``y_result`` is that output passed through
        ``Utilities.make_single_dimension``.
    """
    market_model = build_model(x_train, model_type)

    # Early stopping watches the loss on the 25% validation split.
    cbks = [callbacks.EarlyStopping(monitor='val_loss', patience=25)]
    market_model.fit(x_train, y_train, epochs=50, validation_split=0.25, shuffle=True, verbose = 0, callbacks = cbks)

    y_result_prob = market_model.predict(x_test)
    # NOTE(review): build_model ends in a single output unit, so y_result_prob
    # presumably has one column — confirm Utilities.make_single_dimension
    # produces meaningful labels for that shape.
    y_result = Utilities.make_single_dimension(y_result_prob)
    return y_result, y_result_prob


def model_prediction(x_train, x_test, y_train, pipeline):
    """Fit ``pipeline`` on the training split and score the test split.

    :param x_train: training features passed to ``pipeline.fit``.
    :param x_test: test features passed to ``predict`` and ``predict_proba``.
    :param y_train: training labels passed to ``pipeline.fit``.
    :param pipeline: object exposing ``fit``, ``predict`` and ``predict_proba``.
    :return: ``(predicted labels, predicted probabilities)`` for ``x_test``.
    """
    pipeline.fit(x_train, y_train)
    # Tuple elements are evaluated left to right: labels first, then probabilities.
    return pipeline.predict(x_test), pipeline.predict_proba(x_test)


def _record_scores(y_true, y_pred):
    """Score one run and append the results to the module-level acc/f1/auc lists."""
    # NOTE(review): the predicted labels are passed twice; if the third argument
    # is meant to be class probabilities the AUC is computed from hard labels —
    # confirm against Utilities.measure_performance_auc.
    accuracy1, auc_score1, f1_score1 = Utilities.measure_performance_auc(y_true, y_pred, y_pred)
    acc.append(accuracy1)
    f1.append(f1_score1)
    auc.append(auc_score1)


def processing(price_source, stock, load_articles, full_articles, processing_type='SVM', look_back = 5):
    """Train and score one model configuration, appending metrics to acc/f1/auc.

    :param price_source: forwarded to ``get_data`` to choose the market data source.
    :param stock: ticker forwarded to ``get_data``.
    :param load_articles: sentiment file location forwarded to ``get_data``
        (the caller passes ``None`` for the technical-analysis-only run).
    :param full_articles: forwarded to ``get_data``.
    :param processing_type: ``'Basic_LSTM'``, ``'Conv'`` or ``'LSTM'`` selects the
        Keras path; any other value is resolved by ``model_select``.
    :param look_back: kept for backward compatibility and not used. As before,
        the sequential path always evaluates window lengths 2, 3 and 5; the
        loop no longer overwrites this parameter.
    :raises ValueError: (from ``model_select``) for an unknown ``processing_type``.
    """
    x_data, y_data = get_data(full_articles, load_articles, price_source, stock)

    # The sequential names are the ones build_model accepts. Testing against
    # them locally removes the dependency on a global list that only exists
    # once the __main__ block has run.
    if processing_type not in ('Basic_LSTM', 'Conv', 'LSTM'):
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, random_state=42)
        pipeline = model_select(processing_type)
        y_result, y_result_prob = model_prediction(x_train, x_test, y_train, pipeline)
        _record_scores(y_test, y_result)
        return

    for window in (2, 3, 5):
        # Build the windows BEFORE splitting. train_test_split shuffles rows, so
        # windowing its output (as was done previously) glued unrelated rows
        # together into each sequence.
        # Assumes get_data returns rows in chronological order — TODO confirm.
        x_windows = data_for_lstm(window, x_data)
        # Each window is labelled with the target of its last row.
        y_windows = y_data[window - 1:]
        x_train, x_test, y_train, y_test = train_test_split(x_windows, y_windows, test_size=0.25, random_state=42)

        print("For lookback", window)
        y_result, y_result_prob = lstm_prediction(processing_type, x_train, x_test, y_train, window)
        _record_scores(y_test, y_result)

if __name__ == '__main__':
    items = ['JPM', 'HPQ', 'GOOG', 'AAPL']
    price_source = 'quandl'
    non_sequential_models = ["RandomForestClassifier",'SVM', 'LogisticRegression', 'ExtraTreesClassifier','BaggingClassifier','DecisionTreeClassifier','AdaBoostClassifier' ]
    sequential_models = ['Basic_LSTM','Conv',"LSTM"]
    # Same contents and order as the former hand-written list.
    processing_types = non_sequential_models + sequential_models

    for item in items:
        # Built with os.path.join (as in the earlier cell) instead of assigning a
        # string to `path`, which used to rebind the imported os.path module.
        sentiment_location = path.join(Constants.DATASETS_MARKET, 'Twitter', 'results' + item + '.csv')

        # (sentiment file, full_articles) for each experiment, in execution order.
        runs = [
            (None, False),                # technical analysis
            (sentiment_location, False),  # technical analysis + sentiment
            (sentiment_location, True),   # technical analysis + sentiment + mood
        ]
        for processing_type in processing_types:
            for run_sentiment, run_full_articles in runs:
                processing(price_source, item, run_sentiment, run_full_articles, processing_type)

        # Progress marker: cumulative number of recorded results after this ticker.
        print(item, len(acc))

2020-10-31 14:15:00,820 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.




2020-10-31 14:15:39,647 - utilities - INFO - Found 0 type with 338 records
2020-10-31 14:15:39,672 - utilities - INFO - Found 1 type with 412 records
2020-10-31 14:15:39,674 - learning - INFO - Searching classifier best parameters




2020-10-31 14:15:46,577 - learning - INFO - Best parameters found:
2020-10-31 14:15:46,579 - learning - INFO - 0.6666666666666666
2020-10-31 14:15:46,581 - learning - INFO - {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
2020-10-31 14:15:46,582 - learning - INFO - Creating calibrated...
2020-10-31 14:15:46,890 - learning - INFO - Training...




2020-10-31 14:15:58,040 - utilities - INFO - Found 0 type with 338 records
2020-10-31 14:15:58,044 - utilities - INFO - Found 1 type with 412 records
2020-10-31 14:15:58,045 - learning - INFO - Searching classifier best parameters




2020-10-31 14:15:59,042 - learning - INFO - Best parameters found:
2020-10-31 14:15:59,045 - learning - INFO - 0.668
2020-10-31 14:15:59,047 - learning - INFO - {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
2020-10-31 14:15:59,050 - learning - INFO - Creating calibrated...
2020-10-31 14:15:59,053 - learning - INFO - Training...




2020-10-31 14:16:09,679 - utilities - INFO - Found 0 type with 338 records
2020-10-31 14:16:09,684 - utilities - INFO - Found 1 type with 412 records
2020-10-31 14:16:09,684 - learning - INFO - Searching classifier best parameters




2020-10-31 14:16:10,917 - learning - INFO - Best parameters found:
2020-10-31 14:16:10,920 - learning - INFO - 0.6693333333333333
2020-10-31 14:16:10,921 - learning - INFO - {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
2020-10-31 14:16:10,923 - learning - INFO - Creating calibrated...
2020-10-31 14:16:10,926 - learning - INFO - Training...




For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5
For lookback 2
For lookback 3
For lookback 5


KeyboardInterrupt: 