<a href="https://colab.research.google.com/github/rawatpremsingh999/Financial-Sentiment-Analysis/blob/main/Stocktwits_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: prompts for a file upload. The result is indexed by filename
# further down, where the uploaded CSV's bytes are handed to pandas.
from google.colab import files

uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np

In [None]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score,accuracy_score

In [None]:
import io
# Wrap the uploaded bytes in a file-like object so pandas can parse them.
# The key must match the uploaded filename exactly, otherwise this raises KeyError.
df2 = pd.read_csv(io.BytesIO(uploaded['Stocktwits__Cleaned.csv']))

In [None]:
# Peek at the first rows.
df2.head()

In [None]:
df2.shape

In [None]:
# Messages as a single-column 2-D array (reshape(-1,1)); this is what the resampler below is fed.
X = np.array(df2['cleaned_message'].tolist()).reshape(-1,1)

In [None]:
# Sentiment labels as a flat array, row-aligned with X.
y = np.array(df2['sentiment'].tolist())

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Random over-sampler with a fixed seed so the resampled set is repeatable.
ros = RandomOverSampler(random_state=777)

In [None]:
# NOTE(review): resampling happens before the train/test split further down, so
# copies of the same message can end up on both sides of the split — confirm this is intended.
X,y = ros.fit_resample(X,y)

In [None]:
# Only the last expression of a cell is displayed, so this shows type(y) only.
type(X)
type(y)

In [None]:
from collections import Counter
# Class counts after resampling.
Counter(y)

In [None]:
# Put the resampled messages and labels back side by side as two columns.
numpy_data = np.concatenate((X,y.reshape(-1,1)),axis=1)

In [None]:
# Rebuild the working frame from the resampled data (replaces the frame read from the CSV).
df2 = pd.DataFrame(data=numpy_data, columns=["cleaned_message", "sentiment"])

In [None]:
df2.sentiment.unique()

In [None]:
# Rows that are exact repeats of an earlier row.
duplicate = df2[df2.duplicated()] 

duplicate.shape

In [None]:
#df2.drop_duplicates(inplace=True)

#df2.shape

In [None]:
# One-hot encode the sentiment column into two parallel 0/1 lists
# (`bull` is 1 for 'Bullish' rows, `bear` is 1 for 'Bearish' rows).
# Any other label would leave a row without a class, so fail loudly up front
# instead of producing lists that no longer line up with the frame.
unexpected_labels = set(df2.sentiment.unique()) - {'Bullish', 'Bearish'}
if unexpected_labels:
    raise ValueError(
        "Unexpected sentiment labels: %s" % sorted(map(str, unexpected_labels))
    )

bull = (df2.sentiment == 'Bullish').astype(int).tolist()
bear = (df2.sentiment == 'Bearish').astype(int).tolist()

In [None]:
# Attach the one-hot label lists as columns.
df2['Bullish']= bull
df2['Bearish']= bear

In [None]:
df2.shape

In [None]:
import nltk
# Fetch the 'punkt' resource (presumably the tokenizer data word_tokenize relies on below).
nltk.download('punkt')

In [None]:
# Fetch the stop-word corpus read via nltk.corpus.stopwords below.
nltk.download('stopwords')

In [None]:
from nltk import word_tokenize, WordNetLemmatizer
# Tokenize every message: `tokens` is a list of token lists, one per row of df2.
tokens = [word_tokenize(sen) for sen in df2.cleaned_message]

In [None]:
from nltk.corpus import stopwords
# Keep the English stop words in a set: the stop-word filter does one membership
# test per token, which is a constant-time lookup on a set instead of a linear
# scan of a list.
stoplist = set(stopwords.words('english'))

In [None]:
def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level ``stoplist``.

    Args:
        tokens: Iterable of token strings for one message.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in tokens:
        if token in stoplist:
            continue
        kept.append(token)
    return kept

In [None]:
# Drop stop words from every tokenized message.
filtered_words = [remove_stop_words(sen) for sen in tokens]

In [None]:
# Re-join the surviving tokens into space-separated strings.
result = [' '.join(sen) for sen in filtered_words]

In [None]:
# Stop-word-free text, used later as tokenizer input.
df2['Text_Final'] = result

In [None]:
# Matching token lists, used later for vocabulary stats and averaged embeddings.
df2['tokens'] = filtered_words

In [None]:
df2.head()

In [None]:
# Keep only the columns used from here on (drops the original cleaned_message).
df2 = df2[['Text_Final', 'tokens', 'sentiment', 'Bullish', 'Bearish']]

In [None]:
df2.head()

### Split data into train and test

In [None]:
# 80/20 train/test split with a fixed seed.
data_train, data_test = train_test_split(df2, test_size=0.20, random_state=42)

In [None]:
data_train.shape,data_test.shape

In [None]:
# Moves the existing index into an 'index' column (dropped again a few cells below).
data_test.reset_index(inplace=True)

In [None]:
# Test rows that repeat an earlier (text, sentiment) pair.
duplicate = data_test[data_test.duplicated(subset=['Text_Final','sentiment'])] 

duplicate.shape

In [None]:
# Keep one copy of each (text, sentiment) pair in the test set.
# NOTE(review): this only removes repeats inside data_test; nothing here checks data_test against data_train.
data_test.drop_duplicates(subset=['Text_Final','sentiment'],inplace=True)
data_test.shape

In [None]:
# Discard the helper column created by reset_index above.
data_test.drop(columns=['index'],inplace=True)
data_test.head()

In [None]:
data_train.head()

In [None]:
# Corpus statistics for the training split: total token count, vocabulary size
# and the length of the longest message.
all_training_words = []
training_sentence_lengths = []
for sentence_tokens in data_train["tokens"]:
    all_training_words.extend(sentence_tokens)
    training_sentence_lengths.append(len(sentence_tokens))

TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

**Google News Word2Vec**

In [None]:
# Download the Google News word2vec archive into /root/input/ (-c continues a partial download).
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
# Load the binary word2vec file through gensim's KeyedVectors loader.
word2vec = models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one message.

    Args:
        tokens_list: Sequence of tokens for a single message.
        vector: Mapping-like embedding lookup supporting ``word in vector``
            and ``vector[word]``.
        generate_missing: If True, a token absent from ``vector`` contributes a
            fresh ``np.random.rand(k)`` vector; otherwise it contributes zeros.
        k: Dimensionality used for the fallback vectors and the empty result.

    Returns:
        The element-wise mean of the per-token vectors, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    per_token = []
    for word in tokens_list:
        if word in vector:
            per_token.append(vector[word])
        elif generate_missing:
            per_token.append(np.random.rand(k))
        else:
            per_token.append(np.zeros(k))

    total = np.sum(per_token, axis=0)
    return np.divide(total, len(per_token))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of a tokenized frame.

    Args:
        vectors: Embedding lookup forwarded to ``get_average_word2vec``.
        clean_comments: Frame with a ``'tokens'`` column holding one token
            sequence per row.
        generate_missing: Forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    return [
        get_average_word2vec(message_tokens, vectors, generate_missing=generate_missing)
        for message_tokens in clean_comments['tokens']
    ]

In [None]:
# One averaged word2vec vector per training message; tokens missing from word2vec get random vectors.
# NOTE(review): `training_embeddings` is not referenced again in the visible cells.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [None]:
# Length that every token-id sequence is padded to (passed as maxlen to pad_sequences).
MAX_SEQUENCE_LENGTH = 70
# Width of each embedding vector (columns of the embedding matrix).
EMBEDDING_DIM = 300

In [None]:
# Build the word -> integer-id vocabulary from the training text only.
# No `num_words` cap is set: the embedding matrix has one row per entry of
# `word_index`, so every id must survive texts_to_sequences. A cap makes Keras
# keep only ids below `num_words`, silently dropping the rarest words whenever
# the tokenizer's own vocabulary is at least that large.
tokenizer = Tokenizer(lower=True, char_level=False)
training_texts = data_train["Text_Final"].tolist()
tokenizer.fit_on_texts(training_texts)
training_sequences = tokenizer.texts_to_sequences(training_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

In [None]:
# Bring every training id sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Row i holds the vector for the word whose tokenizer id is i. The extra row
# (the +1) leaves row 0 all-zero, since tokenizer ids start at 1.
# Words missing from word2vec get a random vector from a dedicated, seeded
# generator so the matrix is identical on every run.
oov_rng = np.random.RandomState(42)
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else oov_rng.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

In [None]:
# Encode the test text with the tokenizer fitted on the training split, then pad to the same length.
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build and compile a multi-kernel 1-D CNN text classifier.

    Token ids are embedded with a frozen, pre-initialised embedding matrix,
    passed through three parallel Conv1D + global-max-pool branches (kernel
    sizes 3, 5 and 7), concatenated, and classified by a small dense head.

    Args:
        embeddings: Initial embedding matrix, handed to the Embedding layer as
            its weights (so it should have ``num_words`` rows and
            ``embedding_dim`` columns).
        max_sequence_length: Length of each padded input sequence.
        num_words: Size of the Embedding layer's input vocabulary.
        embedding_dim: Width of each embedding vector.
        labels_index: Number of output classes (units of the softmax layer).

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    # Single frozen embedding layer initialised from the pre-trained vectors.
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # One convolution branch per kernel size; each is reduced to a fixed-size
    # vector by taking the max over the sequence axis.
    convs = []
    filter_sizes = [3, 5, 7]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='softmax')(x)

    model = Model(sequence_input, preds)
    # A softmax over mutually exclusive one-hot classes pairs with categorical
    # cross-entropy. For two classes this matches the binary form numerically,
    # and it stays correct if labels_index grows beyond two.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    return model

In [None]:
# Target column order: position 0 is Bullish, position 1 is Bearish.
label_names = ['Bullish', 'Bearish']

In [None]:
# One-hot targets with columns in label_names order.
y_train = data_train[label_names].values

In [None]:
# Aliases used by the fit call below.
x_train = train_cnn_data
y_tr = y_train

In [None]:
y_tr.shape

In [None]:
# num_words is len(train_word_index)+1, matching the row count of train_embedding_weights.
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

In [None]:
import tensorflow as tf
# Write a diagram of the model (with tensor shapes) to the named PNG file.
tf.keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

In [None]:
# Training hyperparameters.
num_epochs = 25
batch_size = 100

In [None]:
# Train the model; 10% of the training data is held out for validation.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

In [None]:
hist

In [None]:
# Model outputs for the test set, one row per test message.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

In [None]:
# Maps an output position to the 0/1 label used for evaluation:
# position 0 (Bullish in label_names) -> 1, position 1 (Bearish) -> 0.
labels = [1, 0]

In [None]:
predictions

In [None]:
# Turn each output row into a hard 0/1 label: the position of the largest
# value in the row is looked up in `labels`.
prediction_labels = [labels[np.argmax(p)] for p in predictions]

In [None]:
# Ground-truth 0/1 labels for the test set: 1 for 'Bullish', 0 for anything else.
test_labels = [1 if sen == 'Bullish' else 0 for sen in data_test.sentiment]

In [None]:
# Attach the ground-truth 0/1 labels to the test frame.
data_test['Label'] = test_labels

In [None]:
data_test.head()

In [None]:
# Accuracy by hand: fraction of rows where the prediction equals the label.
sum(data_test.Label==prediction_labels)/len(prediction_labels)

In [None]:
print(classification_report(data_test.Label, prediction_labels))

In [None]:
# Headline metrics; with no pos_label given, precision/recall/F1 are for label 1 (Bullish).
print(accuracy_score(data_test.Label, prediction_labels))
print(precision_score(data_test.Label, prediction_labels))
print(recall_score(data_test.Label, prediction_labels))
print(f1_score(data_test.Label, prediction_labels))

In [None]:
confusion_matrix(data_test.Label,prediction_labels)

In [None]:
# Keep the predictions next to the ground truth for inspection.
data_test['prediction_label'] = prediction_labels

In [None]:
#data_test.head(15)

In [None]:
# Class balance of the ground truth ...
data_test.Label.value_counts()

In [None]:
# ... versus class balance of the predictions.
data_test.prediction_label.value_counts()

In [None]:
from sklearn.metrics import roc_curve
# A ROC curve needs continuous scores; hard 0/1 predictions give only a single
# operating point. Column 0 of `predictions` is the Bullish output, and Bullish
# is the positive class (Label == 1), so that column is the score.
bullish_scores = predictions[:, 0]
fpr_keras, tpr_keras, thresholds_keras = roc_curve(data_test['Label'].tolist(), bullish_scores)

In [None]:
# Collect the ROC points into a frame so they can be saved.
ROC_CNN_df = pd.DataFrame()
ROC_CNN_df['FPR'] = fpr_keras
ROC_CNN_df['TPR'] = tpr_keras
ROC_CNN_df.head()

In [None]:
# Write the ROC points to a CSV in the working directory (the index is written too).
ROC_CNN_df.to_csv("ROC_CNN_df.csv")

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the ROC curve
fig, ax = plt.subplots(num=2)
ax.plot(fpr_keras, tpr_keras, color='green', lw=2, label='CNN')
# The diagonal is the reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label="Chance")
ax.set_xlim([-0.01, 1.0])
ax.set_ylim([0.0, 1.01])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title("ROC curve")
ax.legend(loc="best")
plt.show()

In [None]:
from sklearn.metrics import auc
# Area under the curve defined by the FPR/TPR points computed earlier.
auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
auc_keras