In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer

from keras.initializers import Constant
from keras.models import Model
from keras.layers import *
from keras.utils.np_utils import to_categorical
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pathlib
df=pd.read_csv('/content/drive/MyDrive/archive (6)/all-data.csv')
df

Unnamed: 0,Sentiment,text_line
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [5]:
def clean_str(in_str):
    in_str = str(in_str)
    # replace urls with 'url'
    in_str = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", in_str)
    in_str = re.sub(r'([^\s\w]|_)+', '', in_str)
    return in_str.strip().lower()


df['text'] = df['text_line'].apply(clean_str)

In [6]:
df.Sentiment.value_counts()

neutral     2879
positive    1363
negative     604
Name: Sentiment, dtype: int64

In [7]:
df_0 = df[df['Sentiment'] == 'positive'].sample(frac=1)
df_1 = df[df['Sentiment'] == 'negative'].sample(frac=1)
df_2 = df[df['Sentiment'] == 'neutral'].sample(frac=1)
sample_size = 604

data = pd.concat([df_0.head(sample_size), df_1.head(sample_size), df_2.head(sample_size)]).sample(frac=1)

In [8]:
from keras.utils import pad_sequences
from tensorflow.python.keras import regularizers

In [9]:
data['l'] = data['text'].apply(lambda x: len(str(x).split(' ')))
print("mean length of sentence: " + str(data.l.mean()))
print("max length of sentence: " + str(data.l.max()))
print("std dev length of sentence: " + str(data.l.std()))

mean length of sentence: 22.4939293598234
max length of sentence: 55
std dev length of sentence: 9.968889575682285


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
num_unique_tokens = len(tokenizer.word_index)
print(num_unique_tokens)

6167


In [11]:
max_features = num_unique_tokens+1
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>')
tokenizer.fit_on_texts(data['text'].values)


In [12]:
import json
word_index = tokenizer.word_index

word_to_index = tokenizer.word_index
file_path = 'w2i1.json'
with open(file_path, 'w') as json_file:
    json.dump(word_to_index, json_file)

In [13]:
sequence_length=55
num_filters=100
embedding_dim=200

In [14]:
inputs = Input(shape=(sequence_length,), dtype='int32')


embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=sequence_length)(inputs)

reshape = Reshape((sequence_length, embedding_dim, 1))(embedding_layer)


conv_0 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape)
maxpool_0 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2)

# concat and flatten
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)

# do dropout and predict
dropout = Dropout(0.5)(flatten)
output = Dense(units=3, activation='softmax')(dropout)

In [15]:
temp_model = tf.keras.Model(inputs=inputs, outputs=embedding_layer)
embedding_weights = temp_model.get_weights()[0]
embedding_matrix_file = 'matrics1.npy'
np.save(embedding_matrix_file, embedding_weights)

In [16]:
from tensorflow.keras.callbacks import ModelCheckpoint
ck=ModelCheckpoint("model3.h5",monitor="val_accuracy",save_best_only=True,verbose=1)

In [17]:
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 55)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 55, 200)      1233600     ['input_1[0][0]']                
                                                                                                  
 reshape (Reshape)              (None, 55, 200, 1)   0           ['embedding[0][0]']              
                                                                                                  
 conv2d (Conv2D)                (None, 53, 1, 100)   60100       ['reshape[0][0]']                
                                                                                            

In [18]:
X = tokenizer.texts_to_sequences(data['text'].values)


X = pad_sequences(X, sequence_length,padding='post')

y = pd.get_dummies(data['Sentiment']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [19]:
batch_size =50
history = model.fit(X, y, epochs=100, batch_size=batch_size, verbose=1, validation_data=(X_test,y_test), shuffle=True, callbacks=[ck])

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.34615, saving model to model3.h5
Epoch 2/100
Epoch 2: val_accuracy did not improve from 0.34615
Epoch 3/100
Epoch 3: val_accuracy improved from 0.34615 to 0.37912, saving model to model3.h5
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.37912
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.37912
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.37912
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.37912
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.37912
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.37912
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.37912
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.37912
Epoch 12/100
Epoch 12: val_accuracy did not improve from 0.37912
Epoch 13/100
Epoch 13: val_accuracy did not improve from 0.37912
Epoch 14/100
Epoch 14: val_accuracy did not improve from 0.37912
Epoch 15/100
Epoch 15: val_accuracy improved from

In [20]:
from keras.saving.saving_api import load_model
model_1=load_model('model3.h5')

In [21]:
dff=pd.read_excel('/content/drive/MyDrive/DryBulk_MarketNews_v2_20220916.xlsx')
dff = dff[['Sent_Text', 'Sentiment']]
dff.head(5)

Unnamed: 0,Sent_Text,Sentiment
0,"VIETNAM PRODUCED 4.48 MT OF COAL IN JUL, UP 3....",Negative
1,INDIA'S WHEAT CROP IN 2022-23 IS REVISED DOWN ...,Negative
2,CHINA’S STEEL INDUSTRY WHICH RELIES AT LEAST 1...,Negative
3,"RECORD LOW LVL OF RHINE RIVER(IN SOME CASES, V...",Negative
4,FIRST GRAIN SHIPMENT EX UKRAINE PASSED INSPECT...,Positive


In [22]:
def clean_str(in_str):
    in_str = str(in_str)
    # replace urls with 'url'
    in_str = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", in_str)
    in_str = re.sub(r'([^\s\w]|_)+', '', in_str)
    return in_str.strip().lower()


dff['SentText'] = dff['Sent_Text'].apply(clean_str)

In [23]:
dff[dff['Sentiment'] == 'Nagative']

Unnamed: 0,Sent_Text,Sentiment,SentText
29,"COAL EXPORTS FROM Q'LAND IN JUL WAS 15.26 MT, ...",Nagative,coal exports from qland in jul was 1526 mt dow...


In [24]:
dff.iloc[29]["Sentiment"]="Negative"

In [25]:

dff['Sentiment']=dff['Sentiment'].str.lower()

In [26]:
#dff[dff['Sentiment'] == 'negative']

In [27]:
test_seq = tokenizer.texts_to_sequences(dff['SentText'].values)
X_test = pad_sequences(test_seq, maxlen=sequence_length,padding='post')
y_test = pd.get_dummies(dff['Sentiment']).values
y_pred = model_1.predict(X_test)



In [28]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_pred)))*100

56.22119815668203

In [29]:
print(classification_report(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_pred))))

              precision    recall  f1-score   support

           0       0.60      0.74      0.66        97
           1       0.15      0.09      0.11        22
           2       0.58      0.49      0.53        98

    accuracy                           0.56       217
   macro avg       0.44      0.44      0.44       217
weighted avg       0.54      0.56      0.55       217



In [30]:
embeddings_index = {}
f = open(os.path.join('/content/drive/MyDrive/Colab Notebooks/archive (11)/glove.6B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [31]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6168 unique tokens.


In [32]:
num_words = min(max_features, len(word_index)) + 1
print(num_words)

# first create a matrix of zeros, this is our embedding matrix
embedding_matrix = np.zeros((num_words, embedding_dim))

# for each word in out tokenizer lets try to find that work in our w2v model
for word, i in word_index.items():
    if i > max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)


6169


In [33]:
inputs_2 = Input(shape=(sequence_length,), dtype='int32')

# note the `trainable=False`, later we will make this layer trainable
embedding_layer_2 = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=sequence_length,
                            trainable=False)(inputs_2)

reshape_2 = Reshape((sequence_length, embedding_dim, 1))(embedding_layer_2)

conv_0_2 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)
conv_1_2 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)
conv_2_2 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_2)
maxpool_0_2 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0_2)
maxpool_1_2 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1_2)
maxpool_2_2 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2_2)

concatenated_tensor_2 = Concatenate(axis=1)([maxpool_0_2, maxpool_1_2, maxpool_2_2])
flatten_2 = Flatten()(concatenated_tensor_2)

dropout_2 = Dropout(0.5)(flatten_2)
output_2 = Dense(units=3, activation='softmax')(dropout_2)

In [34]:
from tensorflow.keras.callbacks import ModelCheckpoint
ck=ModelCheckpoint("model4.h5",monitor="val_accuracy",save_best_only=True,verbose=1)

In [35]:
model_2 = Model(inputs=inputs_2, outputs=output_2)
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_2.summary())

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 55)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 55, 200)      1233800     ['input_2[0][0]']                
                                                                                                  
 reshape_1 (Reshape)            (None, 55, 200, 1)   0           ['embedding_1[0][0]']            
                                                                                                  
 conv2d_3 (Conv2D)              (None, 53, 1, 100)   60100       ['reshape_1[0][0]']              
                                                                                            

In [36]:
batch_size=50
history_2 = model_2.fit(X, y, epochs=100, batch_size=batch_size, verbose=1, validation_data=(X_test,y_test), shuffle=True, callbacks=[ck])

Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.44700, saving model to model4.h5
Epoch 2/100
Epoch 2: val_accuracy did not improve from 0.44700
Epoch 3/100
Epoch 3: val_accuracy did not improve from 0.44700
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.44700
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.44700
Epoch 6/100
Epoch 6: val_accuracy improved from 0.44700 to 0.45161, saving model to model4.h5
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.45161
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.45161
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.45161
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.45161
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.45161
Epoch 12/100
Epoch 12: val_accuracy did not improve from 0.45161
Epoch 13/100
Epoch 13: val_accuracy did not improve from 0.45161
Epoch 14/100
Epoch 14: val_accuracy did not improve from 0.45161
Epoch 15/100
Epoch 15: val_accuracy did not impro

In [38]:
from keras.saving.saving_api import load_model
model_2=load_model('model4.h5')

In [39]:
test_seq = tokenizer.texts_to_sequences(dff['SentText'].values)
X_test = pad_sequences(test_seq, maxlen=sequence_length,padding='post')
y_test = pd.get_dummies(dff['Sentiment']).values
y_pred = model_2.predict(X_test)




In [40]:
# y_test = pd.get_dummies(dff['Sentiment'])
# print(y_test)

In [41]:
#print(y_pred)

In [42]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_pred)))*100

54.83870967741935

In [47]:
inputs_3 = Input(shape=(sequence_length,), dtype='int32')
embedding_layer_3 = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=sequence_length,
                            trainable=True)(inputs_3)

reshape_3 = Reshape((sequence_length, embedding_dim, 1))(embedding_layer_3)

# note the relu activation
conv_0_3 = Conv2D(num_filters, kernel_size=(3, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)
conv_1_3 = Conv2D(num_filters, kernel_size=(4, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)
conv_2_3 = Conv2D(num_filters, kernel_size=(5, embedding_dim), activation='relu', kernel_regularizer=regularizers.l2(3))(reshape_3)
maxpool_0_3 = MaxPool2D(pool_size=(sequence_length - 3 + 1, 1), strides=(1,1), padding='valid')(conv_0_3)
maxpool_1_3 = MaxPool2D(pool_size=(sequence_length - 4 + 1, 1), strides=(1,1), padding='valid')(conv_1_3)
maxpool_2_3 = MaxPool2D(pool_size=(sequence_length - 5 + 1, 1), strides=(1,1), padding='valid')(conv_2_3)

concatenated_tensor_3 = Concatenate(axis=1)([maxpool_0_3, maxpool_1_3, maxpool_2_3])
flatten_3 = Flatten()(concatenated_tensor_3)

dropout_3 = Dropout(0.5)(flatten_3)
output_3 = Dense(units=3, activation='softmax')(dropout_3)

In [48]:
from tensorflow.keras.callbacks import ModelCheckpoint
ck=ModelCheckpoint("model5.h5",monitor="val_accuracy",save_best_only=True,verbose=1)

In [49]:
model_3 = Model(inputs=inputs_3, outputs=output_3)
model_3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_3.summary())

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 55)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 55, 200)      1233800     ['input_4[0][0]']                
                                                                                                  
 reshape_3 (Reshape)            (None, 55, 200, 1)   0           ['embedding_3[0][0]']            
                                                                                                  
 conv2d_9 (Conv2D)              (None, 53, 1, 100)   60100       ['reshape_3[0][0]']              
                                                                                            

In [50]:
batch_size=50
history_3 = model_3.fit(X, y, epochs=150, batch_size=batch_size, verbose=1, validation_data=(X_test,y_test), shuffle=True, callbacks=[ck])

Epoch 1/150
Epoch 1: val_accuracy improved from -inf to 0.45161, saving model to model5.h5
Epoch 2/150
Epoch 2: val_accuracy improved from 0.45161 to 0.50230, saving model to model5.h5
Epoch 3/150
Epoch 3: val_accuracy did not improve from 0.50230
Epoch 4/150
Epoch 4: val_accuracy did not improve from 0.50230
Epoch 5/150
Epoch 5: val_accuracy did not improve from 0.50230
Epoch 6/150
Epoch 6: val_accuracy did not improve from 0.50230
Epoch 7/150
Epoch 7: val_accuracy did not improve from 0.50230
Epoch 8/150
Epoch 8: val_accuracy did not improve from 0.50230
Epoch 9/150
Epoch 9: val_accuracy improved from 0.50230 to 0.64055, saving model to model5.h5
Epoch 10/150
Epoch 10: val_accuracy did not improve from 0.64055
Epoch 11/150
Epoch 11: val_accuracy did not improve from 0.64055
Epoch 12/150
Epoch 12: val_accuracy did not improve from 0.64055
Epoch 13/150
Epoch 13: val_accuracy did not improve from 0.64055
Epoch 14/150
Epoch 14: val_accuracy improved from 0.64055 to 0.71429, saving model 

In [51]:
from keras.saving.saving_api import load_model
model_3=load_model('model5.h5')

In [52]:
test_seq = tokenizer.texts_to_sequences(dff['SentText'].values)
X_test = pad_sequences(test_seq, maxlen=sequence_length,padding='post')
y_test = pd.get_dummies(dff['Sentiment']).values
y_pred = model_3.predict(X_test)



In [53]:
accuracy_score(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_pred)))*100

71.42857142857143

In [54]:
print(classification_report(list(map(lambda x: np.argmax(x), y_test)), list(map(lambda x: np.argmax(x), y_pred))))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77        97
           1       0.00      0.00      0.00        22
           2       0.68      0.81      0.74        98

    accuracy                           0.71       217
   macro avg       0.48      0.53      0.50       217
weighted avg       0.65      0.71      0.68       217

