In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from nltk import tokenize
import random
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Load the financial-news sentiment CSV; the two column names are supplied
# explicitly and the file is decoded as latin-1.
df = pd.read_csv(
    '../input/sentiment-analysis-for-financial-news/all-data.csv',
    names=['Label', 'Text'],
    encoding='latin-1',
)


In [None]:
# Keep only the rows whose label is not 'neutral'.
is_neutral = df['Label'] == 'neutral'
df = df[~is_neutral]

In [None]:
# Renumber the rows 0..n-1 after the neutral rows were removed.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, which also makes this cell safe to re-run: without it a
# second run adds 'level_0' and a third run raises ValueError.
df = df.reset_index(drop=True)

In [None]:
# Distinct label values that remain in the frame.
df['Label'].unique()

In [None]:
# Number of rows per label value.
df['Label'].value_counts()

# Tokenization and cleaning special characters

In [None]:
def replace_special_words(sentence):
    """Replace every run of non-alphanumeric characters with one space.

    Only ASCII letters and digits are kept; any other run of characters
    (punctuation, whitespace, accented letters, ...) collapses to a single
    space. Leading/trailing runs therefore become a leading/trailing space.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned string.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', sentence)
    
def tokenize_(sentences):
    """Convert sentences into an equal-length matrix of word indices.

    A new Keras ``Tokenizer`` is fitted on ``sentences`` and then used to
    encode those same sentences. The fitted word index and the length of the
    longest encoded sentence are printed, and every sequence is post-padded
    to that length.

    Args:
        sentences: The texts to fit the tokenizer on and to encode.

    Returns:
        The padded sequences produced by ``pad_sequences``.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(sentences)
    encoded = text_tokenizer.texts_to_sequences(sentences)

    print(text_tokenizer.word_index)

    # Pad everything to the longest sentence so no tokens are cut off.
    max_sentences_len = np.max([len(tokens) for tokens in encoded])
    print('max_len :', max_sentences_len)

    return pad_sequences(encoded, maxlen=max_sentences_len, padding='post')
    
    
    

    

In [None]:
# Clean every text of the frame with replace_special_words.
df_text = [replace_special_words(text) for text in df['Text'].tolist()]

In [None]:
# Encode the cleaned texts as padded sequences of word indices.
df_text_tokenized = tokenize_(sentences=df_text)

In [None]:
def visualize_word_with_its_tokens(n):
    """Print the n-th cleaned text followed by its tokenized form.

    Reads the notebook-level ``df_text`` and ``df_text_tokenized``.

    Args:
        n: Position of the sample to show.
    """
    for collection in (df_text, df_text_tokenized):
        print(collection[n])

In [None]:
# Show the sample at position 2 next to its token ids.
visualize_word_with_its_tokens(n=2)

# Preparing and splitting data

In [None]:
def preprocessing_data(seqs, label):
    """Binarize the labels and split the data into train and test parts.

    ``'positive'`` is encoded as 1 and every other label value as 0. The
    split shuffles the samples with ``random_state=1`` and puts 75% of them
    in the training part.

    Args:
        seqs: The feature rows, one per sample.
        label: The label strings, one per sample.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    encoded_labels = np.array([1 if name == 'positive' else 0 for name in label])

    # train_test_split returns a list; convert it so callers still get a tuple.
    return tuple(
        train_test_split(
            seqs,
            encoded_labels,
            train_size=0.75,
            shuffle=True,
            random_state=1,
        )
    )
    
    
        

In [None]:
# Split the tokenized texts and their labels into train and test sets.
X_train, X_test, y_train, y_test = preprocessing_data(
    seqs=df_text_tokenized,
    label=df['Label'].tolist(),
)

# Building, training and evaluating the model

In [None]:
# Dimensions of the held-out test matrix.
np.shape(X_test)

In [None]:
import tensorflow as tf

In [None]:
# Size of the embedding table. It is derived from the data instead of being
# hard-coded (the original used the literal 5527, which only fits one
# particular vocabulary): the largest token id anywhere in the padded matrix,
# plus one because ids are used as zero-based row indices. Using the full
# matrix rather than X_train guarantees that ids seen only in the test split
# are covered as well.
vocab_size = int(df_text_tokenized.max()) + 1
sequence_length = X_train.shape[1]

inputs = tf.keras.Input(shape=(sequence_length,))

# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument of Embedding is not passed.
x = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)

# The GRU returns its output for every time step; these are flattened into a
# single vector per sample before the classification layer.
x = tf.keras.layers.GRU(256, return_sequences=True, activation='relu')(x)
x = tf.keras.layers.Flatten()(x)

# Two softmax outputs, one per class (labels are the integers 0 and 1, hence
# the sparse categorical cross-entropy loss below).
outputs = tf.keras.layers.Dense(2, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the model for 20 epochs in batches of 32, passing validation_split=0.2
# so that part of the training data is used for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)

In [None]:
# Score the trained model on the held-out test set without progress output.
results = model.evaluate(X_test, y_test, verbose=0)

# The first entry is reported as the loss, the second as the accuracy.
test_loss, test_accuracy = results[0], results[1]
print("    Test Loss: {:.5f}".format(test_loss))
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

In [None]:
# Model outputs for every test sample.
y_predict = model.predict(x=X_test)