<a href="https://colab.research.google.com/github/projjal1/Neural_Networks_Projects/blob/master/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install tensorflow-gpu

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras

In [None]:
# Download the raw tweet-sentiment CSV; `file` is what load_data() later passes
# to pandas.
url = 'https://raw.githubusercontent.com/projjal1/datasets/master/twitter_unprocessed_sentiment.csv'
file = tf.keras.utils.get_file(
    fname='twitter_unprocessed_sentiment.csv',
    origin=url,
)

Downloading data from https://raw.githubusercontent.com/projjal1/datasets/master/twitter_unprocessed_sentiment.csv


In [None]:
# Numeric sentiment code -> human-readable class name.
idx2class = {0: 'negative', 2: 'neutral', 4: 'positive'}
# Reverse lookup (class name -> numeric code), derived from the mapping above.
class_names = {name: code for code, name in idx2class.items()}

In [None]:
def load_data():
    """
    Read the downloaded CSV and return the tweets with their sentiment codes.

    The file at the module-level path ``file`` is parsed as ISO-8859-1. The
    ``content`` column supplies the texts and the ``review`` column the labels.

    Returns:
        tuple: ``(texts, labels)`` taken from the two columns.
    """
    frame = pd.read_csv(file, encoding='ISO-8859-1')
    sentiment = frame['review']
    tweets = frame['content']
    return tweets, sentiment


text, label = load_data()

In [None]:
# Preview the first four tweets next to their sentiment codes.
for row_idx in range(4):
    print(text[row_idx], label[row_idx])

is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah! 0
@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds 0
my whole body feels itchy and like its on fire  0
@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.  0


In [None]:
# Build the vocabulary by fitting a Keras tokenizer on every tweet.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text)

# Replace each tweet with its list of integer word indices.
# NOTE: this rebinds `text`, so re-running this cell on its own would fit the
# tokenizer on integer sequences instead of raw tweets.
text = tokenizer.texts_to_sequences(text)

In [None]:
# Show one tokenized tweet: a list of integer word indices.
text[5]

[86, 4, 892]

Let us define hyperparameters

In [None]:
SEQUENCE_LENGTH = 100  # number of token ids per sample after padding
EMBEDDING_SIZE = 100   # dimensionality of the GloVe embedding vectors
TEST_SIZE = 0.25       # fraction of the data held out for testing
BATCH_SIZE = 64        # batch size for data

In [None]:
# The labels are a flat column, so they convert to a NumPy array directly.
labels = np.array(label)

# The tokenized tweets have different lengths. Wrapping that ragged list in
# np.array() raises ValueError on recent NumPy releases, so hand the list
# straight to pad_sequences, which returns a uniform 2-D array of width
# SEQUENCE_LENGTH.
texts = keras.preprocessing.sequence.pad_sequences(text, maxlen=SEQUENCE_LENGTH)

In [None]:
# One-hot encode the sentiment codes. The codes listed in idx2class are 0, 2
# and 4, so each encoded row is five wide (see the sample output below);
# slots 1 and 3 are presumably never set.
# NOTE: this rebinds `labels`, so the cell should not be re-run on its own.
labels = keras.utils.to_categorical(labels)

In [None]:
# Show one one-hot encoded label row.
labels[6]

array([1., 0., 0., 0., 0.], dtype=float32)

Now we split the dataset into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SIZE of the samples for testing, with a fixed random_state.
X_train, X_test, Y_train, Y_test = train_test_split(
    texts,
    labels,
    test_size=TEST_SIZE,
    random_state=7,
)

In [None]:
import tqdm

We will load pretrained GloVe word vectors and use them as the weights of the embedding layer.

In [None]:
def get_embedding_vectors(tokenizer, dim=100):
    """
    Build an embedding matrix for the tokenizer's vocabulary from GloVe vectors.

    Reads ``data/glove.6B.{dim}d.txt``, where each line holds a word followed
    by its whitespace-separated coefficients, and copies the vector of every
    vocabulary word found there into the matrix row given by that word's
    tokenizer index.

    Args:
        tokenizer: fitted tokenizer exposing a ``word_index`` mapping (word -> int).
        dim: vector dimensionality; selects the GloVe file and the matrix width.

    Returns:
        NumPy array of shape ``(len(word_index) + 1, dim)``. Rows of words that
        have no GloVe vector stay all-zero.
    """
    glove_path = f"data/glove.6B.{dim}d.txt"
    glove = {}
    with open(glove_path, encoding='utf8') as handle:
        for raw_line in tqdm.tqdm(handle, "Reading GloVe"):
            parts = raw_line.split()
            # first field is the word, the remainder are its coefficients
            glove[parts[0]] = np.asarray(parts[1:], dtype='float32')

    vocab = tokenizer.word_index
    matrix = np.zeros((len(vocab) + 1, dim))
    for token, row in vocab.items():
        vector = glove.get(token)
        if vector is None:
            # no pretrained vector for this word: leave its row as zeros
            continue
        matrix[row] = vector

    return matrix

In [None]:
def get_model(tokenizer, lstm_units):
    """
    Construct and compile the sentiment classifier.

    Architecture: frozen GloVe embedding => LSTM => Dropout => 5-unit
    fully-connected layer with softmax activation.

    Args:
        tokenizer: fitted tokenizer; its ``word_index`` sizes the vocabulary
            and selects the GloVe rows.
        lstm_units: number of units in the LSTM layer.

    Returns:
        The compiled Keras ``Sequential`` model. Its summary is printed as a
        side effect.
    """
    # Request vectors of exactly EMBEDDING_SIZE so the matrix width always
    # matches the Embedding layer's output dimension, even if the constant is
    # changed later.
    embedding_matrix = get_embedding_vectors(tokenizer, dim=EMBEDDING_SIZE)
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,  # keep the pretrained vectors fixed during training
              input_length=SEQUENCE_LENGTH))

    model.add(keras.layers.LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(keras.layers.Dropout(0.3))
    # Five outputs, matching the five-wide one-hot label rows.
    model.add(keras.layers.Dense(5, activation="softmax"))
    # RMSprop optimizer, categorical cross-entropy loss, accuracy as the metric.
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    model.summary()
    return model

In [None]:
# Build the classifier with a 128-unit LSTM layer.
model = get_model(tokenizer, lstm_units=128)

Reading GloVe: 143726it [00:04, 31302.18it/s]


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          48628500  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
Total params: 48,746,393
Trainable params: 117,893
Non-trainable params: 48,628,500
_________________________________________________________________


In [None]:
# Train for two epochs on the training split.
# NOTE(review): the batch size is hard-coded to 1024 here while BATCH_SIZE (64)
# from the hyperparameter cell goes unused, and X_test/Y_test are not evaluated
# in the visible cells - confirm what is intended.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=1024,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f5cabc32128>

In [None]:
def get_predictions(text):
    """
    Predict the sentiment class name for a single piece of raw text.

    Args:
        text: the raw string to classify.

    Returns:
        One of the class names stored in ``idx2class``.
    """
    # The tokenizer works on a list of texts, so wrap the single string.
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = keras.preprocessing.sequence.pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    # The output vector has more slots than there are named classes: only the
    # indices present in idx2class map to a label. Pick the highest-scoring
    # index among those, so an unmapped slot winning a plain argmax can no
    # longer raise KeyError.
    known_indices = sorted(idx2class)
    best = max(known_indices, key=lambda idx: prediction[idx])
    return idx2class[best]

In [None]:
# Try the classifier on a promotional, spam-like message.
# NOTE: this rebinds the global `text`, which earlier cells used for the
# tokenized tweets; re-run the notebook from the top before revisiting them.
text = "Congratulations! you have won 100,000$ this week, click here to claim fast"
print(get_predictions(text))

positive


In [None]:
# Try the classifier on a sentence describing a tragic event.
# NOTE: this rebinds the global `text` (earlier used for the tokenized tweets).
text = "a dear friend of mine commited suicide with a shotgun two years ago"
print(get_predictions(text))

negative


In [None]:
# Try the classifier on a short greeting that starts with an @-mention.
# NOTE: this rebinds the global `text` (earlier used for the tokenized tweets).
text="@reda Hello from Texas"
print(get_predictions(text))

positive


In [None]:
# Try the classifier on a news-headline style sentence.
# NOTE: this rebinds the global `text` (earlier used for the tokenized tweets).
text = "Trump attempts to clarify 'blood coming out of wherever' remark about Megan Kelly."
print(get_predictions(text))

negative
