### The Objective:
In this notebook I will look into comparing models and their accuracies. Which Natural Language Processing model is best suited to extracct sentiment from a text.


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string

#NLTK
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorboard.plugins import projector
import IPython
from sklearn.model_selection import train_test_split
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [17]:
df=pd.read_csv('D:\Codes\SentimentAnalysis\Twitter_Data.csv')
df=df.dropna()

In [18]:
df=df[df.category!=0.0]

In [19]:
df.category.value_counts()

 1.0    72249
-1.0    35509
Name: category, dtype: int64

In [20]:
df.isna().sum()

clean_text    0
category      0
dtype: int64

In [21]:
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
8,with upcoming election india saga going import...,1.0
...,...,...
162972,engine growth modi unveils indias first 12000 ...,1.0
162973,modi promised 2014 lok sabha elections that be...,1.0
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0


#### Train-test-split


In [22]:
training_sentences_unprocessed, testing_sentences_unprocessed, training_labels, testing_labels=train_test_split(df.clean_text, df.category, test_size=0.33, random_state=42)

#### Preprocessing the tweets

In [23]:
# Processing the tweets
def process_tweet(tweet):
    vocab_size = 10000
    oov_tok = '<OOV>'
 
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove twitter abbreviations
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweet

In [24]:
training_sentences = []
for i, tweet in enumerate(training_sentences_unprocessed): 
    training_sentences.append(process_tweet(tweet))

testing_sentences = []
for i, tweet in enumerate(testing_sentences_unprocessed): 
    testing_sentences.append(process_tweet(tweet))

# Convert labels lists to numpy array
training_labels_final = []
testing_labels_final = []

for i in training_labels:
    training_labels_final.append(tf.cast(i, tf.int32))

for i in testing_labels:
    testing_labels_final.append(tf.cast(i, tf.int32))


training_labels_final = np.array(training_labels_final)
testing_labels_final = np.array(testing_labels_final)

In [25]:
# Parameters

vocab_size = 7000
max_length = 120
embedding_dim = 16
trunc_type='post'
oov_tok = "<OOV>"

In [26]:
# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

# Generate the word index dictionary for the training sentences
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Generate and pad the training sequences
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

# Generate and pad the test sequences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

In [29]:
import tensorflow as tf

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Setup the training parameters
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 16)           112000    
                                                                 
 flatten_3 (Flatten)         (None, 1920)              0         
                                                                 
 dense_6 (Dense)             (None, 6)                 11526     
                                                                 
 dense_7 (Dense)             (None, 1)                 7         
                                                                 
Total params: 123,533
Trainable params: 123,533
Non-trainable params: 0
_________________________________________________________________


In [30]:
num_epochs = 10

# Train the model
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
 425/2257 [====>.........................] - ETA: 11s - loss: 0.5475 - accuracy: 0.0119

KeyboardInterrupt: 

In [73]:
training_labels_final

array([-1,  1,  1, ...,  1,  1,  1])