<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Tutorials/Natural%20Language%20Processing/SNA_Assignemnt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [194]:
import random
import os
import re

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
import pandas as pd
from torchtext import data

from string import punctuation
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import spacy
nlp = spacy.load('en')

! pip install revtok

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Setting up custom random seeds,
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

In [196]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [197]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
data_location = 'gdrive/My Drive/Colab Notebooks/SNA Lab'
train_file = os.path.join(data_location, 'Train.csv')
test_file = os.path.join(data_location, 'Test.csv')
train_data_pd = pd.read_csv(train_file)
test_data_pd = pd.read_csv(test_file)

In [223]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [224]:
train_data_pd['Sentiment'].unique()

array(['positive', 'negative', 'neutral', 'irrelevant'], dtype=object)

In [0]:
# punc = punctuation.replace('@', '') 
# punc

In [0]:
class PreprocessingTweet:
    
    def __init__(self):
        self.sentiment_dict = {'positive' : 0,
                 'negative' : 1,
                 'neutral' : 2,
                 'irrelevant' : 9}

        self.company_dict = {'apple' : 1,
             'google' : 2,
             'microsoft': 3,
             'twitter': 4 }

    def __process_tweets__(self, tweets):
        tweets_ = []
        for tweet in tweets:
            tweet = tweet.lower() # convert text to lower-case
            tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
            # tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
            tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
            tweet = ''.join([i for i in tweet if i not in punctuation])
            tweet = ' '.join(word_tokenize(tweet))
            tweets_.append(tweet)
        return pd.Series(tweets_)
    
    def __process_sentiment__(self, sentiments):
        return sentiments.replace(self.sentiment_dict)

    def __process_companies__(self, companies):
        return companies.replace(self.company_dict)

    def fit(self, dataframe):
        dataframe['TweetText'] = self.__process_tweets__(dataframe['TweetText'])
        dataframe['Topic'] = self.__process_companies__(dataframe['Topic'])
        dataframe['Sentiment'] = self.__process_sentiment__(dataframe['Sentiment'])
        return dataframe



In [0]:
preprocessor = PreprocessingTweet()

In [0]:
train_data_pd = preprocessor.fit(train_data_pd)
test_data_pd = preprocessor.fit(test_data_pd)

In [228]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all apple has to do is get swype on the ip...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,apple will be adding more carrier support to t...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious youtube video guy does a duet with a...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,rim you made it too easy for me to switch to a...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [0]:
TWEET_TEXT = data.Field(tokenize='spacy', include_lengths = True)
TWEET_COMPANY = data.LabelField(dtype=torch.long)
SENTIMENT_LABEL = data.LabelField(dtype=torch.long)
TWEET_REVERSE = data.ReversibleField(sequential=True, lower=True, include_lengths=True)

# For Sentimental Analysis

In [0]:
sa_train_data_pd = train_data_pd[train_data_pd['Sentiment'] != 9].loc[:, ['TweetText', 'Sentiment']]
sa_test_data_pd = test_data_pd.loc[:, ['TweetText', 'Sentiment']]

In [231]:
sa_train_data_pd.head()

Unnamed: 0,TweetText,Sentiment
0,now all apple has to do is get swype on the ip...,0
1,apple will be adding more carrier support to t...,0
2,hilarious youtube video guy does a duet with a...,0
3,rim you made it too easy for me to switch to a...,0
4,i just realized that the reason i got into twi...,0


In [0]:
sa_train_data_pd.to_csv('Train_sa.csv', index=False)
sa_test_data_pd.to_csv('Test_sa.csv', index=False)

In [0]:
sa_data_fields = [  ('TweetText', TWEET_TEXT),  ('Sentiment', SENTIMENT_LABEL)]

sa_train_data, sa_test_data = data.TabularDataset.splits(
                                        path = '.',
                                        train = 'Train_sa.csv',
                                        test = 'Test_sa.csv',
                                        format = 'csv',
                                        fields = sa_data_fields,
                                        skip_header = True
                                        )

In [0]:
MAX_VOCAB_SIZE = 25000
TWEET_TEXT.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)
                       #vectors='glove.twitter.27B.100d',
                       #unk_init=torch.Tensor.normal_)
SENTIMENT_LABEL.build_vocab(sa_train_data.Sentiment)
TWEET_REVERSE.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)


In [237]:
TWEET_TEXT.vocab.freqs.most_common(10)

[('apple', 838),
 ('the', 707),
 ('to', 586),
 ('URL', 566),
 ('i', 516),
 ('a', 344),
 ('on', 340),
 ('rt', 333),
 ('is', 326),
 ('for', 325)]

In [0]:
# Create Iterator

BATCH_SIZE = 64

train_iterator_sa, test_iterator_sa = data.BucketIterator.splits(
    (sa_train_data, sa_test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.TweetText),
    sort_within_batch=True,
    device=device
)

# Defining the Model

In [0]:
class RNN(nn.Module):

    def __init__(self, vocab_size, embedded_dimensions, hidden_dimension,
                 output_dimension, n_layers, bidirectional, dropout, pad_index):
        super(RNN, self).__init__()

        self.embedded = nn.Embedding(vocab_size, embedded_dimensions, padding_idx=pad_index)
        self.lstm = nn.LSTM(embedded_dimensions,
                            hidden_dimension,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout)
        self.linear1 = nn.Linear(hidden_dimension * 2, output_dimension)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text, text_length):
        
        embedded = self.embedded(text)
        packed_sequence = nn.utils.rnn.pack_padded_sequence(embedded, text_length)

        packed_output, (hidden, output) = self.lstm(packed_sequence)

        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1))
        fc1 = self.dropout(self.linear1(hidden.squeeze(0)))
        return fc1



# Part: 1 Training Sentimental Analyser

In [0]:
# HyperParamters 

VOCAB_SIZE = len(TWEET_TEXT.vocab)
EMBEDDING_DIMENSIONS = 100
HIDDEN_DIMENSIONS = 256
OUTPUT_DIMENSIONS = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = TWEET_TEXT.vocab.stoi[TWEET_TEXT.pad_token]


model = RNN(VOCAB_SIZE, EMBEDDING_DIMENSIONS, HIDDEN_DIMENSIONS,  OUTPUT_DIMENSIONS
            , N_LAYERS, BIDIRECTIONAL,DROPOUT, PAD_IDX)

In [258]:
# Parameter count : 
total_parameters = sum(l.numel() for l in model.parameters() if l.requires_grad)
print('Total Paramters : {:,}'.format(total_parameters))

Total Paramters : 2,810,783


In [0]:
# # Update initial weight of embedding Layers
# pretrained_embeddings = TWEET_TEXT.vocab.vectors
# print(pretrained_embeddings)
# # model.embedded.weight.data.copy_(pretrained_embeddings)

In [260]:
# Setting Padding indexes to zero to not to determine sentiment
model.embedded.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIMENSIONS)
print(model.embedded.weight.data)


tensor([[ 0.1044, -0.8876,  0.0635,  ..., -0.4982, -0.3775, -0.3877],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-1.4714,  0.5284,  0.1526,  ...,  0.6863, -1.0461,  0.3615],
        ...,
        [-1.1001,  0.2496,  0.0566,  ...,  0.9288, -0.5557, -0.0671],
        [-0.4051,  2.2133,  0.3324,  ..., -0.5168,  0.2595,  0.3480],
        [-0.3920,  0.0077,  0.0081,  ...,  0.0091, -0.8764, -2.8003]])


## Training the model

In [0]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [0]:
# value, index = torch.max(torch.softmax(torch.Tensor([1,2,3,4]), dim=0), 0)
# index.item()

In [0]:
def classification_accuracy(predictions, label):
    # rounded_preds = torch.round(torch.tanh(predictions))
    # correct = (rounded_preds == label).float() #convert into float for division 
    # acc = correct.sum() / len(correct)
    # return acc

    value, index = torch.max(torch.softmax(predictions,1), 1)

    correct = (index == label).float()
    acc =   correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_accuracy = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()
        text, text_length = batch.TweetText
        predictions = model(text, text_length).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        acc = classification_accuracy(predictions, batch.Sentiment)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_accuracy += acc.item()

    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)






In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    model.eval()
    f_score = 0
    with torch.no_grad():
        for batch in iterator:
            text, text_length = batch.TweetText
            # (x, x_lengths), y = batch.TweetText, batch.Sentiment
            # orig_text = TWEET_REVERSE.reverse(x.data)
            # print(orig_text)          
            predictions = model(text, text_length).squeeze(1)
            # print(predictions, batch.Sentiment)
            loss = criterion(predictions, batch.Sentiment)
            acc = classification_accuracy(predictions, batch.Sentiment)
            f_score += f1_score(np.argmax(predictions.cpu().numpy(), axis=1), batch.Sentiment.cpu().numpy(), average='weighted' )
            epoch_loss += loss.item()
            epoch_accuracy += acc.item()
    
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator), f_score/ len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [267]:
N_EPOCHS = 25

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator_sa, optimizer, criterion)
    # valid_loss, valid_acc = evaluate(model, valid_iterator_sa, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 1.029 | Train Acc: 64.02%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.986 | Train Acc: 64.71%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.958 | Train Acc: 65.19%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.943 | Train Acc: 66.49%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.911 | Train Acc: 67.23%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.887 | Train Acc: 69.05%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.857 | Train Acc: 70.18%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.820 | Train Acc: 72.09%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.800 | Train Acc: 72.14%
Epoch: 10 | Epoch Time: 0m 0s
	Train Loss: 0.784 | Train Acc: 73.09%
Epoch: 11 | Epoch Time: 0m 0s
	Train Loss: 0.748 | Train Acc: 75.04%
Epoch: 12 | Epoch Time: 0m 0s
	Train Loss: 0.723 | Train Acc: 74.83%
Epoch: 13 | Epoch Time: 0m 0s
	Train Loss: 0.743 | Train Acc: 74.44%
Epoch: 14 | Epoch Time: 0m 0s
	Train Loss: 0.706 | Train Acc: 74.78%
Epoch: 15 | Epoch Time: 0m 0s
	Tra

In [268]:
test_loss, test_acc, f_score = evaluate(model, test_iterator_sa, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%, F-fscore: {f_score:.4f}')

Test Loss: 0.700 | Test Acc: 78.05%, F-fscore: 0.7859


In [0]:

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    indexed = [TWEET_TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    value, prediction = torch.max(model(tensor, length_tensor), 0)
    # prediction = model(tensor, length_tensor)
    reverse_key = { 0: 'Positive', 1: 'Negative', 2: 'Neutral'}
    return prediction.item() , reverse_key[prediction.item()]
    # return prediction

In [270]:
predict_sentiment(model, "@apple  why don't you guys test your upgrades before you put them out. 15 1/2 hrs u guys wasted of my time yesterday. Thanks.")

(1, 'Negative')

In [271]:
predict_sentiment(model, "google sucha brilliant way, sleek design")

(0, 'Positive')

In [274]:
TWEET_TEXT.vocab.itos[4712]

'transferred'

In [0]:
torch.save(model.state_dict(), '78_percent.pt')

# Predicting the Company

In [276]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all apple has to do is get swype on the ip...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,apple will be adding more carrier support to t...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious youtube video guy does a duet with a...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,rim you made it too easy for me to switch to a...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [0]:
vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE, min_df=5, max_df=0.7)
X_train = vectorizer.fit_transform(train_data_pd['TweetText']).toarray()
X_test = vectorizer.transform(test_data_pd['TweetText']).toarray()

In [0]:
tfidfconverter = TfidfTransformer()
X_train = tfidfconverter.fit_transform(X_train).toarray()
X_test = tfidfconverter.transform(X_test).toarray()

In [0]:
y_train = np.array(train_data_pd['Topic'])
y_test = np.array(test_data_pd['Topic'])

In [0]:
rfc = RandomForestClassifier(n_estimators=1000, random_state=SEED)
rfc.fit(X_train, y_train)
prediction_rfc = rfc.predict(X_test)

In [281]:
print(accuracy_score(prediction_rfc, y_test)*100)
print(f1_score(prediction_rfc, y_test, average='macro'))

81.87134502923976
0.8140346938028314


In [0]:
def predict_company(tweet):
    tweet = ''.join([i for i in tweet if i not in punctuation])
    tweet = vectorizer.transform([tweet.lower()]).toarray()
    tweet = tfidfconverter.transform(tweet).toarray()
    prediction = rfc.predict(tweet)
    reverse_company_dict = {1: 'apple',
                            2: 'google',
                            3: 'microsoft',
                            4: 'twitter'}
    
    return reverse_company_dict[int(prediction)]

In [283]:
predict_company("RT @techinciter: Suppose Microsoft Had Bought Siri? ")

'microsoft'

# Testing with a file

## Getting Sentiment

In [293]:
name_of_file = test_file
test_frame = pd.read_csv(name_of_file)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 18:36:46 +0000 2011,RT @JamaicanIdler: Lmao I think @apple is onto...
1,apple,positive,Mon Oct 17 14:36:34 +0000 2011,"Bravo, @Apple! http://t.co/BgoTzj7K"
2,apple,positive,Mon Oct 17 00:23:56 +0000 2011,"Day305, I'm thankful for the great customer se..."
3,apple,positive,Sun Oct 16 22:56:54 +0000 2011,i love this. so much. thank you @apple. http:...
4,apple,positive,Sun Oct 16 16:25:47 +0000 2011,I &lt;3 @apple http://t.co/ondXWpEr


In [294]:
test_frame = preprocessor.fit(test_frame)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 18:36:46 +0000 2011,rt jamaicanidler lmao i think apple is onto so...
1,1,0,Mon Oct 17 14:36:34 +0000 2011,bravo apple URL
2,1,0,Mon Oct 17 00:23:56 +0000 2011,day305 im thankful for the great customer serv...
3,1,0,Sun Oct 16 22:56:54 +0000 2011,i love this so much thank you apple URL
4,1,0,Sun Oct 16 16:25:47 +0000 2011,i lt3 apple URL


In [0]:
INPUT_TWEET_TEXT = data.Field(tokenize='spacy', include_lengths = True)
INPUT_SENTIMENTAL_LABEL = data.LabelField(dtype=torch.long)

In [0]:
INPUT_TWEET_TEXT.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)
INPUT_SENTIMENTAL_LABEL.build_vocab(sa_train_data.Sentiment)

In [0]:
input_data_field = [  ('TweetText', INPUT_TWEET_TEXT),  ('Sentiment', INPUT_SENTIMENTAL_LABEL)]

input_data = data.TabularDataset(path='./Test.csv',
                            format = 'csv',
                            fields = input_data_field,
                            skip_header = True)

In [0]:
input_data_iterator = data.BucketIterator(input_data,
                                          batch_size=BATCH_SIZE,
                                          sort_key=lambda x: len(x.TweetText),
                                          sort_within_batch=True,
                                          device=device)

In [325]:
input_loss, input_acc, input_f_score = evaluate(model, input_data_iterator, criterion)

print(f'Input Loss: {input_loss:.3f} | Input Acc: {input_acc*100:.2f}%, Input F-fscore: {input_f_score:.4f}')

Input Loss: 0.810 | Input Acc: 77.70%, Input F-fscore: 0.6805


  'precision', 'predicted', average, warn_for)


## Getting Company Information

In [50]:
test_data_x = vectorizer.transform(test_frame['TweetText']).toarray()
test_data_x = tfidfconverter.transform(test_data_x).toarray()
test_data_y = test_frame['Topic']
prediction_test_data = rfc.predict(X_test)

print(accuracy_score(prediction_test_data, test_data_y)*100)
print(f1_score(prediction_test_data, test_data_y, average='macro'))

82.45614035087719
0.8192035152581152
