In [0]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
import pandas as pd
from torchtext import data
import random
import os
from string import punctuation
import spacy
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [0]:
# Setting up custom random seeds,
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

In [3]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
data_location = 'gdrive/My Drive/Colab Notebooks/SNA Lab'
train_file = os.path.join(data_location, 'Train.csv')
test_file = os.path.join(data_location, 'Test.csv')
train_data_pd = pd.read_csv(train_file)
test_data_pd = pd.read_csv(test_file)

In [6]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [7]:
train_data_pd['Sentiment'].unique()

array(['positive', 'negative', 'neutral', 'irrelevant'], dtype=object)

In [8]:
punc = punctuation.replace('@', '') 
punc

'!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

In [9]:
# Change sentiments and Topics to numbers 
company_dict = {'apple' : 1,
             'google' : 2,
             'microsoft': 3,
             'twitter': 4 }
sentiment_dict = {'positive' : 0,
                 'negative' : 1,
                 'neutral' : 2,
                 'irrelevant' : 9}




def pre_process(dataframe):
    dataframe['Topic'] = dataframe['Topic'].replace(company_dict)
    dataframe['Sentiment'] = dataframe['Sentiment'].replace(sentiment_dict)

    dataframe['TweetText'] = dataframe['TweetText'].apply(lambda x: x.lower())
    dataframe['TweetText'] = dataframe['TweetText'].apply(lambda x:''.join([i for i in x 
                                                  if i not in punc]))


    return dataframe


train_data_pd = pre_process(train_data_pd)
test_data_pd = pre_process(test_data_pd)

train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all @apple has to do is get swype on the i...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,@apple will be adding more carrier support to ...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious @youtube video guy does a duet with...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,@rim you made it too easy for me to switch to ...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [10]:
train_data_pd['TweetText'][20:30]

20    ipads replace bound playbooks on some nfl team...
21                                      @applegood ipad
22                      @apple @siri is efffing amazing
23    amazing new @apple ios 5 feature  httptcojatfvfpm
24    were one of a few featured education apps on t...
25    when you want something done right you do it y...
26               we did an unexpected workshop for the 
27                                                 lt3 
28    » rt @apple no question bro rt @ainteeentrippi...
29    rt @imightbewrong im over people bitching abou...
Name: TweetText, dtype: object

In [0]:
TWEET_TEXT = data.Field(tokenize='spacy', include_lengths = True)
TWEET_COMPANY = data.LabelField(dtype=torch.long)
SENTIMENT_LABEL = data.LabelField(dtype=torch.long)

# For Sentimental Analysis

In [0]:
sa_train_data_pd = train_data_pd[train_data_pd['Sentiment'] != 9].loc[:, ['TweetText', 'Sentiment']]
sa_test_data_pd = test_data_pd[test_data_pd['Sentiment'] != 9].loc[:, ['TweetText', 'Sentiment']]

In [104]:
sa_train_data_pd.head()

Unnamed: 0,TweetText,Sentiment
0,now all @apple has to do is get swype on the i...,0
1,@apple will be adding more carrier support to ...,0
2,hilarious @youtube video guy does a duet with...,0
3,@rim you made it too easy for me to switch to ...,0
4,i just realized that the reason i got into twi...,0


In [0]:
sa_train_data_pd.to_csv('Train_sa.csv', index=False)
sa_test_data_pd.to_csv('Test_sa.csv', index=False)

In [0]:
sa_data_fields = [  ('TweetText', TWEET_TEXT),  ('Sentiment', SENTIMENT_LABEL)]

sa_train_data, sa_test_data = data.TabularDataset.splits(
                                        path = '.',
                                        train = 'Train_sa.csv',
                                        test = 'Test_sa.csv',
                                        format = 'csv',
                                        fields = sa_data_fields,
                                        skip_header = True
                                        )

In [0]:
MAX_VOCAB_SIZE = 25000
TWEET_TEXT.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)
                       #vectors='glove.twitter.27B.100d',
                       #unk_init=torch.Tensor.normal_)
SENTIMENT_LABEL.build_vocab(sa_train_data.Sentiment)

In [17]:
TWEET_TEXT.vocab.freqs.most_common(10)

[('@apple', 771),
 ('the', 707),
 (' ', 592),
 ('to', 586),
 ('i', 516),
 ('a', 344),
 ('on', 340),
 ('rt', 333),
 ('is', 326),
 ('for', 325)]

In [0]:
# Create Iterator

BATCH_SIZE = 64

train_iterator_sa, test_iterator_sa = data.BucketIterator.splits(
    (sa_train_data, sa_test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.TweetText),
    sort_within_batch=True,
    device=device
)

# Defining the Model

In [0]:
class RNN(nn.Module):

    def __init__(self, vocab_size, embedded_dimensions, hidden_dimension,
                 output_dimension, n_layers, bidirectional, dropout, pad_index):
        super(RNN, self).__init__()

        self.embedded = nn.Embedding(vocab_size, embedded_dimensions, padding_idx=pad_index)
        self.lstm = nn.LSTM(embedded_dimensions,
                            hidden_dimension,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout)
        self.linear1 = nn.Linear(hidden_dimension * 2, output_dimension)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text, text_length):
        
        embedded = self.embedded(text)
        packed_sequence = nn.utils.rnn.pack_padded_sequence(embedded, text_length)

        packed_output, (hidden, output) = self.lstm(packed_sequence)

        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1))
        fc1 = self.dropout(self.linear1(hidden.squeeze(0)))
        return fc1



# Part: 1 Training Sentimental Analyser

In [0]:
# HyperParamters 

VOCAB_SIZE = len(TWEET_TEXT.vocab)
EMBEDDING_DIMENSIONS = 100
HIDDEN_DIMENSIONS = 256
OUTPUT_DIMENSIONS = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = TWEET_TEXT.vocab.stoi[TWEET_TEXT.pad_token]


model = RNN(VOCAB_SIZE, EMBEDDING_DIMENSIONS, HIDDEN_DIMENSIONS,  OUTPUT_DIMENSIONS
            , N_LAYERS, BIDIRECTIONAL,DROPOUT, PAD_IDX)

In [21]:
# Parameter count : 
total_parameters = sum(l.numel() for l in model.parameters() if l.requires_grad)
print('Total Paramters : {:,}'.format(total_parameters))

Total Paramters : 2,869,883


In [0]:
# # Update initial weight of embedding Layers
# pretrained_embeddings = TWEET_TEXT.vocab.vectors
# print(pretrained_embeddings)
# # model.embedded.weight.data.copy_(pretrained_embeddings)

In [23]:
# Setting Padding indexes to zero to not to determine sentiment
model.embedded.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIMENSIONS)
print(model.embedded.weight.data)


tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.5903, -0.1947, -0.2415],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7289, -0.7336,  1.5624,  ..., -0.5592, -0.4480, -0.6476],
        ...,
        [-1.2337,  0.5082,  1.6095,  ..., -0.2820,  0.7624,  0.0363],
        [ 1.3951, -0.1898,  1.2610,  ..., -1.4310,  0.4310,  0.4169],
        [-1.4680,  1.7992,  0.4500,  ..., -1.1899,  1.0843, -1.6378]])


## Training the model

In [0]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [0]:
# value, index = torch.max(torch.softmax(torch.Tensor([1,2,3,4]), dim=0), 0)
# index.item()

In [0]:
def classification_accuracy(predictions, label):
    # rounded_preds = torch.round(torch.tanh(predictions))
    # correct = (rounded_preds == label).float() #convert into float for division 
    # acc = correct.sum() / len(correct)
    # return acc

    value, index = torch.max(torch.softmax(predictions,1), 1)

    correct = (index == label).float()
    acc =   correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_accuracy = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()
        text, text_length = batch.TweetText
        predictions = model(text, text_length).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        acc = classification_accuracy(predictions, batch.Sentiment)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_accuracy += acc.item()

    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)






In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    model.eval()
    f_score = 0
    with torch.no_grad():
        for batch in iterator:
            text, text_length = batch.TweetText
            predictions = model(text, text_length).squeeze(1)
            # print(predictions, batch.Sentiment)
            loss = criterion(predictions, batch.Sentiment)
            acc = classification_accuracy(predictions, batch.Sentiment)
            f_score += f1_score(np.argmax(predictions.cpu().numpy(), axis=1), batch.Sentiment.cpu().numpy(),average='weighted' )
            epoch_loss += loss.item()
            epoch_accuracy += acc.item()
    
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator), f_score/ len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
N_EPOCHS = 25

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator_sa, optimizer, criterion)
    # valid_loss, valid_acc = evaluate(model, valid_iterator_sa, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 1.042 | Train Acc: 63.32%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.999 | Train Acc: 64.67%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.969 | Train Acc: 65.36%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.941 | Train Acc: 66.19%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.919 | Train Acc: 67.71%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.886 | Train Acc: 68.71%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.863 | Train Acc: 69.97%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.839 | Train Acc: 70.83%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.804 | Train Acc: 71.18%
Epoch: 10 | Epoch Time: 0m 0s
	Train Loss: 0.798 | Train Acc: 72.57%
Epoch: 11 | Epoch Time: 0m 0s
	Train Loss: 0.766 | Train Acc: 73.35%
Epoch: 12 | Epoch Time: 0m 0s
	Train Loss: 0.731 | Train Acc: 75.22%
Epoch: 13 | Epoch Time: 0m 0s
	Train Loss: 0.713 | Train Acc: 75.35%
Epoch: 14 | Epoch Time: 0m 0s
	Train Loss: 0.690 | Train Acc: 76.95%
Epoch: 15 | Epoch Time: 0m 0s
	Tra

In [37]:
test_loss, test_acc, f_score = evaluate(model, test_iterator_sa, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%, F-fscore {f_score:.4f}')

Test Loss: 1.051 | Test Acc: 72.20%, F-fscore 0.7466


  'recall', 'true', average, warn_for)


In [0]:
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    indexed = [TWEET_TEXT.vocab.stoi[t] for t in tokenized]
    print(indexed)
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    value, prediction = torch.max(model(tensor, length_tensor), 0)
    # prediction = model(tensor, length_tensor)
    reverse_key = { 0: 'Positive', 1: 'Negative', 2: 'Neutral'}
    return reverse_key[prediction.item()]
    # return prediction

In [33]:
predict_sentiment(model, "@apple  why don't you guys test your upgrades before you put them out. 15 1/2 hrs u guys wasted of my time yesterday. Thanks.")

'Negative'

In [34]:
predict_sentiment(model, "google sucha brilliant way, sleek design")

'Positive'

In [99]:
predict_sentiment(model, "Saludoss pa To' L@s Twitter@s Nocturn@s.. Jeje.. @elBellacoMusic Nose pero Ahoritaa me Sumbo Mas pal")

[0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4730, 0, 28, 0, 0, 0]


'Positive'

In [98]:
TWEET_TEXT.vocab.itos[4712]

'para'

In [0]:
torch.save(model.state_dict(), '72_percent.pt')

# Predicting the Company

In [40]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all @apple has to do is get swype on the i...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,@apple will be adding more carrier support to ...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious @youtube video guy does a duet with...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,@rim you made it too easy for me to switch to ...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE, min_df=5, max_df=0.7)
X_train = vectorizer.fit_transform(train_data_pd['TweetText']).toarray()
X_test = vectorizer.transform(test_data_pd['TweetText']).toarray()

In [46]:
X.shape

(3413, 982)

In [0]:
tfidfconverter = TfidfTransformer()
X_train = tfidfconverter.fit_transform(X_train).toarray()
X_test = tfidfconverter.transform(X_test).toarray()

In [0]:
y_train = np.array(train_data_pd['Topic'])
y_test = np.array(test_data_pd['Topic'])

In [0]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [0]:
xgb = XGBClassifier(random_state=SEED)
xgb.fit(X_train, y_train)
predictions_xgb = xgb.predict(X_test)

In [0]:
rfc = RandomForestClassifier(n_estimators=1000, random_state=SEED)
rfc.fit(X_train, y_train)
prediction_rfc = rfc.predict(X_test)

In [88]:
print(accuracy_score(prediction_rfc, y_test)*100)
print(f1_score(prediction_rfc, y_test, average='macro'))

82.45614035087719
0.8192035152581152


In [89]:
print(accuracy_score(predictions_xgb, y_test)*100)
print(f1_score(predictions_xgb, y_test, average='macro'))

77.77777777777779
0.7701272075319517


In [0]:
def predict_company(tweet):
    tweet = ''.join([i for i in tweet if i not in punctuation])
    tweet = vectorizer.transform([tweet.lower()]).toarray()
    tweet = tfidfconverter.transform(tweet).toarray()
    prediction = rfc.predict(tweet)
    reverse_company_dict = {1: 'apple',
                            2: 'google',
                            3: 'microsoft',
                            4: 'twitter'}
    # company_dict = {'apple' : 1,
    #          'google' : 2,
    #          'microsoft': 3,
    #          'twitter': 4 }
    return reverse_company_dict[int(prediction)]

In [106]:
predict_company("RT @techinciter: Suppose Microsoft Had Bought Siri? ")

'microsoft'