<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Tutorials/Natural%20Language%20Processing/SNA_Assignemnt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import os
import re

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import numpy as np
import pandas as pd
from torchtext import data

from string import punctuation
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import spacy
nlp = spacy.load('en')

! pip install revtok

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Setting Up SEED's to keep up consistent values

In [0]:
# Setting up custom random seeds,
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

In [3]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
data_location = 'gdrive/My Drive/Colab Notebooks/SNA Lab'
train_file = os.path.join(data_location, 'Train.csv')
test_file = os.path.join(data_location, 'Test.csv')
train_data_pd = pd.read_csv(train_file)
test_data_pd = pd.read_csv(test_file)

In [6]:
train_data_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


### Preprocess Data

In [0]:
class PreprocessingTweet:
    
    def __init__(self, task='Sentiment'):
        self.task = task
        self.sentiment_dict = {'positive' : 0,
                 'negative' : 1,
                 'neutral' : 2,
                 'irrelevant' : 9}

        self.company_dict = {'apple' : 1,
             'google' : 2,
             'microsoft': 3,
             'twitter': 4 }

    def __process_tweets__(self, tweets):
        tweets_ = []
        for tweet in tweets:
            tweet = tweet.lower() # convert text to lower-case
            tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
            if self.task == 'Sentiment':
                tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
            tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
            tweet = ''.join([i for i in tweet if i not in punctuation])
            tweet = ' '.join(word_tokenize(tweet))
            tweets_.append(tweet)
        return pd.Series(tweets_)
    
    def __process_sentiment__(self, sentiments):
        return sentiments.replace(self.sentiment_dict)

    def __process_companies__(self, companies):
        return companies.replace(self.company_dict)

    def fit(self, dataframe):
        dataframe['TweetText'] = self.__process_tweets__(dataframe['TweetText'])
        dataframe['Topic'] = self.__process_companies__(dataframe['Topic'])
        dataframe['Sentiment'] = self.__process_sentiment__(dataframe['Sentiment'])
        return dataframe



# For Sentimental Analysis

In [0]:
preprocessor_sentiment = PreprocessingTweet(task='Sentiment')

In [0]:
train_data_sentiment_pd = preprocessor_sentiment.fit(train_data_pd)
test_data_sentiment_pd = preprocessor_sentiment.fit(test_data_pd)

In [10]:
train_data_sentiment_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all ATUSER has to do is get swype on the i...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,ATUSER will be adding more carrier support to ...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious ATUSER video guy does a duet with AT...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,ATUSER you made it too easy for me to switch t...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [0]:
TWEET_TEXT = data.Field(tokenize='spacy', include_lengths = True)
SENTIMENT_LABEL = data.LabelField(dtype=torch.long)
TWEET_REVERSE = data.ReversibleField(sequential=True, lower=True, include_lengths=True)

In [0]:
sa_train_data_pd = train_data_sentiment_pd[train_data_sentiment_pd['Sentiment'] != 9].loc[:, ['TweetText', 'Sentiment']]
sa_test_data_pd = test_data_sentiment_pd.loc[:, ['TweetText', 'Sentiment']]

In [13]:
sa_train_data_pd.head()

Unnamed: 0,TweetText,Sentiment
0,now all ATUSER has to do is get swype on the i...,0
1,ATUSER will be adding more carrier support to ...,0
2,hilarious ATUSER video guy does a duet with AT...,0
3,ATUSER you made it too easy for me to switch t...,0
4,i just realized that the reason i got into twi...,0


In [0]:
sa_train_data_pd.to_csv('Train_sa.csv', index=False)
sa_test_data_pd.to_csv('Test_sa.csv', index=False)

In [0]:
sa_data_fields = [  ('TweetText', TWEET_TEXT),  ('Sentiment', SENTIMENT_LABEL)]

sa_train_data, sa_test_data = data.TabularDataset.splits(
                                        path = '.',
                                        train = 'Train_sa.csv',
                                        test = 'Test_sa.csv',
                                        format = 'csv',
                                        fields = sa_data_fields,
                                        skip_header = True
                                        )

In [0]:
MAX_VOCAB_SIZE = 25000
TWEET_TEXT.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE) 
SENTIMENT_LABEL.build_vocab(sa_train_data.Sentiment)
TWEET_REVERSE.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)


In [17]:
TWEET_TEXT.vocab.freqs.most_common(10)

[('ATUSER', 1840),
 ('the', 707),
 ('to', 586),
 ('URL', 565),
 ('i', 516),
 ('a', 344),
 ('on', 340),
 ('rt', 333),
 ('is', 326),
 ('for', 325)]

In [0]:
# Create Iterator

BATCH_SIZE = 64

train_iterator_sa, test_iterator_sa = data.BucketIterator.splits(
    (sa_train_data, sa_test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.TweetText),
    sort_within_batch=True,
    device=device
)

# Defining the Model

In [0]:
class RNN(nn.Module):

    def __init__(self, vocab_size, embedded_dimensions, hidden_dimension,
                 output_dimension, n_layers, bidirectional, dropout, pad_index):
        super(RNN, self).__init__()

        self.embedded = nn.Embedding(vocab_size, embedded_dimensions, padding_idx=pad_index)
        self.lstm = nn.LSTM(embedded_dimensions,
                            hidden_dimension,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout)
        self.linear1 = nn.Linear(hidden_dimension * 2, output_dimension)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text, text_length):
        
        embedded = self.embedded(text)
        packed_sequence = nn.utils.rnn.pack_padded_sequence(embedded, text_length)

        packed_output, (hidden, output) = self.lstm(packed_sequence)

        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1))
        fc1 = self.dropout(self.linear1(hidden.squeeze(0)))
        return fc1

## Training Sentimental Analyser

In [0]:
# HyperParamters 

VOCAB_SIZE = len(TWEET_TEXT.vocab)
EMBEDDING_DIMENSIONS = 100
HIDDEN_DIMENSIONS = 256
OUTPUT_DIMENSIONS = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.7
PAD_IDX = TWEET_TEXT.vocab.stoi[TWEET_TEXT.pad_token]


model = RNN(VOCAB_SIZE, EMBEDDING_DIMENSIONS, HIDDEN_DIMENSIONS,  OUTPUT_DIMENSIONS
            , N_LAYERS, BIDIRECTIONAL,DROPOUT, PAD_IDX)

In [21]:
total_parameters = sum(l.numel() for l in model.parameters() if l.requires_grad)
print('Total Paramters : {:,}'.format(total_parameters))

Total Paramters : 2,742,383


In [22]:
# Setting Padding indexes to zero to not to determine sentiment
model.embedded.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIMENSIONS)
print(model.embedded.weight.data)


tensor([[-1.1172e-01, -4.9659e-01,  1.6307e-01,  ...,  1.5903e+00,
         -1.9474e-01, -2.4149e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 7.2888e-01, -7.3363e-01,  1.5624e+00,  ..., -5.5922e-01,
         -4.4799e-01, -6.4758e-01],
        ...,
        [ 9.9998e-01,  1.4538e+00,  3.5335e-02,  ..., -1.6285e+00,
          4.8843e-01,  4.1236e-01],
        [-1.0366e+00, -2.5958e+00, -1.5447e+00,  ...,  2.0455e-03,
          1.1170e+00, -4.3080e-01],
        [ 1.7727e+00,  9.4144e-01,  8.9142e-01,  ...,  1.6193e+00,
          1.6658e+00, -1.3113e+00]])


## Training the model

In [0]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def classification_accuracy(predictions, label):
    value, index = torch.max(torch.softmax(predictions,1), 1)
    correct = (index == label).float()
    acc =   correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    f_score = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, text_length = batch.TweetText
        predictions = model(text, text_length).squeeze(1)
        loss = criterion(predictions, batch.Sentiment)
        acc = classification_accuracy(predictions, batch.Sentiment)
        loss.backward()
        optimizer.step()
        f_score += f1_score(torch.max(predictions, dim=1)[1].cpu().numpy(), batch.Sentiment.cpu().numpy(), average='weighted')
        epoch_loss += loss.item()
        epoch_accuracy += acc.item()

    return epoch_loss / len(iterator), epoch_accuracy / len(iterator), f_score/ len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_accuracy = 0
    model.eval()
    f_score = 0
    with torch.no_grad():
        for batch in iterator:
            text, text_length = batch.TweetText
            # (x, x_lengths), y = batch.TweetText, batch.Sentiment
            # orig_text = TWEET_REVERSE.reverse(x.data)
            # print(orig_text)          
            predictions = model(text, text_length).squeeze(1)
            loss = criterion(predictions, batch.Sentiment)
            acc = classification_accuracy(predictions, batch.Sentiment)
            f_score += f1_score(np.argmax(predictions.cpu().numpy(), axis=1), batch.Sentiment.cpu().numpy(), average='weighted')
            epoch_loss += loss.item()
            epoch_accuracy += acc.item()
    
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator), f_score/ len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 30


for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc, train_f_score = train(model, train_iterator_sa, optimizer, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')


  'recall', 'true', average, warn_for)


Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 1.035 | Train Acc: 62.28%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.988 | Train Acc: 64.71%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.963 | Train Acc: 65.93%


  'precision', 'predicted', average, warn_for)


Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.934 | Train Acc: 65.89%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.938 | Train Acc: 66.93%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.902 | Train Acc: 68.62%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.889 | Train Acc: 68.62%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.859 | Train Acc: 70.27%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.837 | Train Acc: 71.61%
Epoch: 10 | Epoch Time: 0m 0s
	Train Loss: 0.810 | Train Acc: 72.70%
Epoch: 11 | Epoch Time: 0m 0s
	Train Loss: 0.770 | Train Acc: 72.57%
Epoch: 12 | Epoch Time: 0m 0s
	Train Loss: 0.765 | Train Acc: 74.05%
Epoch: 13 | Epoch Time: 0m 0s
	Train Loss: 0.743 | Train Acc: 73.78%
Epoch: 14 | Epoch Time: 0m 0s
	Train Loss: 0.727 | Train Acc: 74.96%
Epoch: 15 | Epoch Time: 0m 0s
	Train Loss: 0.678 | Train Acc: 75.65%
Epoch: 16 | Epoch Time: 0m 0s
	Train Loss: 0.682 | Train Acc: 77.78%
Epoch: 17 | Epoch Time: 0m 0s
	Train Loss: 0.672 | Train Acc: 76.52%
Epoch: 18 | Epoch Time: 0m 0s
	Tra

In [29]:
print(f'Training F-Score: {train_f_score:.4f}')

Training F-Score: 0.8168


In [30]:
test_loss, test_acc, f_score = evaluate(model, test_iterator_sa, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%, F-fscore: {f_score:.4f}')

Test Loss: 0.878 | Test Acc: 75.52%, F-fscore: 0.7616


  'recall', 'true', average, warn_for)


In [0]:
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    indexed = [TWEET_TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    value, prediction = torch.max(model(tensor, length_tensor), 0)
    reverse_key = { 0: 'Positive', 1: 'Negative', 2: 'Neutral', 3: 'Irrelevant'}
    return prediction.item() , reverse_key[prediction.item()]

In [32]:
predict_sentiment(model, "@apple  why don't you guys test your upgrades before you put them out. 15 1/2 hrs u guys wasted of my time yesterday. Thanks.")

(1, 'Negative')

In [33]:
predict_sentiment(model, "google sucha brilliant way, sleek design")

(0, 'Positive')

In [0]:
torch.save(model.state_dict(), 'Sentimental_Analysis.pt')

# Predicting the Company

## Preprocessing for company Prediction

In [0]:
train_data_pd = pd.read_csv(train_file)
test_data_pd = pd.read_csv(test_file)

In [0]:
preprocessor_company_prediction = PreprocessingTweet(task='Company')
train_data_company_pd = preprocessor_company_prediction.fit(train_data_pd)
test_data_company_pd = preprocessor_company_prediction.fit(test_data_pd)

In [37]:
train_data_company_pd.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 21:53:25 +0000 2011,now all apple has to do is get swype on the ip...
1,1,0,Tue Oct 18 21:09:33 +0000 2011,apple will be adding more carrier support to t...
2,1,0,Tue Oct 18 21:02:20 +0000 2011,hilarious youtube video guy does a duet with a...
3,1,0,Tue Oct 18 20:40:10 +0000 2011,rim you made it too easy for me to switch to a...
4,1,0,Tue Oct 18 20:34:00 +0000 2011,i just realized that the reason i got into twi...


In [0]:
vectorizer = CountVectorizer(max_features=MAX_VOCAB_SIZE, min_df=5, max_df=0.7)
X_train = vectorizer.fit_transform(train_data_company_pd['TweetText']).toarray()
X_test = vectorizer.transform(test_data_company_pd['TweetText']).toarray()

In [0]:
tfidfconverter = TfidfTransformer()
X_train = tfidfconverter.fit_transform(X_train).toarray()
X_test = tfidfconverter.transform(X_test).toarray()

In [0]:
y_train = np.array(train_data_company_pd['Topic'])
y_test = np.array(test_data_company_pd['Topic'])

In [0]:
rfc = RandomForestClassifier(n_estimators=1000, random_state=SEED)
rfc.fit(X_train, y_train)
prediction_rfc = rfc.predict(X_test)

In [42]:
print(accuracy_score(prediction_rfc, y_test)*100)
print(f1_score(prediction_rfc, y_test, average='macro'))

81.87134502923976
0.8140346938028314


In [0]:
def predict_company(tweet):
    tweet = ''.join([i for i in tweet if i not in punctuation])
    tweet = vectorizer.transform([tweet.lower()]).toarray()
    tweet = tfidfconverter.transform(tweet).toarray()
    prediction = rfc.predict(tweet)
    reverse_company_dict = {1: 'apple',
                            2: 'google',
                            3: 'microsoft',
                            4: 'twitter'}
    
    return reverse_company_dict[int(prediction)]

In [0]:
pickle.dump(rfc, open('Company_classifier.h5', 'wb'))

In [45]:
predict_company("RT @techinciter: Suppose Microsoft Had Bought Siri? ")

'microsoft'

# Getting Custom Information

## Using a Sentence

In [0]:
def get_sentiment_and_company(text):
    company = predict_company(text)
    value, sentiment = predict_sentiment(model, text)
    print(f'The Text: {text}')
    print(f'Sentiment: {sentiment}')
    print(f'Company: {company}')
    return

In [47]:
get_sentiment_and_company("RT @techinciter: Suppose Microsoft Had Bought Siri? ")

The Text: RT @techinciter: Suppose Microsoft Had Bought Siri? 
Sentiment: Positive
Company: microsoft


## Using a file

In [0]:
name_of_file = test_file

## Getting Sentiment

In [49]:
test_frame = pd.read_csv(name_of_file)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 18:36:46 +0000 2011,RT @JamaicanIdler: Lmao I think @apple is onto...
1,apple,positive,Mon Oct 17 14:36:34 +0000 2011,"Bravo, @Apple! http://t.co/BgoTzj7K"
2,apple,positive,Mon Oct 17 00:23:56 +0000 2011,"Day305, I'm thankful for the great customer se..."
3,apple,positive,Sun Oct 16 22:56:54 +0000 2011,i love this. so much. thank you @apple. http:...
4,apple,positive,Sun Oct 16 16:25:47 +0000 2011,I &lt;3 @apple http://t.co/ondXWpEr


In [50]:
test_frame = preprocessor_sentiment.fit(test_frame)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 18:36:46 +0000 2011,rt ATUSER lmao i think ATUSER is onto somethin...
1,1,0,Mon Oct 17 14:36:34 +0000 2011,bravo ATUSER URL
2,1,0,Mon Oct 17 00:23:56 +0000 2011,day305 im thankful for the great customer serv...
3,1,0,Sun Oct 16 22:56:54 +0000 2011,i love this so much thank you ATUSER URL
4,1,0,Sun Oct 16 16:25:47 +0000 2011,i lt3 ATUSER URL


In [0]:
test_frame.to_csv('temp.csv')

In [0]:
INPUT_TWEET_TEXT = data.Field(tokenize='spacy', include_lengths = True)
INPUT_SENTIMENTAL_LABEL = data.LabelField(dtype=torch.long)

In [0]:
INPUT_TWEET_TEXT.build_vocab(sa_train_data.TweetText, max_size=MAX_VOCAB_SIZE)
INPUT_SENTIMENTAL_LABEL.build_vocab(sa_train_data.Sentiment)

In [0]:
input_data_field = [  ('TweetText', INPUT_TWEET_TEXT),  ('Sentiment', INPUT_SENTIMENTAL_LABEL)]

input_data = data.TabularDataset(path='./temp.csv',
                            format = 'csv',
                            fields = input_data_field,
                            skip_header = True)

In [0]:
input_data_iterator = data.BucketIterator(input_data,
                                          batch_size=BATCH_SIZE,
                                          sort_key=lambda x: len(x.TweetText),
                                          sort_within_batch=True,
                                          device=device)

In [56]:
input_loss, input_acc, input_f_score = evaluate(model, input_data_iterator, criterion)

print(f'Input Loss: {input_loss:.3f} | Input Acc: {input_acc*100:.2f}%, Input F-fscore: {input_f_score:.4f}')

Input Loss: 0.896 | Input Acc: 73.01%, Input F-fscore: 0.8139


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Getting Company Information

In [57]:
test_frame = pd.read_csv(name_of_file)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,apple,positive,Tue Oct 18 18:36:46 +0000 2011,RT @JamaicanIdler: Lmao I think @apple is onto...
1,apple,positive,Mon Oct 17 14:36:34 +0000 2011,"Bravo, @Apple! http://t.co/BgoTzj7K"
2,apple,positive,Mon Oct 17 00:23:56 +0000 2011,"Day305, I'm thankful for the great customer se..."
3,apple,positive,Sun Oct 16 22:56:54 +0000 2011,i love this. so much. thank you @apple. http:...
4,apple,positive,Sun Oct 16 16:25:47 +0000 2011,I &lt;3 @apple http://t.co/ondXWpEr


In [58]:
test_frame = preprocessor_company_prediction.fit(test_frame)
test_frame.head()

Unnamed: 0,Topic,Sentiment,TweetDate,TweetText
0,1,0,Tue Oct 18 18:36:46 +0000 2011,rt jamaicanidler lmao i think apple is onto so...
1,1,0,Mon Oct 17 14:36:34 +0000 2011,bravo apple URL
2,1,0,Mon Oct 17 00:23:56 +0000 2011,day305 im thankful for the great customer serv...
3,1,0,Sun Oct 16 22:56:54 +0000 2011,i love this so much thank you apple URL
4,1,0,Sun Oct 16 16:25:47 +0000 2011,i lt3 apple URL


In [59]:
test_data_x = vectorizer.transform(test_frame['TweetText']).toarray()
test_data_x = tfidfconverter.transform(test_data_x).toarray()
test_data_y = test_frame['Topic']
prediction_test_data = rfc.predict(X_test)

accuracy_company = accuracy_score(prediction_test_data, test_data_y)
f1_company = f1_score(prediction_test_data, test_data_y, average='macro')
print(accuracy_company)
print(f1_company)

0.8187134502923976
0.8140346938028314


# Results

## Results from Data Files: 

In [0]:
results = { 'Source' : ['Training File', 'Test File'], 'Accuracy' : [f'{train_acc*100: .2f}%', f'{test_acc*100: .2f}%'], 'F-Score' : [f'{train_f_score: .4f}', f'{f_score: .4f}']}

In [61]:
results_pd = pd.DataFrame(results)
results_pd

Unnamed: 0,Source,Accuracy,F-Score
0,Training File,78.21%,0.8168
1,Test File,75.52%,0.7616


## Results from Input Files

In [62]:
print(f'File: {name_of_file}')
print(f'Sentiment Analysis \nAccuracy: {input_acc*100:.2f}%')
print(f'F-Score: {input_f_score:.4f}')
print(f'Company Detection:')
print(f'Accuracy: {accuracy_company*100:.2f}%')
print(f'F-Score: {f1_company:.4f}')


File: gdrive/My Drive/Colab Notebooks/SNA Lab/Test.csv
Sentiment Analysis 
Accuracy: 73.01%
F-Score: 0.8139
Company Detection:
Accuracy: 81.87%
F-Score: 0.8140
