# Generating a Dataset for Sentiment Analysis of Tweets

In [1]:
!pip install snscrape

Collecting snscrape
  Downloading snscrape-0.6.2.20230320-py3-none-any.whl (71 kB)
     ---------------------------------------- 71.8/71.8 kB 1.3 MB/s eta 0:00:00
Collecting lxml
  Downloading lxml-4.9.2-cp311-cp311-win_amd64.whl (3.8 MB)
     ---------------------------------------- 3.8/3.8 MB 3.9 MB/s eta 0:00:00
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
     -------------------------------------- 143.0/143.0 kB 4.3 MB/s eta 0:00:00
Collecting soupsieve>1.2
  Downloading soupsieve-2.4-py3-none-any.whl (37 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: soupsieve, PySocks, lxml, beautifulsoup4, snscrape
Successfully installed PySocks-1.7.1 beautifulsoup4-4.12.2 lxml-4.9.2 snscrape-0.6.2.20230320 soupsieve-2.4



[notice] A new release of pip available: 22.3.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Scraping the data from Twitter using `snscrape`

In [6]:
import snscrape.modules.twitter as sntwitter

# Twitter search expression: account filter plus an until/since date window.
query = "(from:ANI) until:2023-01-12 since:2013-01-08"
tweets = []
# Maximum number of tweets to collect.
limit = 10

for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    # `>=` (rather than `==`) also stops the loop if `tweets` was pre-filled past the limit.
    if len(tweets) >= limit:
        break
    # `rawContent` is the current attribute name; the old `content` alias is
    # deprecated in this snscrape release and warns on every access.
    tweets.append([tweet.lang, tweet.rawContent])

df = pd.DataFrame(tweets, columns=['Lang', 'Tweet'])
df.head()

  tweets.append([tweet.lang, tweet.content])


Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


In [7]:
# (rows, columns) of the scraped frame
df.shape

(10, 2)

## Filtering English tweets from the dataset

In [8]:
# List the distinct language codes present in the dataset
df['Lang'].unique()

array(['en'], dtype=object)

In [9]:
# Keep only the rows whose language code is 'en'.
# The explicit .copy() makes `df` an independent frame, so the later assignment
# to df["Tweet"] does not operate on a slice of the unfiltered data
# (the situation pandas reports with SettingWithCopyWarning).
df = df.loc[df['Lang'] == 'en'].copy()
df.head()

Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


In [10]:
# Language codes remaining after the filter
df['Lang'].unique()

array(['en'], dtype=object)

In [11]:
# (rows, columns) after the language filter
df.shape

(10, 2)

## Replacing links and user mentions in the tweets with placeholders

In [12]:
# Function that masks the links and the mentioned users in a tweet
def removeLinkAndUser(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces only. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens (and the spacing between them)
    are kept as they are.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with mentions and links replaced.
    """
    def _mask(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [14]:
# Renumber the rows from 0, then take the tweet at row label 1 as a fixed example
df.reset_index(drop=True, inplace=True)
sampleTweet = df.loc[1, 'Tweet']
sampleTweet

"I'd like to appreciate Ambassador Katherine Tai &amp; US Secretary of Commerce Gina Raimondo for the leadership that the US &amp; the two have personally given to the IPEF dialogue. India stands solidly with the United States in wanting to make that happen: Union Minister Piyush Goyal"

In [15]:
# Run the sample tweet through removeLinkAndUser; this tweet contains no
# mentions or links, so the text comes back unchanged
removeLinkAndUser(sampleTweet)

"I'd like to appreciate Ambassador Katherine Tai &amp; US Secretary of Commerce Gina Raimondo for the leadership that the US &amp; the two have personally given to the IPEF dialogue. India stands solidly with the United States in wanting to make that happen: Union Minister Piyush Goyal"

### Applying this to all the tweets in the dataframe

In [16]:
# Mask mentions and links in every tweet and write the result back to the Tweet column
cleaned_tweets = df["Tweet"].apply(removeLinkAndUser)
df["Tweet"] = cleaned_tweets
df.head()

Unnamed: 0,Lang,Tweet
0,en,Tripura Chief Minister Manik Saha inspected th...
1,en,I'd like to appreciate Ambassador Katherine Ta...
2,en,The ground rules of IPEF were laid out very we...
3,en,We also discussed progress of Indo-Pacific Eco...
4,en,I have had a chance to spend time with US Comm...


# VADER Sentiment Scoring
* VADER (Valence Aware Dictionary and sEntiment Reasoner) is a rule-based sentiment analysis tool designed to extract polarity (positive, negative, or neutral) from text data.
* It uses a lexicon of sentiment-related words and phrases along with a set of rules to determine the sentiment of a given text.

## Limitations of VADER
* Limited domain-specific knowledge
* Does not account for the relationships between words
* Over-reliance on the lexicon (a list of words and phrases labeled with their polarity — positive, negative, or neutral — based on their commonly perceived sentiment)
* Inability to capture complex emotions
* Lack of understanding of sarcasm and irony
* Difficulty with non-text data

In [28]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
# Fetch the 'vader_lexicon' resource through nltk's downloader
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abirp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [29]:
# Create an instance of SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [30]:
# Example: polarity scores for a positive sentence
sia.polarity_scores("I am very happy")

{'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6115}

In [31]:
# Example: polarity scores for a negative sentence
sia.polarity_scores("I am very sad")

{'neg': 0.629, 'neu': 0.371, 'pos': 0.0, 'compound': -0.5256}

In [32]:
# Compute the VADER polarity scores for every tweet and collect them in `res`,
# keyed by the tweet's 1-based position; each score name gets a "vader_" prefix.
res = {}
for myid, mytweet in enumerate(tqdm(df['Tweet'], total=len(df)), start=1):
    vader_result = sia.polarity_scores(mytweet)
    res[myid] = {f"vader_{key}": value for key, value in vader_result.items()}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 538.93it/s][A


In [33]:
# Build a frame of the VADER scores: one row per tweet id, one column per score
vader_by_id = pd.DataFrame(res)
sentimentScoresVader = vader_by_id.transpose()
sentimentScoresVader.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound
1,0.0,1.0,0.0,0.0
2,0.0,0.846,0.154,0.7906
3,0.0,0.947,0.053,0.3384
4,0.0,0.94,0.06,0.4215
5,0.0,0.955,0.045,0.25


# Hugging Face *Method* (Roberta Pretrained Model)

In [34]:
!pip install transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax




[notice] A new release of pip available: 22.3.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
# Load a pre-trained Twitter sentiment model with the Hugging Face Transformers library.
task = 'sentiment'
MODEL = "cardiffnlp/twitter-roberta-base-" + task
# The tokenizer converts raw text into the input format the model can process.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

### Now let's take an example to get the sentiment score using the RoBERTa pretrained model

In [36]:
def polarity_score(tweet):
    """Return the model's sentiment scores for a single tweet.

    The text is tokenized into PyTorch tensors with the module-level
    ``tokenizer`` and passed through the module-level ``model``. The first row
    of the model's first output is detached, converted to a NumPy array and
    normalised with softmax.

    Parameters
    ----------
    tweet : str
        Text to score.

    Returns
    -------
    dict
        ``roberta_neg``, ``roberta_neu`` and ``roberta_pos`` mapped to the
        softmax values at positions 0, 1 and 2 respectively.
    """
    model_inputs = tokenizer(tweet, return_tensors='pt')
    first_row = model(**model_inputs)[0][0]
    probabilities = softmax(first_row.detach().numpy())
    label_names = ("roberta_neg", "roberta_neu", "roberta_pos")
    return {name: probabilities[position] for position, name in enumerate(label_names)}

In [38]:
# Score the tweet at row label 1 as a quick check
polarity_score(df['Tweet'][1])

{'roberta_neg': 0.0032028793,
 'roberta_neu': 0.1253697,
 'roberta_pos': 0.8714274}

### Now computing it for all the tweets

In [39]:
# Score every tweet with the RoBERTa model; ids are the tweets' 1-based positions in df.
res = {}
for myid, mytweet in enumerate(tqdm(df['Tweet'], total=len(df)), start=1):
    try:
        res[myid] = polarity_score(mytweet)
    except RuntimeError:
        print(f"Broke at id {myid}")
        # Keep a NaN placeholder for the failed tweet. Without it the id would be
        # missing from `res`, and once the index is reset and the frames are
        # concatenated column-wise, every following score would be paired with
        # the wrong tweet.
        res[myid] = {
            "roberta_neg": float("nan"),
            "roberta_neu": float("nan"),
            "roberta_pos": float("nan"),
        }


  0%|                                                                                            | 0/10 [00:00<?, ?it/s][A
 10%|████████▍                                                                           | 1/10 [00:00<00:01,  6.54it/s][A
 20%|████████████████▊                                                                   | 2/10 [00:00<00:01,  4.79it/s][A
 30%|█████████████████████████▏                                                          | 3/10 [00:00<00:01,  4.17it/s][A
 40%|█████████████████████████████████▌                                                  | 4/10 [00:00<00:01,  3.98it/s][A
 50%|██████████████████████████████████████████                                          | 5/10 [00:01<00:01,  3.96it/s][A
 60%|██████████████████████████████████████████████████▍                                 | 6/10 [00:01<00:00,  4.13it/s][A
 70%|██████████████████████████████████████████████████████████▊                         | 7/10 [00:01<00:00,  4.14it/s][A
 80%|██

In [40]:
# Build a frame of the RoBERTa scores: one row per tweet id, one column per score
roberta_by_id = pd.DataFrame(res)
sentimentScoresRoberta = roberta_by_id.transpose()
sentimentScoresRoberta.head()

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
1,0.014746,0.928273,0.05698
2,0.003203,0.12537,0.871427
3,0.001249,0.041221,0.95753
4,0.005952,0.873916,0.120132
5,0.00254,0.786259,0.2112


## We now have sentiment scores from two methods, so let's merge them
* VADER method
* RoBERTa method

In [41]:
# Renumber the rows from 0 so the frame lines up with the score frames when
# they are concatenated column-wise. (The previous leading `df.head()` was
# dropped: only a cell's last expression is displayed, so its value was discarded.)
df = df.reset_index(drop=True)
df.shape

(10, 2)

In [42]:
# Replace the 1-based tweet ids with a 0-based index so the rows line up with `df`.
# (The previous leading `.head()` call was dropped: its value was never displayed.)
sentimentScoresVader = sentimentScoresVader.reset_index(drop=True)
sentimentScoresVader.shape

(10, 4)

In [43]:
# Replace the 1-based tweet ids with a 0-based index so the rows line up with `df`.
# (The previous leading `.head()` call was dropped: its value was never displayed.)
sentimentScoresRoberta = sentimentScoresRoberta.reset_index(drop=True)
sentimentScoresRoberta.shape

(10, 3)

In [44]:
# Place the VADER scores, the RoBERTa scores and the tweets side by side.
# axis=1 aligns rows on the index and join='inner' keeps only index labels
# present in all three frames. (The previous mid-cell `.head()` call was
# dropped: its value was never displayed.)
df_intermediate = pd.concat([sentimentScoresVader, sentimentScoresRoberta, df], axis=1, join='inner')
df_intermediate.shape

(10, 9)

## Since we are following supervised learning, we need to label the tweets
We label the tweets using a pretrained sentiment-analysis model from Hugging Face.

In [45]:
# Column names of the merged frame
df_intermediate.columns

Index(['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'roberta_neg',
       'roberta_neu', 'roberta_pos', 'Lang', 'Tweet'],
      dtype='object')

In [46]:
# Preview the first rows of the merged frame
df_intermediate.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Lang,Tweet
0,0.0,1.0,0.0,0.0,0.014746,0.928273,0.05698,en,Tripura Chief Minister Manik Saha inspected th...
1,0.0,0.846,0.154,0.7906,0.003203,0.12537,0.871427,en,I'd like to appreciate Ambassador Katherine Ta...
2,0.0,0.947,0.053,0.3384,0.001249,0.041221,0.95753,en,The ground rules of IPEF were laid out very we...
3,0.0,0.94,0.06,0.4215,0.005952,0.873916,0.120132,en,We also discussed progress of Indo-Pacific Eco...
4,0.0,0.955,0.045,0.25,0.00254,0.786259,0.2112,en,I have had a chance to spend time with US Comm...


In [47]:
from transformers import pipeline

# Name the model and revision explicitly instead of relying on the pipeline's
# default. These are the values the library reported it fell back to when no
# model was supplied, so the labels stay the same and the run is reproducible.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.

Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████| 629/629 [00:00<00:00, 40.5kB/s][A
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

Downloading pytorch_model.bin:   0%|                                                         | 0.00/268M [00:00<?, ?B/s][A
Downloading pytorch_model.bin:   4%|█▉                                              | 10.5M/268M [00:02<01:06, 3.88MB/s][A
Downloading pytorch_model.bin:   8%|███▊                                            | 

In [48]:
# Label every tweet with the sentiment pipeline; ids are the tweets' 1-based positions in df.
res = {}
for myid, mytweet in enumerate(tqdm(df['Tweet'], total=len(df)), start=1):
    try:
        # The pipeline returns a list with one prediction dict per input text.
        prediction = sent_pipeline(mytweet)[0]
        res[myid] = {'Tag': prediction['label'], 'Score': prediction['score']}
    except RuntimeError:
        print(f"Broke at id {myid}")
        # Keep a placeholder for the failed tweet. Without it the id would be
        # missing from `res`, and once the index is reset and the frames are
        # concatenated column-wise, every following label would be paired with
        # the wrong tweet.
        res[myid] = {'Tag': None, 'Score': float('nan')}



  0%|                                                                                            | 0/10 [00:00<?, ?it/s][A
 20%|████████████████▊                                                                   | 2/10 [00:00<00:00, 11.77it/s][A
 40%|█████████████████████████████████▌                                                  | 4/10 [00:00<00:00, 10.19it/s][A
 60%|██████████████████████████████████████████████████▍                                 | 6/10 [00:00<00:00,  9.44it/s][A
 80%|███████████████████████████████████████████████████████████████████▏                | 8/10 [00:00<00:00,  9.29it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.72it/s][A


In [49]:
# Build a frame of the pipeline's labels: one row per tweet id with its Tag and Score.
# After the transpose the Score column is cast to float.
labels_by_id = pd.DataFrame(res)
LabledData = labels_by_id.transpose()
LabledData["Score"] = LabledData["Score"].astype(float)
LabledData.head()

Unnamed: 0,Tag,Score
1,POSITIVE,0.564948
2,POSITIVE,0.999794
3,POSITIVE,0.999485
4,POSITIVE,0.956361
5,POSITIVE,0.984511


In [50]:
# Replace the 1-based tweet ids with a 0-based index so the rows line up with df_intermediate.
# (The previous leading `.head()` call was dropped: its value was never displayed.)
LabledData = LabledData.reset_index(drop=True)
LabledData.shape

(10, 2)

In [51]:
# Append the Tag/Score label columns to the merged score frame (rows matched on the index)
frames_to_join = [df_intermediate, LabledData]
df_final = pd.concat(frames_to_join, axis=1, join='inner')
df_final.shape

(10, 11)

In [52]:
# Preview the first rows of the final labeled dataset
df_final.head()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Lang,Tweet,Tag,Score
0,0.0,1.0,0.0,0.0,0.014746,0.928273,0.05698,en,Tripura Chief Minister Manik Saha inspected th...,POSITIVE,0.564948
1,0.0,0.846,0.154,0.7906,0.003203,0.12537,0.871427,en,I'd like to appreciate Ambassador Katherine Ta...,POSITIVE,0.999794
2,0.0,0.947,0.053,0.3384,0.001249,0.041221,0.95753,en,The ground rules of IPEF were laid out very we...,POSITIVE,0.999485
3,0.0,0.94,0.06,0.4215,0.005952,0.873916,0.120132,en,We also discussed progress of Indo-Pacific Eco...,POSITIVE,0.956361
4,0.0,0.955,0.045,0.25,0.00254,0.786259,0.2112,en,I have had a chance to spend time with US Comm...,POSITIVE,0.984511


In [53]:
# Language codes present in the final dataset
df_final['Lang'].unique()

array(['en'], dtype=object)

In [54]:
# Summary statistics of the numeric score columns
df_final.describe()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Score
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,0.8867,0.1133,0.53909,0.003539,0.325137,0.671324,0.950191
std,0.0,0.081776,0.081776,0.308584,0.004215,0.384543,0.387698,0.13606
min,0.0,0.745,0.0,0.0,0.00081,0.019155,0.05698,0.564948
25%,0.0,0.82425,0.0545,0.3388,0.001321,0.043922,0.321074,0.987862
50%,0.0,0.9045,0.0955,0.55115,0.002268,0.090103,0.907237,0.999501
75%,0.0,0.9455,0.17575,0.81085,0.003037,0.676416,0.954937,0.999764
max,0.0,1.0,0.255,0.9001,0.014746,0.928273,0.980034,0.999874


In [55]:
# Data type of each column in the final dataset
df_final.dtypes

vader_neg         float64
vader_neu         float64
vader_pos         float64
vader_compound    float64
roberta_neg       float32
roberta_neu       float32
roberta_pos       float32
Lang               object
Tweet              object
Tag                object
Score             float64
dtype: object

In [57]:
# Write the final dataset to CSV without the index column.
# (Plain string literal: the previous f-string had no placeholders.)
df_final.to_csv("TwitterData.csv", index=False)