In [None]:
import pandas as pd
import re
from emoji import demojize
from html import unescape
from wordsegment import segment
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [None]:
df = pd.read_csv("/content/amtrak_data.csv")
df.shape

(21842, 35)

In [None]:
df1 = df[df['text'].str.contains("Amtrak train")]

In [None]:
df1.shape

(17290, 35)

In [None]:
df1 = df[df['lang'] == 'en']
df1.shape

(21842, 74)

In [None]:
df1 = df1[['id','conversation_id', 'referenced_tweets.replied_to.id',
       'referenced_tweets.retweeted.id', 'referenced_tweets.quoted.id',
       'author_id', 'in_reply_to_user_id', 'retweeted_user_id',
       'quoted_user_id', 'created_at', 'text','source','public_metrics.like_count', 'public_metrics.quote_count',
       'public_metrics.reply_count', 'public_metrics.retweet_count','author.id', 'author.created_at',
       'author.username', 'author.name', 'author.description','author.entities.description.cashtags',
       'author.entities.description.hashtags',
       'author.entities.description.mentions']]

In [None]:
df1['author.id'].nunique()

14456

# Libraries install 

In [None]:
!pip install wordsegment emoji
!pip install bertopic
!pip install flair
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Analysis

In [None]:
def find_retweeted(tweet):
    '''This function will extract the twitter handles of retweed people'''
    return re.findall('(?<=RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

def find_mentioned(tweet):
    '''This function will extract the twitter handles of people mentioned in the tweet'''
    return re.findall('(?<!RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)  

def find_hashtags(tweet):
    '''This function will extract hashtags'''
    return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)  

def find_links(tweet):
    '''This function will extract hashtags'''
    return re.findall('(https?://[^\s]+)', tweet)

In [None]:
def clean_text(tweet):
    www_exp = r'www.[^ ]+'
    http_exp = r'https?[^\s]+'
    text = re.sub('|'.join((www_exp, http_exp)), r'', tweet)
    text = unescape(text)
    text = re.sub(r'@[\w\-]+', r'', text)
    text = re.sub(r'#([\w\-]+)', r'', text) 
    text = demojize(text, delimiters=(' :', ': '))
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\\n', '', text)  # remove newlines
    return text.strip()

In [None]:
df1['retweeted'] = df1.text.apply(find_retweeted)
df1['mentioned'] = df1.text.apply(find_mentioned)
df1['hashtags'] = df1.text.apply(find_hashtags)
df1['links'] = df1.text.apply(find_links)
df1['clean_text'] = df1.text.apply(clean_text)

In [None]:
df1['mentioned'].head(20)

1                               []
2                               []
3                               []
4                               []
5                               []
6                               []
7     [@TheReturn84, @kdwhite1012]
8                               []
9                        [@KRCG13]
10                              []
11                              []
12                              []
14                              []
15                              []
16                              []
17                       [@Amtrak]
18                              []
19                              []
21                              []
22                              []
Name: mentioned, dtype: object

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    tweet = re.sub(r'\\n', '', tweet)  # remove newlines
    return tweet

In [None]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    # tweet_token_list = [word_rooter(word) if '#' not in word else word
    #                     for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [None]:
df1['clean_tweet'] = df1.text.apply(clean_tweet)

In [None]:
df1['clean_tweet']=df['clean_tweet'].fillna("")
df2 = df1.drop_duplicates(subset = "clean_tweet").reset_index()

df2.shape

(3375, 36)

In [None]:
df2.shape

(3375, 36)

In [None]:
docs = df2.clean_tweet

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import hdbscan
import umap
import flair
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer

In [None]:
# we add this to remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
roberta = TransformerDocumentEmbeddings('roberta-base')
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
model = BERTopic(
    embedding_model=sentence_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english', calculate_probabilities=True,
    verbose=True
)
# model = BERTopic(language="english", calculate_probabilities=True, verbose=True)


Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

NameError: ignored

In [None]:
topics, probs = model2.fit_transform(docs)

Batches:   0%|          | 0/106 [00:00<?, ?it/s]

2022-08-16 20:44:10,940 - BERTopic - Transformed documents to Embeddings
2022-08-16 20:44:32,393 - BERTopic - Reduced dimensionality
2022-08-16 20:44:33,581 - BERTopic - Clustered reduced embeddings


In [None]:
model.save("/content/my_model")

In [None]:
model2 = 	BERTopic.load("/content/my_model")

In [None]:
topic_info = model2.get_topic_info()
# topic_info.to_csv("/content/topic.csv")

In [None]:
topic_info

Unnamed: 0,Topic,Count,Name
0,-1,865,-1_truck_dump_train_dump truck
1,0,240,0_train_new_hours_station
2,1,162,1_boy_scouts_scout_boy scouts
3,2,98,2_mendon_mendon missouri_near mendon_near
4,3,83,3_train crash_crash missouri_crash_killed dozens
...,...,...,...
80,79,11,79_derails officials_officials_dead injured_of...
81,80,11,80_el_collides dump_derails missour_dump
82,81,11,81_losangeles_dumptruck la_la socal_losangeles...
83,82,11,82_live_derailment live_live ntsb_updates amtrak


In [None]:
lis = [3,4,5,9,12,15,17,19]
model2.visualize_barchart(lis)

In [None]:
tweet_topic =[]
prob = []
for x in probs:
  y = x.tolist()
  max_index = y.index(max(y))
  tweet_topic.append(max_index)
  prob.append(max(y))

top = pd.DataFrame({'Topic':tweet_topic,
    'Prob':prob})

In [None]:
topic_info.columns,top.columns

(Index(['Topic', 'Count', 'Name'], dtype='object'),
 Index(['Topic', 'Prob'], dtype='object'))

In [None]:
df_new = top.merge(topic_info, how='inner', on='Topic')
df_new

Unnamed: 0,Topic,Prob,Count,Name
0,0,0.262264,240,0_train_new_hours_station
1,0,0.090416,240,0_train_new_hours_station
2,0,0.018861,240,0_train_new_hours_station
3,0,0.052299,240,0_train_new_hours_station
4,0,0.127333,240,0_train_new_hours_station
...,...,...,...,...
3370,72,0.031484,12,72_truck injuries_injuries reported_hitting tr...
3371,72,0.031441,12,72_truck injuries_injuries reported_hitting tr...
3372,72,0.022025,12,72_truck injuries_injuries reported_hitting tr...
3373,72,1.000000,12,72_truck injuries_injuries reported_hitting tr...


In [None]:
df_new = pd.concat([df2,df_new], axis=1, ignore_index=False).reset_index()
df_new1 = df_new[['created_at', 'text','clean_tweet', 'Topic', 'Prob','Name']]
df_new1

Unnamed: 0,created_at,text,clean_tweet,Topic,Prob,Name
0,2022-07-07 23:34:22+00:00,I'm sitting on an Amtrak train and I'm looking...,sitting amtrak train looking sunset clear sky ...,0,0.262264,0_train_new_hours_station
1,2022-07-07 22:58:01+00:00,Multiple lawsuits have been filed in the after...,multiple lawsuits filed aftermath amtrak train...,0,0.090416,0_train_new_hours_station
2,2022-07-07 22:54:15+00:00,Community response in crisis - the small town ...,community response crisis small town helped pe...,0,0.018861,0_train_new_hours_station
3,2022-07-07 22:09:48+00:00,Three died &amp; at least 50 were injured afte...,three died amp least injured #amtrak train co...,0,0.052299,0_train_new_hours_station
4,2022-07-07 21:44:45+00:00,Lawsuit: Amtrak train over capacity before Mis...,lawsuit amtrak train capacity missouri crash,0,0.127333,0_train_new_hours_station
...,...,...,...,...,...,...
3370,2022-06-25 06:24:14+00:00,"step by my coat, while I'm focused, I'm fresh ...",step coat focused fresh like amtrak train dare...,72,0.031484,72_truck injuries_injuries reported_hitting tr...
3371,2022-06-25 03:43:35+00:00,"@lukehop1 @Sydsnap ""A train"" is more common in...",train common areas commuter rail system catch...,72,0.031441,72_truck injuries_injuries reported_hitting tr...
3372,2022-06-25 02:37:18+00:00,But on a Amtrak train,amtrak tra,72,0.022025,72_truck injuries_injuries reported_hitting tr...
3373,2022-06-25 01:49:32+00:00,@TyrannyBanks @SecretaryPete @Amtrak @USDOT @M...,meteor go tampa star amtrak train running cur...,72,1.000000,72_truck injuries_injuries reported_hitting tr...


In [None]:
df_new1.to_csv('/content/tweet_with_topic_v2.csv')

In [None]:
tweets = df2.clean_tweet.to_list()
df2['created_at'] = pd.to_datetime(df2['created_at'])
timestamps = df2.created_at.tolist()
# topics = df_new1.Name.tolist()


In [None]:
# topics

In [None]:
chosen_topic_timeseris = [0,5,7,8,9]
chose_df = df_new1[df_new1['Topic'].isin(chosen_topic_timeseris)]
topics1 = chose_df["Name"].tolist()

In [None]:
# topics1

In [None]:
tweets = chose_df.clean_tweet.to_list()
chose_df['created_at'] = pd.to_datetime(chose_df['created_at'])
timestamps = chose_df.created_at.tolist()
# topics = chose_df.Name.tolist()

In [None]:
chose_df

Unnamed: 0,created_at,text,clean_tweet,Topic,Prob,Name
0,2022-07-07 23:34:22+00:00,I'm sitting on an Amtrak train and I'm looking...,sitting amtrak train looking sunset clear sky ...,0,0.135101,0_train_new_amtrak_hours
1,2022-07-07 22:58:01+00:00,Multiple lawsuits have been filed in the after...,multiple lawsuits filed aftermath amtrak train...,0,0.116632,0_train_new_amtrak_hours
2,2022-07-07 22:54:15+00:00,Community response in crisis - the small town ...,community response crisis small town helped pe...,0,0.029618,0_train_new_amtrak_hours
3,2022-07-07 22:09:48+00:00,Three died &amp; at least 50 were injured afte...,three died amp least injured #amtrak train co...,0,0.081093,0_train_new_amtrak_hours
4,2022-07-07 21:44:45+00:00,Lawsuit: Amtrak train over capacity before Mis...,lawsuit amtrak train capacity missouri crash,0,0.103193,0_train_new_amtrak_hours
...,...,...,...,...,...,...
2584,2022-06-27 21:49:31+00:00,Multiple injuries after Amtrak train hits dump...,multiple injuries amtrak train hits dump truck...,8,0.065025,8_train moving_moving speed_crash ntsb_limit d...
2585,2022-06-27 21:49:04+00:00,New story on NPR: An Amtrak train collides wit...,new story npr amtrak train collides truck dera...,8,0.089641,8_train moving_moving speed_crash ntsb_limit d...
2586,2022-06-27 21:49:03+00:00,"Amtrak train, carrying 243 passengers from LA ...",amtrak train carrying passengers la chicago d...,8,0.063468,8_train moving_moving speed_crash ntsb_limit d...
2587,2022-06-27 21:48:22+00:00,"""At least 50 injured and multiple people kille...",least injured multiple people killed amtrak ...,8,0.028985,8_train moving_moving speed_crash ntsb_limit d...


In [None]:
topics_over_time = model2.topics_over_time(tweets, topics1, timestamps, nr_bins=20)

17it [00:00, 35.31it/s]


In [None]:
model2.visualize_topics_over_time(topics_over_time)

In [None]:
model2.visualize_topics_over_time(topics_over_time)

# Word freq

In [None]:
all_unique_tweets = df1.clean_tweet.unique()
all_unique_tweets[0].split()

['sitting',
 'amtrak',
 'train',
 'looking',
 'sunset',
 'clear',
 'sky',
 'hour',
 'passes',
 'sky',
 'cloudy',
 'sun',
 'look',
 'pretty',
 'crazy',
 'someone',
 'view',
 'pretty',
 'others',
 'think',
 'weather',
 'beautifu']

In [None]:
import collections
import itertools
words_in_tweet = [tweet.split() for tweet in all_unique_tweets]

all_words = list(itertools.chain(*words_in_tweet))

# Create counter
counts_no_urls = collections.Counter(all_words)

In [None]:
counts_no_urls.most_common(30)

[('train', 4857),
 ('amtrak', 4202),
 ('missouri', 2553),
 ('truck', 1611),
 ('derails', 1189),
 ('dump', 1183),
 ('people', 1168),
 ('injured', 1059),
 ('derailed', 902),
 ('killed', 875),
 ('derailment', 747),
 ('dead', 646),
 ('least', 616),
 ('three', 604),
 ('passengers', 585),
 ('multiple', 574),
 ('crossing', 565),
 ('chicago', 549),
 ('crash', 459),
 ('mendon', 426),
 ('injuries', 402),
 ('hitting', 387),
 ('monday', 365),
 ('car', 349),
 ('via', 332),
 ('#amtrak', 330),
 ('hits', 308),
 ('reported', 300),
 ('said', 288),
 ('news', 286)]

In [None]:
word_freq = pd.DataFrame.from_dict(counts_no_urls, orient='index').reset_index()
word_freq = word_freq.rename(columns={'index': "word", 0: "count"})
word_freq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6416 entries, 0 to 6415
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   word    6416 non-null   object
 1   count   6416 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 100.4+ KB


In [None]:
word_freq = word_freq.sort_values(by='count', ascending=False).reset_index()
word_freq = word_freq[['word','count']]
word_freq

Unnamed: 0,word,count
0,train,4857
1,amtrak,4202
2,missouri,2553
3,truck,1611
4,derails,1189
...,...,...
6411,#trainaccidentlawyers,1
6412,#trainaccidentattorneys,1
6413,placement,1
6414,careful,1


In [None]:
word_freq.to_csv('/content/amtrak_wordfreq.csv')

# Geotag

In [None]:
import re
import spacy
nlp = spacy.load("en_core_web_sm")

import numpy as np
df1['OrganizationTag'] = ''
df1['OrganizationMention'] = ''
df1['Geotag'] = ''
df1['GeoMention'] = ''
for index, row in df1.iterrows():
  if(row["hashtags"] != []):
    hashtags = (row["hashtags"])
    print(hashtags)
    hashtags = str(hashtags)[1:-1]
    text = re.findall("#([a-zA-Z0-9_]{1,50})", hashtags)
    text = str(text)[1:-1]
    doc = nlp(text) 
    ents = [(e.text, e.label_) for e in doc.ents]
    geo = []
    org = []
    flag = 0
    for i in ents:
      if(i[1] == "GPE"):
        geo.append(i[0])
        flag = 2
      if(i[1] == "ORG"):
        org.append(i[0])
        flag = 1
    if flag == 2:
      df1.at[index, 'Geotag'] = geo
    if flag == 1:
      df1.at[index, 'OrganizationTag'] = org

['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#stlouis', '#news']
['#CovidIsNotOver', '#MaskUp']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#CallawayMo', '#MidMo']
['#passengertrain', '#StLouis-', '#KCMO', '#AmtrakMidwest']
['#passengertrain', '#StLouis-', '#KCMO', '#AmtrakMidwest']
['#RFootball']
['#passengertrain', '#StLouis-', '#KCMO', '#AmtrakMidwest']
['#passengertrain', '#StLouis-', '#KCMO', '#AmtrakMidwest']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#btv']
['#duvalcounty']
['#quote', '#Travel']
['#quote', '#Travel']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEAToday']
['#JCMO', '#MidMo']
['#Amtrak', '#Mendon', '#Missouri', '#US', '#LosAngeles', '#Chicago', '#SEATod

In [None]:
import numpy as np

df1['GeoMention'] = ''
df1['OrganizationMention'] = ''
for index, row in df1.iterrows():
  if(row["mentioned"] != '[]'):
    hashtags = (row["mentioned"])
    hashtags = str(hashtags)[1:-1]
    text = re.findall("@([a-zA-Z0-9_]{1,50})", hashtags)
    text = str(text)[1:-1]
    doc = nlp(text) 
    ents = [(e.text, e.label_) for e in doc.ents]
    geo = []
    org = []
    flag = 0
    for i in ents:
      if(i[1] == "ORG"):
        org.append(i[0])
        flag = 1
      if(i[1] == "GPE"):
        geo.append(i[0])
        flag = 2
        # print(geo)
    if flag == 2:
      df1.at[index, 'GeoMention'] = geo
    if flag == 1:
      df1.at[index, 'OrganizationMention'] = org

# Sentiment

In [None]:
df1.to_csv("/content/amtrak_data.csv")

In [None]:
df = pd.read_csv("/content/tweet_with_topic.csv")

In [None]:
from emoji import demojize
from html import unescape
from wordsegment import segment
import re
def clean_tweet_senti(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    # tweet = demojize(tweet, delimiters=(' :', ': '))
    # tweet = tweet.lower() # lower case
    # tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = demojize(tweet, delimiters=(' :', ': '))
    tweet = re.sub(r'\\n', '', tweet)  # remove newlines
    # if bigrams:
    #     tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
    #                                         for i in range(len(tweet_token_list)-1)]
    # tweet = ' '.join(tweet_token_list)
    return tweet

In [None]:
df['tweet_senti'] = df.text.apply(clean_tweet_senti)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = "/content/cardiffnlp/twitter-roberta-base-{task}"
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

Downloading config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [None]:
df1 = df.drop_duplicates(subset=['tweet_senti'])
batch_sentences = df1["tweet_senti"].tolist()
positive = []
neutral = []
negative = []
batch_size = 5
num_samples = len(batch_sentences)
num_batches = num_samples / batch_size
num_batches = round(num_batches)
for i in range(num_batches):
    # print(i)
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    if end_idx > num_samples:
        end_idx = num_samples
    x_batch = batch_sentences[start_idx:end_idx]
    encoded_input = tokenizer(x_batch, padding=True, truncation=True, return_tensors="pt")
    output = model(**encoded_input)
    # print(len(output[0]))
    for i in range(0,len(output[0])):
      scores = output[0][i].detach().numpy()
      scores = softmax(scores)
      positive.append(scores[2])
      neutral.append(scores[1])
      negative.append(scores[0])

In [None]:
df1["positive"] = positive
df1["neutral"] = neutral
df1["negative"] = negative
sentiment = ['positive', 'neutral','negative']
for index, row in df1.iterrows():
  # lis = [row['Anger'],row['Depression'],row['Fatigue'],row['Vigour'],row['Tension'],row['Confusion']]
  lis = [row['positive'],row['neutral'],row['negative']]
  max_value = max(lis)
  max_index = lis.index(max_value)
  # max_index = lis.index(max_value)
  df1.at[index, 'Sentiment'] = sentiment[max_index]

  

In [None]:
df1['Sentiment'].value_counts()

negative    3449
neutral     1144
positive     246
Name: Sentiment, dtype: int64

In [None]:
def clean_tweet_emo(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    # tweet = tweet.lower() # lower case
    # tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = demojize(tweet, delimiters=(' :', ': '))
    tweet = re.sub(r'\\n', '', tweet)  # remove newlines
    # if bigrams:
    #     tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
    #                                         for i in range(len(tweet_token_list)-1)]
    # tweet = ' '.join(tweet_token_list)
    return tweet

In [None]:
df1['clean_tweet_emo'] = df1.text.apply(clean_tweet_emo)

In [None]:
df_new.to_csv("/content/tweet_with_topic_senti_emo.csv")

In [None]:
df = pd.read_csv("/content/tweet_with_topic_emo.csv")
df1 = pd.read_csv("/content/tweet_with_topic_senti_emo.csv")
df_new = pd.concat([df1,df], axis=1, ignore_index=False).reset_index()
df_new

Unnamed: 0.3,index,Unnamed: 0,Unnamed: 0.1,created_at,text,clean_tweet,Topic,Prob,Name,tweet_senti,...,Sentiment,clean_tweet_emo,Unnamed: 0.2,Tweet,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,0,0,0,2022-07-07T23:34:22.000Z,I'm sitting on an Amtrak train and I'm looking...,sitting amtrak train looking sunset clear sky ...,0,0.004982,0_like_hours_time_train,I'm sitting on an Amtrak train and I'm looking...,...,negative,I'm sitting on an Amtrak train and I'm looking...,0,I'm sitting on an Amtrak train and I'm looking...,0.247981,0.034915,0.274323,0.288415,0.074412,0.079954
1,1,1,1,2022-07-07T22:58:01.000Z,Multiple lawsuits have been filed in the after...,multiple lawsuits filed aftermath amtrak train...,0,0.129245,0_like_hours_time_train,Multiple lawsuits have been filed in the after...,...,negative,Multiple lawsuits have been filed in the after...,1,Multiple lawsuits have been filed in the after...,0.021269,0.077412,0.821822,0.004486,0.030966,0.044046
2,2,2,2,2022-07-07T22:54:15.000Z,Community response in crisis - the small town ...,community response crisis small town helped pe...,0,0.203460,0_like_hours_time_train,Community response in crisis - the small town ...,...,neutral,Community response in crisis - the small town ...,2,Community response in crisis - the small town ...,0.064493,0.034457,0.103957,0.655608,0.058958,0.082527
3,3,3,3,2022-07-07T22:09:48.000Z,Three died &amp; at least 50 were injured afte...,three died amp least injured #amtrak train co...,0,0.017905,0_like_hours_time_train,Three died &amp; at least 50 were injured afte...,...,negative,Three died &amp; at least 50 were injured afte...,3,Three died &amp; at least 50 were injured afte...,0.077969,0.029546,0.120075,0.344025,0.412177,0.016208
4,4,4,4,2022-07-07T21:44:45.000Z,Lawsuit: Amtrak train over capacity before Mis...,lawsuit amtrak train capacity missouri crash,0,0.076439,0_like_hours_time_train,Lawsuit: Amtrak train over capacity before Mis...,...,negative,Lawsuit: Amtrak train over capacity before Mis...,4,Lawsuit: Amtrak train over capacity before Mis...,0.006282,0.011733,0.098546,0.235400,0.418989,0.229051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4834,4834,4834,4834,2022-06-25T02:35:49.000Z,My Dream Lionel Train: My Own Amtrak Train by ...,dream lionel train amtrak train lego lionel fa...,92,1.000000,92_missouri injured_passengers missouri_missou...,My Dream Lionel Train: My Own Amtrak Train by ...,...,positive,My Dream Lionel Train: My Own Amtrak Train by ...,4834,My Dream Lionel Train: My Own Amtrak Train by ...,0.002771,0.000351,0.014361,0.934068,0.032738,0.015710
4835,4835,4835,4835,2022-06-25T01:52:59.000Z,@AccuPovick @breakingweather I took this awhil...,took awhile amtrak train minnesota small town...,92,1.000000,92_missouri injured_passengers missouri_missou...,I took this awhile I was in on amtrak train i...,...,negative,I took this awhile I was in on amtrak train i...,4835,I took this awhile I was in on amtrak train i...,0.056148,0.003938,0.143192,0.461008,0.201908,0.133806
4836,4836,4836,4836,2022-06-25T01:49:32.000Z,@TyrannyBanks @SecretaryPete @Amtrak @USDOT @M...,meteor go tampa star amtrak train running cur...,92,1.000000,92_missouri injured_passengers missouri_missou...,The Meteor does not go to Tampa. But the Star...,...,neutral,The Meteor does not go to Tampa. But the Star...,4836,The Meteor does not go to Tampa. But the Star...,0.019655,0.005047,0.568081,0.301736,0.099666,0.005816
4837,4837,4837,4837,2022-06-25T01:00:23.000Z,"Just took my first amtrak train up to nyc, it ...",took first amtrak train nyc great,92,1.000000,92_missouri injured_passengers missouri_missou...,"Just took my first amtrak train up to nyc, it ...",...,positive,"Just took my first amtrak train up to nyc, it ...",4837,"Just took my first amtrak train up to nyc, it ...",0.000043,0.000024,0.001135,0.071670,0.000677,0.926450


In [None]:
emotion = ['Anger','Disgust', 'Fear', 'Joy', 'Sadness','Surprise']
# emotion = ['Anger', 'Depression','Fatigue', 'Vigour', 'Tension', 'Confusion']

for index, row in df_new.iterrows():
  # lis = [row['Anger'],row['Depression'],row['Fatigue'],row['Vigour'],row['Tension'],row['Confusion']]
  lis = [row['Anger'],row['Disgust'],row['Fear'],row['Joy'],row['Sadness'],row['Surprise']]
  max_value = max(lis)
  max_index = lis.index(max_value)
  # max_index = lis.index(max_value)
  df_new.at[index, 'Emotion'] = emotion[max_index]
df_new

Unnamed: 0.3,index,Unnamed: 0,Unnamed: 0.1,created_at,text,clean_tweet,Topic,Prob,Name,tweet_senti,...,clean_tweet_emo,Unnamed: 0.2,Tweet,Anger,Disgust,Fear,Joy,Sadness,Surprise,Emotion
0,0,0,0,2022-07-07T23:34:22.000Z,I'm sitting on an Amtrak train and I'm looking...,sitting amtrak train looking sunset clear sky ...,0,0.004982,0_like_hours_time_train,I'm sitting on an Amtrak train and I'm looking...,...,I'm sitting on an Amtrak train and I'm looking...,0,I'm sitting on an Amtrak train and I'm looking...,0.247981,0.034915,0.274323,0.288415,0.074412,0.079954,Joy
1,1,1,1,2022-07-07T22:58:01.000Z,Multiple lawsuits have been filed in the after...,multiple lawsuits filed aftermath amtrak train...,0,0.129245,0_like_hours_time_train,Multiple lawsuits have been filed in the after...,...,Multiple lawsuits have been filed in the after...,1,Multiple lawsuits have been filed in the after...,0.021269,0.077412,0.821822,0.004486,0.030966,0.044046,Fear
2,2,2,2,2022-07-07T22:54:15.000Z,Community response in crisis - the small town ...,community response crisis small town helped pe...,0,0.203460,0_like_hours_time_train,Community response in crisis - the small town ...,...,Community response in crisis - the small town ...,2,Community response in crisis - the small town ...,0.064493,0.034457,0.103957,0.655608,0.058958,0.082527,Joy
3,3,3,3,2022-07-07T22:09:48.000Z,Three died &amp; at least 50 were injured afte...,three died amp least injured #amtrak train co...,0,0.017905,0_like_hours_time_train,Three died &amp; at least 50 were injured afte...,...,Three died &amp; at least 50 were injured afte...,3,Three died &amp; at least 50 were injured afte...,0.077969,0.029546,0.120075,0.344025,0.412177,0.016208,Sadness
4,4,4,4,2022-07-07T21:44:45.000Z,Lawsuit: Amtrak train over capacity before Mis...,lawsuit amtrak train capacity missouri crash,0,0.076439,0_like_hours_time_train,Lawsuit: Amtrak train over capacity before Mis...,...,Lawsuit: Amtrak train over capacity before Mis...,4,Lawsuit: Amtrak train over capacity before Mis...,0.006282,0.011733,0.098546,0.235400,0.418989,0.229051,Sadness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4834,4834,4834,4834,2022-06-25T02:35:49.000Z,My Dream Lionel Train: My Own Amtrak Train by ...,dream lionel train amtrak train lego lionel fa...,92,1.000000,92_missouri injured_passengers missouri_missou...,My Dream Lionel Train: My Own Amtrak Train by ...,...,My Dream Lionel Train: My Own Amtrak Train by ...,4834,My Dream Lionel Train: My Own Amtrak Train by ...,0.002771,0.000351,0.014361,0.934068,0.032738,0.015710,Joy
4835,4835,4835,4835,2022-06-25T01:52:59.000Z,@AccuPovick @breakingweather I took this awhil...,took awhile amtrak train minnesota small town...,92,1.000000,92_missouri injured_passengers missouri_missou...,I took this awhile I was in on amtrak train i...,...,I took this awhile I was in on amtrak train i...,4835,I took this awhile I was in on amtrak train i...,0.056148,0.003938,0.143192,0.461008,0.201908,0.133806,Joy
4836,4836,4836,4836,2022-06-25T01:49:32.000Z,@TyrannyBanks @SecretaryPete @Amtrak @USDOT @M...,meteor go tampa star amtrak train running cur...,92,1.000000,92_missouri injured_passengers missouri_missou...,The Meteor does not go to Tampa. But the Star...,...,The Meteor does not go to Tampa. But the Star...,4836,The Meteor does not go to Tampa. But the Star...,0.019655,0.005047,0.568081,0.301736,0.099666,0.005816,Fear
4837,4837,4837,4837,2022-06-25T01:00:23.000Z,"Just took my first amtrak train up to nyc, it ...",took first amtrak train nyc great,92,1.000000,92_missouri injured_passengers missouri_missou...,"Just took my first amtrak train up to nyc, it ...",...,"Just took my first amtrak train up to nyc, it ...",4837,"Just took my first amtrak train up to nyc, it ...",0.000043,0.000024,0.001135,0.071670,0.000677,0.926450,Surprise


In [None]:
df_new['Emotion'].value_counts()

Joy         2902
Fear         648
Sadness      633
Surprise     508
Disgust       81
Anger         67
Name: Emotion, dtype: int64