# Analysis of Tweets on Generative AI

### Import Packages & Download Dataset

In [None]:
# install tweetnlp
# ! pip install tweetnlp

In [3]:
# import packages
import os
import pandas as pd
from datetime import datetime as dt
import numpy as np
import tweetnlp
import spacy

In [165]:
full_df = pd.read_csv(r'C:\Users\rebri\Documents\Data Projects\gen-ai-tweets\dataset\GenerativeAI tweets.csv')
full_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2023-04-19 21:27:19+00:00,1648800467206672384,From Studio Gangster to Synthetic Gangster 🎤.....,resembleai
1,1,2023-04-19 21:27:09+00:00,1648800425540476929,Took me some time to find this. I build this #...,devaanparbhoo
2,2,2023-04-19 21:26:57+00:00,1648800376479715328,Mind blowing next wave #generativeai platform...,timreha
3,3,2023-04-19 21:26:49+00:00,1648800341193027584,Open Source Generative AI Image Specialist Sta...,VirtReview
4,4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle


In [167]:
len(df)

56221

Due to the size of the dataset, we will select a sample size of 10K tweets to develop the project with.

In [219]:
# select 10,000 random tweets
sample = full_df.sample(n = 6000, random_state = 42)

df = pd.DataFrame(sample)
df.sort_values(by = 'Datetime', ascending = False, inplace = True)
df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
4,4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle
6,6,2023-04-19 21:21:48+00:00,1648799078342426625,Salesforce announces plans to integrate Einste...,annebonnerdata
34,34,2023-04-19 20:36:53+00:00,1648787775481577472,Red White Dots. @openfashion_en #aifashionchal...,wyzborrero
40,40,2023-04-19 20:24:30+00:00,1648784660749508609,Ending our Demo Day with an interactive sessio...,bcgx_
80,80,2023-04-19 19:22:30+00:00,1648769055401324544,Engaged in productive meetings with photograph...,Akash93892149


In [220]:
df.dtypes

Unnamed: 0     int64
Datetime      object
Tweet Id       int64
Text          object
Username      object
dtype: object

To process the data, we will:
1. Drop the unnamed column (duplicate index)
2. Change the datetime column to datetime datatype


In [221]:
df.drop(columns = ['Unnamed: 0'], inplace = True)
df.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle
6,2023-04-19 21:21:48+00:00,1648799078342426625,Salesforce announces plans to integrate Einste...,annebonnerdata
34,2023-04-19 20:36:53+00:00,1648787775481577472,Red White Dots. @openfashion_en #aifashionchal...,wyzborrero
40,2023-04-19 20:24:30+00:00,1648784660749508609,Ending our Demo Day with an interactive sessio...,bcgx_
80,2023-04-19 19:22:30+00:00,1648769055401324544,Engaged in productive meetings with photograph...,Akash93892149


In [222]:
df['Datetime'] = pd.to_datetime(df['Datetime'], format = '%Y-%m-%d %H:%M:%S%z')
df.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle
6,2023-04-19 21:21:48+00:00,1648799078342426625,Salesforce announces plans to integrate Einste...,annebonnerdata
34,2023-04-19 20:36:53+00:00,1648787775481577472,Red White Dots. @openfashion_en #aifashionchal...,wyzborrero
40,2023-04-19 20:24:30+00:00,1648784660749508609,Ending our Demo Day with an interactive sessio...,bcgx_
80,2023-04-19 19:22:30+00:00,1648769055401324544,Engaged in productive meetings with photograph...,Akash93892149


In [223]:
# check datatypes again
df.dtypes

Datetime    datetime64[ns, UTC]
Tweet Id                  int64
Text                     object
Username                 object
dtype: object

Let's check what the text of the tweets look like.

In [224]:
# check full text of a tweet
df.iloc[4, 2]

'Engaged in productive meetings with photography association and research faculty, gathering insights on content provenance challenges in the Generative AI era. #contentprovenance #generativeAI 📷'

I chose to show the above Tweet because it includes a number of interesting features which we should address:
*  Newline characters (\n)
* Link at the end of the Tweet
* Hashtags (#StableLM)
* Mentions (@StabilityAI)

Let's start with removing the newline characters.

In [225]:
df['Text'] = df['Text'].str.replace('\n', '')
df.iloc[4, 2]

'Engaged in productive meetings with photography association and research faculty, gathering insights on content provenance challenges in the Generative AI era. #contentprovenance #generativeAI 📷'

In [226]:
df.iloc[0,2]

"Are you an #HR leader considering which future trends to prioritize? Watch this 3-part series w/ @holgermu &amp; @diginomica co-founder @jonerp analyzing @workday's 2023 #AI &amp; #ML Summit, and broader #GenerativeAI and HR trends shaping the industry. https://t.co/LVJpzkMH9P"

Viewing another tweet reveals that it contains "&amp;", which is the HTML entity code for an ampersand (&). I will also replace this.

In [227]:
df['Text'] = df['Text'].str.replace('&amp;', '&')
df.iloc[0,2]

"Are you an #HR leader considering which future trends to prioritize? Watch this 3-part series w/ @holgermu & @diginomica co-founder @jonerp analyzing @workday's 2023 #AI & #ML Summit, and broader #GenerativeAI and HR trends shaping the industry. https://t.co/LVJpzkMH9P"

Next, I want to see if more Tweets include a link.

In [228]:
# select 5 random tweets and print the full text
tweets = []
for i in range(5):
  x = np.random.randint(0, len(df))
  text = df.iloc[i, 2]
  tweets.append(text)
tweets

["Are you an #HR leader considering which future trends to prioritize? Watch this 3-part series w/ @holgermu & @diginomica co-founder @jonerp analyzing @workday's 2023 #AI & #ML Summit, and broader #GenerativeAI and HR trends shaping the industry. https://t.co/LVJpzkMH9P",
 'Salesforce announces plans to integrate Einstein GPT and data cloud with its flow workflow automation suite. How natural language prompts will trigger actions based on real-time #data insights: #GenerativeAI #automation #analytics @salesforce https://t.co/GO1e5nS6qR',
 'Red White Dots. @openfashion_en #aifashionchallenge #openfashion #AIart #aifashion #midjourneyv5 #midjourney #AIArtwork #FashionTech #GenerativeAI https://t.co/FKexxVikgm',
 'Ending our Demo Day with an interactive session on #GenerativeAI hosted by Colin Jarvis and Nicolai Skabo from @OpenAI 💥. Thanks for joining us 👋.You can learn more about @BCG’s collaboration with #OpenAI here: https://t.co/JMowsGEnN6 https://t.co/rtS2ZDC0cC',
 'Engaged in prod

All five of these Tweets include a link, so it is likely many more do too. I will separate out all links in to a new column

In [229]:
df['Link'] = df['Text'].str.extract(r'(https?://[^\s]+)')
df['Text'] = df['Text'].str.replace(r'https?://[^\s]+|www\.[^\s]+|t\.co/[^\s]+', '', regex=True)
df.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,Link
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle,https://t.co/LVJpzkMH9P
6,2023-04-19 21:21:48+00:00,1648799078342426625,Salesforce announces plans to integrate Einste...,annebonnerdata,https://t.co/GO1e5nS6qR
34,2023-04-19 20:36:53+00:00,1648787775481577472,Red White Dots. @openfashion_en #aifashionchal...,wyzborrero,https://t.co/FKexxVikgm
40,2023-04-19 20:24:30+00:00,1648784660749508609,Ending our Demo Day with an interactive sessio...,bcgx_,https://t.co/JMowsGEnN6
80,2023-04-19 19:22:30+00:00,1648769055401324544,Engaged in productive meetings with photograph...,Akash93892149,


In [230]:
# confirm link is removed
df.iloc[4, 2]

'Engaged in productive meetings with photography association and research faculty, gathering insights on content provenance challenges in the Generative AI era. #contentprovenance #generativeAI 📷'

By checking for null values, we can also see that not *all* Tweets include a link.

In [231]:
# check for null values
df.isnull().sum()

Datetime      0
Tweet Id      0
Text          0
Username      0
Link        853
dtype: int64

Next, I want to copy the hashtags (#) and mentions (@) for each Tweet, so that I may further analyze them.

In [232]:
import re

r1 = "#\w+"
r2 = "@\w+"

df['Hashtag'] = df['Text'].str.findall(r1)
df['Mention'] = df['Text'].str.findall(r2)
df.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username,Link,Hashtag,Mention
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle,https://t.co/LVJpzkMH9P,"[#HR, #AI, #ML, #GenerativeAI]","[@holgermu, @diginomica, @jonerp, @workday]"
6,2023-04-19 21:21:48+00:00,1648799078342426625,Salesforce announces plans to integrate Einste...,annebonnerdata,https://t.co/GO1e5nS6qR,"[#data, #GenerativeAI, #automation, #analytics]",[@salesforce]
34,2023-04-19 20:36:53+00:00,1648787775481577472,Red White Dots. @openfashion_en #aifashionchal...,wyzborrero,https://t.co/FKexxVikgm,"[#aifashionchallenge, #openfashion, #AIart, #a...",[@openfashion_en]
40,2023-04-19 20:24:30+00:00,1648784660749508609,Ending our Demo Day with an interactive sessio...,bcgx_,https://t.co/JMowsGEnN6,"[#GenerativeAI, #OpenAI]","[@OpenAI, @BCG]"
80,2023-04-19 19:22:30+00:00,1648769055401324544,Engaged in productive meetings with photograph...,Akash93892149,,"[#contentprovenance, #generativeAI]",[]


To explore how TweetNLP handles mentions and hashtags, we will use testcases with a couple of the models

In [None]:
# testing with NER model
model = tweetnlp.NER()

testcase = 'For example, @Microsoft and #Google and @Amazon are all big tech companies. So is @IBM'
model.ner(testcase)

In [None]:
# testing with the sentiment model

model = tweetnlp.Sentiment()

# define the 4 testcases
senTest = pd.DataFrame()
testcases = ['I love dogs',
             'I #love dogs',
             'I hate dogs',
             'I #hate dogs']

for string in testcases:
  result = model.sentiment(string, return_probability = True)
  negative, neutral, positive = result['probability'].values()
  # create df for comparing the results
  newdf = pd.DataFrame({
    'Phrase' : [string],
    'Negative' : [negative],
    'Neutral' : [neutral],
    'Positive' : [positive]
  })
  senTest = pd.concat([senTest,newdf], ignore_index = True)



senTest
# create visualization to emphasize?

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,Phrase,Negative,Neutral,Positive
0,I love dogs,0.0116,0.056431,0.931969
1,I #love dogs,0.006855,0.025927,0.967218
2,I hate dogs,0.859892,0.117777,0.022331
3,I #hate dogs,0.920269,0.066798,0.012934


### Load Model & Create Doc

In [None]:
# load medium spacy model
spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")



# doc = nlp(concat_text)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


#### option 1: keep as one doc

In [234]:
text_list = df['Text'].to_list()
concat_text = " ".join(text_list)
nlp.max_length = len(concat_text)
doc = nlp(concat_text)
# runtime: 16 s for 6000
# 33 s for 10000
# 4 m 20 s for 56221

#### option 2: split into two docs (not currently working)

In [145]:
# splitting the dataframe into two halves to process the data without potential memory issues
import math

fhEnd = math.floor(len(df)/2)

fh_list = df.loc[0:fhEnd,'Text'].to_list()
sh_list = df.loc[fhEnd:,'Text'].to_list()

fh_text = " ".join(fh_list)
sh_text = " ".join(sh_list)

In [148]:
# runs in 2m 20s
doc1 = nlp(fh_text)
doc2 = nlp(sh_text)

In [149]:
# combining the tokens of the two docs
combined_tokens = []
for doc in [doc1, doc2]:
    combined_tokens.extend(doc)

# creating a new doc with the concatenated tokens
doc = spacy.tokens.Doc(nlp.vocab, words = [token.text for token in combined_tokens])

#### What are the most common words or phrases about genAI? Are there any recurring topics or named entities?

In [235]:
from collections import Counter

nouns = [token.text.lower() for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == 'NOUN']
word_freq = Counter(nouns)
word_freq.most_common(10)

[('generativeai', 1537),
 ('images', 800),
 ('aiart', 569),
 ('chatgpt', 485),
 ('ai', 470),
 ('art', 388),
 ('courtesy', 340),
 ('technology', 276),
 ('tech', 253),
 ('data', 235)]

NOT NEEDED WITH SAMPLE SIZE DATASET: Because ♥ is not actually a noun, we will remove it from our rankings.

In [236]:
nouns = [token.text.lower() for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ == 'NOUN' and token.text != '♥']
word_freq = Counter(nouns)
word_freq.most_common(10)

[('generativeai', 1537),
 ('images', 800),
 ('aiart', 569),
 ('chatgpt', 485),
 ('ai', 470),
 ('art', 388),
 ('courtesy', 340),
 ('technology', 276),
 ('tech', 253),
 ('data', 235)]

Words like 'images', 'aiart', and 'art' hint at the buzz around AI art generators, such as OpenAI's DALL·E 2, which launched on April 6th, 2022.

### Are there any recurring named entities?

In [237]:
model = tweetnlp.NER()

result_list = []
for tweet in df['Text']:
# for tweet in df.iloc[0:50, 2]:
    result = model.ner(tweet)
    result_list.extend(result)

result_df = pd.DataFrame(result_list)
result_df.head()

#runtime 32 m 46 s

Unnamed: 0,type,entity
0,person,@holgermu
1,corporation,@diginomica
2,person,@jonerp
3,corporation,@workday's
4,event,2023 #AI & #ML Summit


In [238]:
top10_ent = result_df['entity'].value_counts().head(10)
top10_ent

entity
 #StableDiffusion    299
 ChatGPT             225
 #                   194
GenerativeAI         183
@useroldyWarp        163
 @                   159
ChatGPT              134
Google                89
 Pixar                82
Microsoft             71
Name: count, dtype: int64

In [215]:
# using spacy instead of tweetnlp
ner_result = []
for tweet in df['Text']:
# for tweet in df.iloc[0:10, 2]:
    doc = nlp(tweet)
    entities = [(ent.label_, ent.text) for ent in doc.ents]
    ner_result.extend(entities)
result_df = pd.DataFrame(ner_result, columns = ['Label', 'Entity'])
result_df

# top10_ent = result_df['Entity'].value_counts().head(10)
# top10_ent

Unnamed: 0,Label,Entity
0,CARDINAL,3
1,ORG,@holgermu & @diginomica
2,GPE,@jonerp
3,DATE,@workday
4,MONEY,2023 #AI & #ML Summit
...,...,...
29002,DATE,next month
29003,GPE,Spain
29004,MONEY,#AI #
29005,MONEY,#ArtificialIntelligence #


In [None]:
# result = model.ner(concat_text)
# for entity, label in result:
#     print(f'Entity: {entity}, Label: {label}')