<a href="https://colab.research.google.com/github/rsb3git/AI_Sentiment-Twitter_Analysis/blob/main/AI_Sentiment_CodeFile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and Import

In [None]:
# Fetch the helper repository; later cells install its requirements.txt and import its `scripts` package.
!git clone https://github.com/zlisto/social_media_analytics

import os
# Work from inside the clone so relative paths (requirements.txt, scripts/) resolve.
os.chdir("social_media_analytics")

In [None]:
# Install the packages listed in requirements.txt (resolved relative to the current working directory).
!pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import gensim.downloader as api
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import sklearn.cluster as cluster
from sklearn import metrics
from scipy import stats

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import pyLDAvis
import pyLDAvis.lda_model
pyLDAvis.enable_notebook()

import scripts.TextAnalysis as ta
from scripts.api import *

In [None]:
import umap
import tensorflow as tf
# NOTE(review): the name `tf` is rebound to a timestamp in the later
# "Add Created At Datetime Column" cell, so this module alias does not survive a full run.
tf.test.gpu_device_name()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import codecs  # lets us display tweets properly (emojis, etc.)
# Show full text columns instead of truncating them in DataFrame displays.
pd.set_option("display.max_colwidth", None)

In [None]:
%%time
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Mounting Drive

Code for mounting and unmounting your Google drive so you can access Arpita's DB files

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the database paths used below point into it.
drive.mount('/content/drive')

In [None]:
#drive.flush_and_unmount()

  and should_run_async(code)


After mounting your drive, run this code to access the DB

In [None]:
# Read the 'keyword_tweets' table from the database file on the mounted Drive.
fname_db = '/content/drive/MyDrive/Social Media Analytics/AItweets_v2'
df = DB.fetch(path=fname_db, table_name='keyword_tweets')

# Raw tweet count; reported again after cleaning.
n = len(df)
print(f"{n} tweets")

df.head(2)

**CLEAN TEXT DATA**

In [None]:
#clean text data

# Clean every tweet with the helper from scripts.TextAnalysis.
df['text_clean'] = df.text.apply(ta.clean_tweet)

# Drop tweets whose cleaned text is empty. The explicit .copy() makes `df` an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning on a filtered slice.
df = df[df.text_clean.str.len() > 0].copy()
nclean = len(df)
print(f"{n} tweets, {nclean} clean tweets")

# Show a few random rows; capped so a very small dataset does not raise.
df.sample(n=min(5, nclean))

# Wordclouds, frequency, likes, retweets, tweet rate

## Wordcloud around keyword tweets

In [None]:
#Add Created At Datetime Column and Sort
# Named `datetime_format` (not `format`) so the builtin format() is not shadowed.
datetime_format = "%Y-%m-%d %H:%M:%S"
df['created_at_datetime'] = pd.to_datetime(df['created_at'], format=datetime_format).dt.tz_localize(None)

# Chronological order; the time-based rolling windows in later cells operate on this column.
# Reassigning instead of inplace=True avoids mutating a possibly-sliced frame.
df = df.sort_values(by='created_at_datetime', ascending=True)

#define initial and final time variables (for plot axis limits)
ti = df.created_at_datetime.values[0]
# NOTE(review): `tf` rebinds the tensorflow alias imported earlier in the notebook;
# the name is kept because the tweet-rate cell reads `tf` for its x-axis limit.
tf = df.created_at_datetime.values[-1]

print(f"Head: {ti}")
print(f"Tail: {tf}")

In [None]:
# Words hidden from the cloud: the library defaults, two Spanish fillers and the
# search keywords themselves.
stopwords = set(STOPWORDS)
stopwords.update([
    "de", "que",
    "chatgpt", "ai",
    # WordCloud compares stopwords against single tokens, so the phrase
    # "artificial intelligence" only disappears if each word is listed.
    "artificial", "intelligence", "artificial intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])

# One lower-cased string containing every cleaned tweet.
text = ' '.join(df.text_clean.tolist()).lower()
wordcloud = WordCloud(stopwords=stopwords, max_font_size=150,
                      max_words=100,
                      background_color="black",
                      width=1000,
                      height=600)

wordcloud.generate(text)

#visualize word cloud
fig = plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Keyword tweets vs like count and retweet count

In [None]:
#keywords and tweets graph

# Display label -> case-insensitive regular expression.
# The previous list used expressions such as `'AI' or '"Artificial Intelligence"'`;
# Python evaluates that to the first string only, so the alternative spellings
# were never searched. \b stops "AI" from matching inside words such as "said".
keyword_patterns = {
    'AI': r'\bAI\b|Artificial Intelligence',
    'Dall-E': r'Dall-?E',
    'ChatGPT': r'ChatGPT',
    'Midjourney': r'Midjourney',
    'GPT-4': r'GPT-?4',
}
# Same labels, in the same order, that later cells iterate over.
keywords = list(keyword_patterns)

# A tweet matching several keywords keeps the label of the last one that matched.
df['keyword'] = None
for keyword, pattern in keyword_patterns.items():
    ind = df.text.str.contains(pattern, case=False, regex=True, na=False)
    df[f'keyword_{keyword}'] = ind
    df.loc[ind, 'keyword'] = keyword
    print(f"{keyword}: {int(ind.sum())} tweets ")

print(f"{len(df)} total tweets")

# Both panels share one figure (previously each chart sat in its own figure
# next to an empty subplot slot).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

sns.barplot(data=df, x='keyword', y='like_count', color='darkorange', ax=ax1)
ax1.set_ylabel("Like Count")
ax1.set_xlabel("Keyword")
ax1.set_title("Keywords vs Like Count", fontsize=20)
ax1.grid()

sns.barplot(data=df, x='keyword', y='retweet_count', color='blue', ax=ax2)
ax2.set_ylabel("Retweet Count")
ax2.set_xlabel("Keyword")
ax2.set_title("Keywords vs Retweet Count", fontsize=20)
ax2.grid()

plt.show()

## Keyword frequency

In [None]:
#keyword frequency

# df['keyword'] is already populated by the keyword/engagement cell above, so it
# is not recomputed here; this cell only counts tweets per label.
ax = sns.countplot(data=df, x='keyword')
ax.grid()
ax.set_xlabel("Keyword")
ax.set_ylabel("Frequency")
ax.set_title("Keyword Frequency", fontsize=20)
# Rotate the tick labels without re-setting them (set_xticklabels on an axis
# without fixed ticks emits a matplotlib warning).
ax.tick_params(axis='x', rotation=90)
plt.show()

## Keyword tweet rate

In [None]:
#keyword tweet rate
# One line colour per keyword, paired by position.
colors = ['red','blue','orange','green','cyan']
fig = plt.figure(figsize=(12, 4))

for color, keyword in zip(colors, keywords):
    rate_column = f'rate_1D_{keyword}'

    # Tweets mentioning this keyword, each counted as 1.
    df_plot = df[df.text.str.contains(keyword, case=False)].copy()
    df_plot['tweet_indicator'] = np.ones(len(df_plot))

    # Trailing one-day sum of the indicator = tweets in the last day.
    one_day_window = df_plot.rolling('1D', on='created_at_datetime')
    df_plot[rate_column] = one_day_window['tweet_indicator'].sum()

    sns.lineplot(data=df_plot,
                 x='created_at_datetime',
                 y=rate_column,
                 label=keyword,
                 linewidth=1,
                 color=color)

plt.grid()
plt.xlabel("Time")
plt.ylabel("Rate [tweets/day]")
plt.xlim(ti, tf)
#plt.ylim([0,30])

plt.legend()
plt.show()

## Peak rate and tweets

In [None]:
#peak tweet rate and tweets at peak

df['tweet_indicator'] = np.ones(len(df))

# Select the indicator column BEFORE aggregating: summing the whole frame first
# tries to aggregate every column (and fails on text columns in recent pandas).
# This matches the form used in the per-keyword rate cell.
df['rate_1D'] = df.rolling('1D', on='created_at_datetime')['tweet_indicator'].sum()
df['rate_7D'] = df.rolling('7D', on='created_at_datetime')['tweet_indicator'].sum() / 7

# First timestamp at which the one-day rate reaches its maximum.
tpeak = df.loc[df.rate_1D == df.rate_1D.max(), 'created_at_datetime'].values[0]
print(f"Peak tweet rate on {tpeak}")

# Time-based rolling windows are closed on the right by default, i.e. the peak
# value counts tweets in (tpeak - 1 day, tpeak]. Use the same bounds so the
# tweet posted at the peak itself is included.
t0 = tpeak - np.timedelta64(24, 'h')
t1 = tpeak
df1 = df[(df.created_at_datetime > t0) & (df.created_at_datetime <= t1)]
df1[['text']]

In [None]:
#wordcloud of peak tweets

# Library stopwords plus two Spanish fillers and the search keywords.
stopwords = set(STOPWORDS)
stopwords.update([
    "de", "que",
    "chatgpt", "ai",
    # The old entry '"artificial intelligence"' (with literal quote characters)
    # never matched: stopwords are compared against single tokens.
    "artificial", "intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])

# All cleaned tweets from the peak window, lower-cased and joined.
text = ' '.join(df1.text_clean.tolist()).lower()
wordcloud = WordCloud(stopwords=stopwords, max_font_size=150, max_words=100,
                      background_color="white", width=1000, height=600)
wordcloud.generate(text)

fig = plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Top retweeted tweets

In [None]:
ndisplay = 10  #number of tweets to display

# Keep original tweets only. startswith() is used because a case-insensitive
# substring search for "RT @" also matches text such as "art @user".
df_u = df[~df.text.str.startswith("RT @", na=False)]

# Rank the de-duplicated frame (previously `df`, retweets included, was shown).
# Sorting a separate frame also leaves `df` in chronological order.
top_retweeted = df_u.sort_values(by='retweet_count', ascending=False)
print("\n")
display(top_retweeted[['retweet_count', 'text']].head(n=ndisplay))

## Top words and wordcloud

In [None]:
# Library stopwords plus two Spanish fillers and the search keywords.
stopwords = set(STOPWORDS)
stopwords.update([
    "de", "que",
    "chatgpt", "ai",
    # Listed word by word: a quoted multi-word entry never matches because
    # stopwords are compared against single tokens.
    "artificial", "intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])

words_max = 10 #maximum number of words to plot in word frequency plot

# Text of the non-retweet tweets (df_u comes from the "Top retweeted tweets" cell).
text = ' '.join(df_u.text_clean.tolist()).lower()

#generate word cloud
wordcloud = WordCloud(stopwords=stopwords,
                      max_font_size=150,
                      max_words=100,
                      background_color="black",
                      width=1000,
                      height=600)
wordcloud.generate(text)

#create dataframe of words and frequencies
df_words = pd.DataFrame({'word': list(wordcloud.words_.keys()),
                         'frequency': list(wordcloud.words_.values())})
df_words = df_words.sort_values(by='frequency', ascending=False)

#plot word cloud and word frequencies
fig, (ax_cloud, ax) = plt.subplots(1, 2, figsize=(16, 6))
ax_cloud.imshow(wordcloud, interpolation="bilinear")
ax_cloud.set_title("Top words", fontsize=20)
ax_cloud.axis("off")

sns.barplot(data=df_words[0:words_max],
            x='word',
            y='frequency',
            color='orange',
            ax=ax)
# Rotate labels without re-setting them (avoids matplotlib's fixed-ticks warning).
ax.tick_params(axis='x', rotation=90)
ax.grid()
plt.show()

# Embedding and Clustering

## TF Embedding

In [None]:
#tf embedding
# Word-count matrix over the cleaned tweets: min_df=5 and English stopwords
# are passed to the vectorizer.
tf_vectorizer = CountVectorizer(stop_words='english', min_df=5)
tf_embedding = tf_vectorizer.fit_transform(df.text_clean)
tf_feature_names = tf_vectorizer.get_feature_names_out()

# Report corpus and vocabulary sizes.
ntweets = len(df.text_clean)
nvocab = len(tf_feature_names)
print(f"{ntweets} tweets, {nvocab} words in vocabulary")
print(f"TF embedding shape is {tf_embedding.shape}")

  and should_run_async(code)


33543 tweets, 8858 words in vocabulary
TF embedding shape is (33543, 8858)


## TF-IDF Embedding

In [None]:
#tf-idf embedding
# Same vocabulary settings as the TF cell, but with TF-IDF weighting.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_embedding = tfidf_vectorizer.fit_transform(df.text_clean)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# ntweets comes from the TF embedding cell.
nvocab = len(tfidf_feature_names)
print(f"{ntweets} tweets, {nvocab} words in vocabulary")
print(f"TF-IDF embedding shape is {tfidf_embedding.shape}")

  and should_run_async(code)


33543 tweets, 8858 words in vocabulary
TF-IDF embedding shape is (33543, 8858)


## LDA Embedding

In [None]:
#lda embedding
%%time
num_topics = 5
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf_embedding)
lda_embedding = lda.transform(tf_embedding)
print(f"{ntweets} tweets, {num_topics} topics in LDA model")
print(f"shape of lda embedding is {lda_embedding.shape}")

  and should_run_async(code)


33543 tweets, 5 topics in LDA model
shape of lda embedding is (33543, 5)
CPU times: user 35.3 s, sys: 121 ms, total: 35.4 s
Wall time: 35.8 s


In [None]:
# Prepare the pyLDAvis view from the fitted LDA model, the count matrix it was
# trained on and the vectorizer that produced that matrix, then render it.
viz = pyLDAvis.lda_model.prepare(
    lda,
    tf_embedding,
    tf_vectorizer,
)
pyLDAvis.display(viz)

## Umap Embedding

In [None]:
#umap
# Project both sparse embeddings down to two dimensions.
umap_tf_embedding = umap.UMAP(n_components=2, metric='hellinger').fit_transform(tf_embedding)
umap_tfidf_embedding = umap.UMAP(n_components=2, metric='hellinger').fit_transform(tfidf_embedding)

#zscoring centers the vectors at zero
# This must happen BEFORE the coordinates are copied into df: previously the raw
# coordinates were stored and the standardised arrays were never plotted, so the
# fixed +/-3 axis window below did not correspond to the data.
umap_tf_embedding = stats.zscore(umap_tf_embedding, nan_policy='omit')
umap_tfidf_embedding = stats.zscore(umap_tfidf_embedding, nan_policy='omit')

df['tf_umap_x'] = umap_tf_embedding[:, 0]
df['tf_umap_y'] = umap_tf_embedding[:, 1]
df['tfidf_umap_x'] = umap_tfidf_embedding[:, 0]
df['tfidf_umap_y'] = umap_tfidf_embedding[:, 1]

xmax = 3  #range for x-axis
ymax = 3  #range for y-axis
s = 5  #marker size

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

sns.scatterplot(data=df, x="tf_umap_x",
                y="tf_umap_y", hue="screen_name", s=s, ax=ax1)
ax1.set_title("TF Embedding")
ax1.set_xlim([-xmax, xmax])
ax1.set_ylim([-ymax, ymax])

sns.scatterplot(data=df, x="tfidf_umap_x",
                y="tfidf_umap_y", hue="screen_name", s=s, ax=ax2)
ax2.set_title("TF-IDF Embedding")
ax2.set_xlim([-xmax, xmax])
ax2.set_ylim([-ymax, ymax])

plt.show()



The above umap code gave us issues because of quirks in our data

## K-means Clustering

In [None]:
#cluster tweets using kmeans

n_clusters = 3

# Cluster each embedding with the same settings. random_state pins the
# centroid initialisation so cluster numbering is reproducible between runs
# (the LDA cell uses random_state=0 as well).
for embedding_name, embedding in (('tf', tf_embedding),
                                  ('tfidf', tfidf_embedding),
                                  ('lda', lda_embedding)):
    kmeans_label = cluster.KMeans(n_clusters=n_clusters, random_state=0).fit_predict(embedding)
    # Labels are stored as strings so they behave as categories, not numbers.
    df[f'kmeans_label_{embedding_name}'] = [str(x) for x in kmeans_label]


In [None]:
#wordcloud
def kmeans_wordcloud(df, cluster_label_column,stopwords):
    """Show one word cloud per cluster.

    Args:
        df: frame with a ``text_clean`` column and the cluster label column.
        cluster_label_column: name of the column holding the cluster labels.
        stopwords: words passed to ``WordCloud`` to be left out of the clouds.

    Returns:
        Always 1.
    """
    print(cluster_label_column)
    labels = df[cluster_label_column]
    for k in np.sort(labels.unique()):
        # Rows belonging to this cluster, merged into one lower-cased string.
        members = df[labels == k]
        text = ' '.join(members.text_clean.tolist()).lower()

        cloud = WordCloud(stopwords=stopwords,
                          max_font_size=150,
                          max_words=100,
                          background_color="white",
                          width=1000,
                          height=600)
        cloud.generate(text)

        print(f"\n\tCluster {k} {cluster_label_column} has {len(members)} tweets")
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()
    return 1

  and should_run_async(code)


In [None]:
#wordclouds for tf
# Library stopwords plus the search keywords. "artificial" and "intelligence"
# are listed separately because stopwords are compared against single tokens;
# the previous quoted two-word entry never matched anything.
stopwords = set(STOPWORDS)
stopwords.update([
    "chatgpt", "ai",
    "artificial", "intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])
cluster_label_column = 'kmeans_label_tf'
kmeans_wordcloud(df, cluster_label_column, stopwords)

In [None]:
#wordclouds for tfidf
# Library stopwords plus the search keywords. "artificial" and "intelligence"
# are listed separately because stopwords are compared against single tokens;
# the previous quoted two-word entry never matched anything.
stopwords = set(STOPWORDS)
stopwords.update([
    "chatgpt", "ai",
    "artificial", "intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])
cluster_label_column = 'kmeans_label_tfidf'
kmeans_wordcloud(df, cluster_label_column, stopwords)

In [None]:
#wordclouds for lda
# Library stopwords plus the search keywords. "artificial" and "intelligence"
# are listed separately because stopwords are compared against single tokens;
# the previous quoted two-word entry never matched anything.
stopwords = set(STOPWORDS)
stopwords.update([
    "chatgpt", "ai",
    "artificial", "intelligence",
    "gpt4", "gpt-4",
    "midjourney",
    "dalle", "dall-e",
])
cluster_label_column = 'kmeans_label_lda'
kmeans_wordcloud(df, cluster_label_column, stopwords)

In [None]:
# Second database (historic tweets), used for the sentiment analysis below.
# NOTE(review): the path is relative, so it depends on the working directory
# set by the os.chdir call at the top of the notebook — confirm it resolves to the mounted Drive.
fname_db = '../drive/MyDrive/historic_v2'
df_tweets = DB.fetch(table_name='keyword_tweets',path=fname_db)
df_tweets.head()

In [None]:
#MH code for average sentiment over time - this code cleans and preps

#need to access the database first....historic_v2.db

# Clean every tweet with the helper from scripts.TextAnalysis.
df_tweets['text_clean'] = df_tweets.text.apply(ta.clean_tweet)

# Drop tweets whose cleaned text is empty. .copy() makes the result an
# independent frame, so adding the 'sentiment' column later does not raise
# SettingWithCopyWarning. (A mid-cell df_tweets.sample(n=5) was removed: only
# the last expression of a cell is displayed, so it had no visible effect.)
df_tweets = df_tweets[df_tweets.text_clean.str.len() > 0].copy()
nclean = len(df_tweets)

def sentiment_classifier(text,model,tokenizer):
    """Score one text with a sequence-classification model.

    Args:
        text: the string to classify.
        model: callable model returning an object with ``logits`` and
            ``hidden_states`` when called with ``output_hidden_states=True``.
        tokenizer: tokenizer providing ``encode_plus``.

    Returns:
        (sentiment, embedding): ``sentiment`` is the probability-weighted mean
        of the 1-based class indices (a float between 1 and the number of
        classes); ``embedding`` is the vector of the first token taken from
        ``hidden_states[12]`` as a NumPy array.
    """
    import torch  # local import: only needed to switch off gradient tracking

    # truncation=True keeps over-long inputs within the model's maximum length
    # instead of failing inside the model.
    inputs = tokenizer.encode_plus(text, return_tensors='pt',
                                   add_special_tokens=True, truncation=True)

    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']

    # Inference only: skipping autograd bookkeeping saves time and memory.
    with torch.no_grad():
        output = model(input_ids, token_type_ids=token_type_ids,
                       return_dict=True, output_hidden_states=True)

    logits = np.array(output.logits.tolist()[0])
    # Softmax with the maximum subtracted first, so large logits cannot
    # overflow np.exp and turn the probabilities into NaN.
    exp_logits = np.exp(logits - np.max(logits))
    prob = exp_logits / np.sum(exp_logits)

    # Expected class index (1-based). Use this line if you want the mean score.
    sentiment = np.sum([(x+1)*prob[x] for x in range(len(prob))])

    # First token of the first (only) sequence; explicit indexing instead of
    # squeeze() so a length-1 sequence cannot change which axis is dropped.
    # NOTE(review): index 12 is presumably the last layer of a 12-layer BERT — confirm.
    embedding = output.hidden_states[12][0, 0].detach().cpu().numpy()

    return sentiment,embedding

  and should_run_async(code)


In [None]:
%%time
c = 0
Sentiment = []
Embedding = []
for index,row in df_tweets.iterrows():  #iterate over rows of dataframe
    c+=1
    if c%1000==0:print(f"Tweet {c}/{len(df_tweets)}")  #print progres every 1000 rows

    sentiment,embedding = sentiment_classifier(row.text,model,tokenizer)  #calculate sentiment and embedding of tweet
    Sentiment.append(sentiment)  #append sentiment of tweet to Sentiment list
    Embedding.append(embedding) #append embedding of tweet to Embedding list

df_tweets['sentiment'] = Sentiment  #add sentiment column to dataframe of tweets
df_tweets.head()

  and should_run_async(code)


Tweet 1000/29633
Tweet 2000/29633
Tweet 3000/29633
Tweet 4000/29633
Tweet 5000/29633
Tweet 6000/29633
Tweet 7000/29633
Tweet 8000/29633
Tweet 9000/29633
Tweet 10000/29633
Tweet 11000/29633
Tweet 12000/29633
Tweet 13000/29633
Tweet 14000/29633
Tweet 15000/29633
Tweet 16000/29633
Tweet 17000/29633
Tweet 18000/29633
Tweet 19000/29633
Tweet 20000/29633
Tweet 21000/29633
Tweet 22000/29633
Tweet 23000/29633
Tweet 24000/29633
Tweet 25000/29633
Tweet 26000/29633
Tweet 27000/29633
Tweet 28000/29633
Tweet 29000/29633
CPU times: user 2h 7min 27s, sys: 11.4 s, total: 2h 7min 39s
Wall time: 2h 8min 25s


Unnamed: 0,created_at,screen_name,text,lang,retweet_count,reply_count,like_count,quote_count,impression_count,id,author_id,conversation_id,in_reply_to_user_id,geo,entities,text_clean,sentiment
0,2022-04-29T23:59:09.000Z,,10 Reasons Why you Should Learn Artificial Intelligence https://t.co/nqYx4aS058 #deeplearning,en,0,0,0,0,0,1520190984910479368,1013232376758128643,1520190984910479368,,,"{'urls': [{'start': 56, 'end': 79, 'url': 'https://t.co/nqYx4aS058', 'expanded_url': 'https://hackernoon.com/10-reasons-why-you-should-learn-artificial-intelligence-5v6q30vo', 'display_url': 'hackernoon.com/10-reasons-why…', 'status': 200, 'unwound_url': 'https://hackernoon.com/10-reasons-why-you-should-learn-artificial-intelligence-5v6q30vo'}], 'hashtags': [{'start': 80, 'end': 93, 'tag': 'deeplearning'}]}",10 reasons why you should learn artificial intelligence deeplearning,3.803131
1,2022-04-29T23:56:12.000Z,,"Gods And Goddesses Gods, and Goddesses can range from artificial intelligence to Earthly nature deities that all ultimately fall under the umbrella of higher dimensional ... https://t.co/ODSA8XFEND #GodsGoddesses #NatureDeities #Supernatural #Paranormal #Fantasy #scifi #ASMSG",en,0,0,0,0,0,1520190245341261825,388385054,1520190245341261825,,,"{'urls': [{'start': 174, 'end': 197, 'url': 'https://t.co/ODSA8XFEND', 'expanded_url': 'https://mysticinvestigations.com/gods/', 'display_url': 'mysticinvestigations.com/gods/', 'status': 200, 'unwound_url': 'https://mysticinvestigations.com/gods/'}], 'annotations': [{'start': 271, 'end': 275, 'probability': 0.5446, 'type': 'Other', 'normalized_text': 'ASMSG'}], 'hashtags': [{'start': 198, 'end': 212, 'tag': 'GodsGoddesses'}, {'start': 213, 'end': 227, 'tag': 'NatureDeities'}, {'start': 228, 'end': 241, 'tag': 'Supernatural'}, {'start': 242, 'end': 253, 'tag': 'Paranormal'}, {'start': 254, 'end': 262, 'tag': 'Fantasy'}, {'start': 263, 'end': 269, 'tag': 'scifi'}, {'start': 270, 'end': 276, 'tag': 'ASMSG'}]}",gods and goddesses gods and goddesses can range from artificial intelligence to earthly nature deities that all ultimately fall under the umbrella of higher dimensional godsgoddesses naturedeities supernatural paranormal fantasy scifi asmsg,3.426736
2,2022-04-29T23:55:05.000Z,,There’s no shortage of #data on customers these days. Companies are collecting reams of it to get the best insights on their customers.\n\nAI in #CustomerAnalytics - https://t.co/RmRMDPolFL \n\n#AI #Automation #TechUpdate #IT #tech #industry https://t.co/7QqKb3rS7Z,en,0,0,0,0,0,1520189963614277633,1334030877018316801,1520189963614277633,,,"{'urls': [{'start': 164, 'end': 187, 'url': 'https://t.co/RmRMDPolFL', 'expanded_url': 'https://bit.ly/3vJ91SI', 'display_url': 'bit.ly/3vJ91SI', 'status': 200, 'unwound_url': 'https://www.itpro.co.uk/technology/artificial-intelligence-ai/361259/ai-in-customer-analytics'}, {'start': 238, 'end': 261, 'url': 'https://t.co/7QqKb3rS7Z', 'expanded_url': 'https://twitter.com/IgnitivOfficial/status/1520189963614277633/photo/1', 'display_url': 'pic.twitter.com/7QqKb3rS7Z', 'media_key': '3_1520189961424838657'}], 'annotations': [{'start': 137, 'end': 138, 'probability': 0.8035, 'type': 'Organization', 'normalized_text': 'AI'}, {'start': 191, 'end': 192, 'probability': 0.6384, 'type': 'Organization', 'normalized_text': 'AI'}], 'hashtags': [{'start': 23, 'end': 28, 'tag': 'data'}, {'start': 143, 'end': 161, 'tag': 'CustomerAnalytics'}, {'start': 190, 'end': 193, 'tag': 'AI'}, {'start': 194, 'end': 205, 'tag': 'Automation'}, {'start': 206, 'end': 217, 'tag': 'TechUpdate'}, {'start': 218, 'end': 221, 'tag': 'IT'}, {'start': 222, 'end': 227, 'tag': 'tech'}, {'start': 228, 'end': 237, 'tag': 'industry'}]}",theres no shortage of data on customers these days companies are collecting reams of it to get the best insights on their customersai in customeranalytics ai automation techupdate it tech industry,4.272908
3,2022-04-29T23:53:40.000Z,,Economics Homework Help HSM Artificial Intelligence &amp; Internet of Things &amp; Its Impact on Delgro Worksheet\n\n https://t.co/q658tYPOsh,en,0,0,0,0,0,1520189606162956289,1353946694120067074,1520189606162956289,,,"{'urls': [{'start': 117, 'end': 140, 'url': 'https://t.co/q658tYPOsh', 'expanded_url': 'https://besthomeworkservices.com/economics-homework-help-7780/?utm_source=ReviveOldPost&utm_medium=social&utm_campaign=ReviveOldPost', 'display_url': 'besthomeworkservices.com/economics-home…', 'status': 403, 'unwound_url': 'https://besthomeworkservices.com/economics-homework-help-7780/?utm_source=ReviveOldPost&utm_medium=social&utm_campaign=ReviveOldPost'}], 'annotations': [{'start': 24, 'end': 26, 'probability': 0.4441, 'type': 'Other', 'normalized_text': 'HSM'}, {'start': 97, 'end': 112, 'probability': 0.3944, 'type': 'Other', 'normalized_text': 'Delgro Worksheet'}]}",economics homework help hsm artificial intelligence internet of things its impact on delgro worksheet,3.693248
4,2022-04-29T23:53:06.000Z,,Artificial intelligence challenges what it means to be creative - Science News Magazine: Artificial intelligence challenges what it means to be creative Science News Magazine https://t.co/rGKIaITETH #AI #artificialintelligence #Finperform https://t.co/9cNPF8Ptyb,en,0,0,0,0,0,1520189463766265857,856240505826496513,1520189463766265857,,,"{'urls': [{'start': 176, 'end': 199, 'url': 'https://t.co/rGKIaITETH', 'expanded_url': 'http://dlvr.it/SPVc3m', 'display_url': 'dlvr.it/SPVc3m', 'images': [{'url': 'https://pbs.twimg.com/news_img/1600963220268896266/UCyMCryB?format=jpg&name=orig', 'width': 1440, 'height': 810}, {'url': 'https://pbs.twimg.com/news_img/1600963220268896266/UCyMCryB?format=jpg&name=150x150', 'width': 150, 'height': 150}], 'status': 200, 'title': 'Artificial intelligence challenges what it means to be creative', 'description': 'Computer programs can mimic famous artworks, but struggle with originality and lack self-awareness.', 'unwound_url': 'https://www.sciencenews.org/article/artificial-intelligence-ai-creativity-art-computer-program?utm_source=dlvr.it&utm_medium=twitter'}, {'start': 240, 'end': 263, 'url': 'https://t.co/9cNPF8Ptyb', 'expanded_url': 'https://twitter.com/SuriyaSubraman/status/1520189463766265857/photo/1', 'display_url': 'pic.twitter.com/9cNPF8Ptyb', 'media_key': '3_1520189460939386880'}], 'annotations': [{'start': 66, 'end': 86, 'probability': 0.8423, 'type': 'Other', 'normalized_text': 'Science News Magazine'}, {'start': 162, 'end': 174, 'probability': 0.7132, 'type': 'Other', 'normalized_text': 'News Magazine'}, {'start': 201, 'end': 202, 'probability': 0.5984, 'type': 'Organization', 'normalized_text': 'AI'}], 'hashtags': [{'start': 200, 'end': 203, 'tag': 'AI'}, {'start': 204, 'end': 227, 'tag': 'artificialintelligence'}, {'start': 228, 'end': 239, 'tag': 'Finperform'}]}",artificial intelligence challenges what it means to be creative science news magazine artificial intelligence challenges what it means 
to be creativescience news magazine ai artificialintelligence finperform,3.441661


In [None]:
# Persist the scored tweets so the two-hour scoring loop need not be re-run.
# index=False keeps the row index out of the file; otherwise it comes back as a
# spurious "Unnamed: 0" column when the CSV is read again.
df_tweets.to_csv('tweets_sentiment.csv', index=False)

#download to machine and re-upload to google drive


  and should_run_async(code)


In [None]:
# Reload the previously saved sentiment scores from Drive instead of recomputing them.
df_tweets = pd.read_csv('../drive/MyDrive/tweets_sentiment.csv')

  and should_run_async(code)
  df_tweets = pd.read_csv('../drive/MyDrive/tweets_sentiment.csv')


In [None]:
# Quick look at the last rows of the reloaded frame.
df_tweets.tail()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Filter the dataset to include only tweets related to "artificial intelligence" or "ai".
# \b makes "ai" match as a whole word only; the bare substring also matched
# words such as "said" or "training". .copy() gives an independent frame so
# the column assignment below is safe.
ai_mask = df_tweets['text'].str.contains(r'artificial intelligence|\bai\b',
                                         case=False, regex=True, na=False)
ai_df_tweets = df_tweets[ai_mask].copy()

# Convert the "created_at" column to timezone-aware (UTC) datetimes
ai_df_tweets['created_at'] = pd.to_datetime(ai_df_tweets['created_at'], utc=True)

# Filter the dataset to include only tweets from April 2022 to April 2023
start_date = pd.to_datetime('2022-04-29').tz_localize('UTC')
end_date = pd.to_datetime('2023-04-21').tz_localize('UTC')
ai_df_tweets = ai_df_tweets[(ai_df_tweets['created_at'] >= start_date) & (ai_df_tweets['created_at'] <= end_date)]

# Calculate the average sentiment score for each week.
# The column is selected before .mean() so text columns are never aggregated.
weekly_sentiment = ai_df_tweets.groupby(pd.Grouper(key='created_at', freq='W'))['sentiment'].mean()
average_sentiment_scores = weekly_sentiment.tolist()

# Label only the first week of each month. Labels are derived from the grouped
# index itself so there is exactly one label per plotted point.
x_labels = [date.strftime('%b %d, %Y') if date.day <= 7 else ''
            for date in weekly_sentiment.index]

# Create a line chart for the average sentiment scores
plt.plot(average_sentiment_scores)

# Add title and axis labels
plt.title('Average sentiment related to "artificial intelligence" and "ai" over time')
plt.xlabel('Week')
plt.ylabel('Sentiment Score (out of 5)')

# Set the Y-axis limits to reflect the sentiment score range
plt.ylim([2.5, 4])

# Set the x-axis tick labels to only display the first week of each month
plt.xticks(range(len(x_labels)), x_labels, rotation=45)

# Display the chart
plt.show()

num_tweets = len(ai_df_tweets)
print(f"There are {num_tweets} tweets in the ai_df_tweets dataframe.")

In [None]:
#edited
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
# Filter the dataset to include only tweets related to "artificial intelligence" or "ai".
# \b makes "ai" match as a whole word only (the bare substring also matched
# words such as "said"); .copy() makes the column assignment below safe.
ai_mask = df_tweets['text'].str.contains(r'artificial intelligence|\bai\b',
                                         case=False, regex=True, na=False)
ai_df_tweets = df_tweets[ai_mask].copy()

# Convert the "created_at" column to timezone-aware (UTC) datetimes
ai_df_tweets['created_at'] = pd.to_datetime(ai_df_tweets['created_at'], utc=True)

# Filter the dataset to include only tweets from April 2022 to April 2023
start_date = pd.to_datetime('2022-04-29').tz_localize('UTC')
end_date = pd.to_datetime('2023-04-21').tz_localize('UTC')
ai_df_tweets = ai_df_tweets[(ai_df_tweets['created_at'] >= start_date) & (ai_df_tweets['created_at'] <= end_date)]

# Calculate the average sentiment score and standard deviation for each month
monthly_sentiment = ai_df_tweets.groupby(pd.Grouper(key='created_at', freq='M')).agg({'sentiment': ['mean', 'std']})

# Create a bar chart for the average sentiment scores (error bars = one standard deviation)
fig, ax = plt.subplots(figsize=(12, 6))
x_labels = monthly_sentiment.index.strftime('%b %Y')
x_pos = np.arange(len(x_labels))
ax.bar(x_pos, monthly_sentiment['sentiment']['mean'], yerr=monthly_sentiment['sentiment']['std'], align='center', alpha=0.5)

# Add title and axis labels
ax.set_title('Average sentiment related to "artificial intelligence" and "ai" per month')
ax.set_xlabel('Month')
ax.set_ylabel('Sentiment Score (out of 5)')

# Set the Y-axis limits to reflect the sentiment score range
ax.set_ylim([2, 4.5])

# Set the x-axis tick labels
ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels, rotation=45)

# Display the chart
plt.show()

In [None]:
# edited
# MH code for what do people talk about when they talk about AI (Wordclouds)

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Define stopwords to be removed from the wordcloud
stopwords = set(STOPWORDS)
stopwords.update(["artificial", "intelligence", "ai", "https", "artificialintelligence"])

# Define function to create wordcloud for a specific month
def create_wordcloud_for_month(month, year=None):
    """Draw a word cloud of the tweets in ``ai_df_tweets`` posted in ``month``.

    Args:
        month: calendar month number (1-12).
        year: optional calendar year; when None (the default) tweets from
            that month in any year are used.
    """
    # Filter tweets to only include those from the given month (and year, if given)
    created = pd.DatetimeIndex(ai_df_tweets['created_at'])
    in_period = created.month == month
    if year is not None:
        in_period = in_period & (created.year == year)
    month_tweets = ai_df_tweets[in_period]

    # Remove links from the tweets. regex=True is required: without it pandas
    # treats the pattern as a literal string and no URL is removed. The result
    # is kept in a local Series rather than written back into a filtered slice.
    texts = month_tweets['text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)

    # Join all tweets into a single string
    text = " ".join(tweet for tweet in texts)
    month_name = pd.Timestamp(2022, month, 1).strftime('%B')
    if not text.strip():
        # WordCloud.generate raises on empty input; report and skip instead.
        print(f"No tweets found for {month_name}")
        return

    # Generate the wordcloud
    wordcloud = WordCloud(stopwords=stopwords,max_font_size=150,
                          max_words=100,
                          background_color="black",
                          width=1000,
                          height=600).generate(text)

    # Visualize the wordcloud
    fig = plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Wordcloud for {month_name}")
    plt.show()

# Call the function for each specified month
create_wordcloud_for_month(5)  # May
create_wordcloud_for_month(6)  # June
create_wordcloud_for_month(7)  # July
create_wordcloud_for_month(8)  # August
create_wordcloud_for_month(11) # November
create_wordcloud_for_month(12) # December
create_wordcloud_for_month(1) # January
create_wordcloud_for_month(2)  # February
create_wordcloud_for_month(3)  # March

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Define stopwords to be removed from the wordcloud
stopwords = set(STOPWORDS)
stopwords.update(["artificial", "intelligence", "ai", "https", "artificialintelligence"])

# Define function to create wordcloud for a specific month
def create_wordcloud_for_month(month, year=2023):
    """Draw a word cloud of the tweets in ``ai_df_tweets`` from one month.

    Args:
        month: calendar month number (1-12).
        year: calendar year to restrict to; defaults to 2023, the value that
            was previously hard-coded in the body.
    """
    # Filter tweets to only include those from the given month and year
    created = pd.DatetimeIndex(ai_df_tweets['created_at'])
    month_tweets = ai_df_tweets[(created.month == month) & (created.year == year)]

    # Remove links from the tweets. regex=True is required: without it pandas
    # treats the pattern as a literal string and no URL is removed. The result
    # is kept in a local Series rather than written back into a filtered slice.
    texts = month_tweets['text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)

    # Join all tweets into a single string
    text = " ".join(tweet for tweet in texts)
    month_name = pd.Timestamp(year, month, 1).strftime('%B')
    if not text.strip():
        # WordCloud.generate raises on empty input; report and skip instead.
        print(f"No tweets found for {month_name}")
        return

    # Generate the wordcloud
    wordcloud = WordCloud(stopwords=stopwords,max_font_size=150,
                          max_words=100,
                          background_color="black",
                          width=1000,
                          height=600).generate(text)

    # Visualize the wordcloud
    fig = plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Wordcloud for {month_name}")
    plt.show()

# Call the function for April 2023
create_wordcloud_for_month(4)  # April

In [None]:
# Define function to get a random sample of tweets for a given month and keyword
def get_random_tweets_for_month(month, keyword, num_tweets=5):
    """Return a random sample of link-free tweet texts from ``ai_df_tweets``.

    Args:
        month: calendar month number (1-12); the year is not considered.
        keyword: pattern searched for in the tweet text, case-insensitively.
        num_tweets: maximum number of tweets to return.

    Returns:
        A list of at most ``num_tweets`` tweet texts with URLs removed; fewer
        (possibly none) when not enough tweets match.
    """
    # Filter tweets to only include those from the given month
    in_month = pd.DatetimeIndex(ai_df_tweets['created_at']).month == month
    month_tweets = ai_df_tweets[in_month]

    # Filter tweets to only include those containing the given keyword
    has_keyword = month_tweets['text'].str.contains(keyword, case=False, na=False)

    # Remove links from the tweets. regex=True is required: without it pandas
    # treats the pattern as a literal string and no URL is removed.
    texts = month_tweets.loc[has_keyword, 'text'].str.replace(
        r'http\S+|www\.\S+', '', case=False, regex=True)

    # Get a random sample of tweets; capped at the number available because
    # sample() raises when asked for more rows than exist.
    random_tweets = texts.sample(min(num_tweets, len(texts)))

    # Return the sample of tweets as a list
    return random_tweets.tolist()

# Sample tweets from September that contain "python" (matched case-insensitively)
get_random_tweets_for_month(9, 'python')

In [None]:
# Define function to get a random sample of tweets for a given month and keyword
def get_random_tweets_for_month(month, keyword, num_tweets=5):
    """Return a random sample of link-free tweet texts from ``ai_df_tweets``.

    Args:
        month: calendar month number (1-12); the year is not considered.
        keyword: pattern searched for in the tweet text, case-insensitively.
        num_tweets: maximum number of tweets to return.

    Returns:
        A list of at most ``num_tweets`` tweet texts with URLs removed; fewer
        (possibly none) when not enough tweets match.
    """
    # Filter tweets to only include those from the given month
    in_month = pd.DatetimeIndex(ai_df_tweets['created_at']).month == month
    month_tweets = ai_df_tweets[in_month]

    # Filter tweets to only include those containing the given keyword
    has_keyword = month_tweets['text'].str.contains(keyword, case=False, na=False)

    # Remove links from the tweets. regex=True is required: without it pandas
    # treats the pattern as a literal string and no URL is removed.
    texts = month_tweets.loc[has_keyword, 'text'].str.replace(
        r'http\S+|www\.\S+', '', case=False, regex=True)

    # Get a random sample of tweets; capped at the number available because
    # sample() raises when asked for more rows than exist.
    random_tweets = texts.sample(min(num_tweets, len(texts)))

    # Return the sample of tweets as a list
    return random_tweets.tolist()

# Call the function to get a random sample of 5 tweets containing the word "Python" from September 2022
get_random_tweets_for_month(1, 'chatgpt')

In [None]:
# Random sample of 5 December tweets mentioning "artist".
# get_random_tweets_for_month is already defined in an earlier cell, so the
# identical copy that used to be redefined here is dropped and the existing
# function is reused.
get_random_tweets_for_month(12, 'artist')

In [None]:
# Random sample of 5 December tweets mentioning "chatgpt".
# get_random_tweets_for_month is already defined in an earlier cell, so the
# identical copy that used to be redefined here is dropped and the existing
# function is reused.
get_random_tweets_for_month(12, 'chatgpt')

In [None]:
# Random sample of 5 April tweets mentioning "token".
# get_random_tweets_for_month is already defined in an earlier cell, so the
# identical copy that used to be redefined here is dropped and the existing
# function is reused.
get_random_tweets_for_month(4, 'token')

In [None]:
# Download NLTK's 'punkt' tokenizer data (presumably what word_tokenize in the Italian text-cleaning cell relies on — TODO confirm)
nltk.download('punkt')

In [None]:
# Load the English keyword tweets and the Italian tweets from the same database file (fname_db is set in an earlier cell)
df_en = DB.fetch(table_name='keyword_tweets',path=fname_db)
df_it = DB.fetch(table_name='italian_tweets',path=fname_db)

In [None]:
# Score every Italian tweet; Sentiment and Embedding hold the per-tweet
# results in the same order as the rows of df_it.
Sentiment = []
Embedding = []
c = 0
for c, tweet_text in enumerate(df_it['text'], start=1):
    if c % 1000 == 0:
        print(f"Tweet {c}/{len(df_it)}")  # progress message every 1000 tweets

    sentiment, embedding = sentiment_classifier(tweet_text, model, tokenizer)
    Sentiment.append(sentiment)
    Embedding.append(embedding)

# Attach the scores as a new column and preview the result
df_it['sentiment'] = Sentiment
df_it.head()

In [None]:
# Calendar date (YYYY-MM-DD) = the part of the ISO timestamp before the 'T'.
# Vectorised string ops replace the much slower row-wise DataFrame.apply.
df_en['created_date'] = df_en['created_at'].str.split('T').str[0]
df_it['created_date'] = df_it['created_at'].str.split('T').str[0]

# Both frames need a per-tweet 'sentiment' column. Nothing visible in this
# section adds one to df_en after it is fetched, so fail with an actionable
# message rather than a bare KeyError from groupby.
for frame_name, frame in (('df_en', df_en), ('df_it', df_it)):
    if 'sentiment' not in frame.columns:
        raise KeyError(
            f"{frame_name} has no 'sentiment' column; score its tweets with "
            "sentiment_classifier before running this cell"
        )

# Mean sentiment per day for each language
avg_sentiment_by_date_en = df_en.groupby(['created_date'])['sentiment'].mean()
avg_sentiment_by_date_it = df_it.groupby(['created_date'])['sentiment'].mean()

# Align the two daily series on date and keep only days present in both.
# Naming the columns through the dict keys avoids relying on positional order.
df_sentiments = pd.concat(
    {'en_sentiment': avg_sentiment_by_date_en, 'it_sentiment': avg_sentiment_by_date_it},
    axis=1,
).dropna()

In [None]:
# Daily mean sentiment, one line per language
ax = sns.lineplot(data=df_sentiments[['en_sentiment','it_sentiment']])
ax.set_title('Tweet Sentiment about ChatGPT')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment')
# Dates are dense on the x axis: rotate and shrink the tick labels
plt.xticks(rotation=90, fontsize = 'xx-small')
plt.show()

In [None]:
_ITALIAN_STOP_WORDS = None  # filled on first use by clean_text


def clean_text(text):
    """Lower-case and tokenize ``text``, keeping alphabetic non-stop-word tokens.

    Args:
        text: Raw tweet text. Non-string values (e.g. ``None`` or NaN) are
            treated as empty text.

    Returns:
        The kept tokens joined by single spaces; ``''`` when nothing is kept.
    """
    global _ITALIAN_STOP_WORDS
    if _ITALIAN_STOP_WORDS is None:
        # Build the stop-word set once. The original rebuilt it on every call,
        # i.e. once per tweet when used with DataFrame.apply.
        _ITALIAN_STOP_WORDS = frozenset(stopwords.words('italian'))

    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in _ITALIAN_STOP_WORDS
    )

# Clean the text of the Italian tweets
df_it['cleaned_text'] = df_it['text'].apply(clean_text)

# Individual days (YYYY-MM-DD) for which to draw a word cloud
dates = ['2022-12-20', '2023-04-04']

# One word cloud per selected day
for date in dates:
    # Keep only the tweets posted on this exact day
    df_tweets = df_it[df_it['created_date'] == date]

    # Concatenate all the cleaned text from that day's tweets into a single string
    text = ' '.join(df_tweets['cleaned_text'].tolist())

    # WordCloud.generate raises ValueError when given no words, so skip days
    # without any usable text instead of aborting the whole loop.
    if not text.strip():
        print(f"No usable tweet text for {date}; skipping word cloud")
        continue

    wordcloud = WordCloud(width=800, height=800, background_color='white').generate(text)

    # Display the word cloud
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title('Word Cloud for ' + date)
    # Lay out after the title exists so the title is included in the layout
    # rather than being pushed outside the figure by pad=0.
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Show the five lowest-sentiment Italian tweets posted on 2023-04-04
df_it[df_it['created_date'] == '2023-04-04'].sort_values(by='sentiment',ascending=True).head()


# Sentiment Analysis

In [None]:
import scripts.TextAnalysis as ta
from scripts.api import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import codecs

  and should_run_async(code)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import codecs  #this let's us display tweets properly (emojis, etc.)
pd.set_option("display.max_colwidth", None)

  and should_run_async(code)


In [None]:
# Load the pretrained tokenizer and sequence-classification model used for sentiment scoring.
# NOTE(review): the same checkpoint is already loaded in the setup section at the top of the notebook — confirm this reload is needed.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [None]:
# Load the keyword tweets for the sentiment analysis.
# NOTE(review): this path ('.../SMAFP/...') differs from the Drive path used in the setup cell ('.../Social Media Analytics/...') — confirm which location is correct.
fname_db = '/content/drive/MyDrive/SMAFP/AItweets_v2'
df = DB.fetch(table_name='keyword_tweets',path=fname_db)
df.head()

In [None]:
# Add a cleaned copy of each tweet, then drop tweets whose cleaned text is empty
df['text_clean'] = df['text'].map(ta.clean_tweet)
has_text = df['text_clean'].str.len() > 0
df = df[has_text]
nclean = df.shape[0]

# Spot-check five random rows
df.sample(n=5)

In [None]:
# Boilerplate phrases of repeated promotional tweets; any tweet whose cleaned
# text contains one of them is removed.
spam_phrases = [
    'health consultations we are revolutionizing the way people access medical',
    'ai revolution download to participate in the ai2earn economy',
    'total maximum supply only 10 units a great artificial intelligence project dont miss',
    'blockchain based virtual world that allows users to create build buy and sell',
    'neurobayes fairlaunch starts at',
    'live happily togesssa',
]
for phrase in spam_phrases:
    df = df[~df['text_clean'].str.contains(phrase)]

In [None]:
# Tweets mentioning GPT/ChatGPT in any capitalisation. The original pattern
# enumerated a few spellings by hand and missed e.g. "ChatGpt".
# .copy() makes gpt_df an independent frame, so columns added to it later do
# not trigger pandas' SettingWithCopyWarning.
gpt_df = df[df['text'].str.contains('gpt', case=False, na=False)].copy()
len(gpt_df)

  and should_run_async(code)


11365

In [None]:
# Tweets mentioning DALL-E in any capitalisation, written "dalle", "dall-e"
# or "dall·e". The original pattern enumerated capitalisations by hand and
# missed all-caps spellings such as "DALL-E".
# .copy() makes dalle_df an independent frame, so columns added to it later do
# not trigger pandas' SettingWithCopyWarning.
dalle_df = df[df['text'].str.contains(r'dall[-·]?e', case=False, na=False)].copy()
len(dalle_df)

  and should_run_async(code)


2152

In [None]:
# Tweets mentioning Midjourney in any capitalisation (the original pattern
# listed four spellings and missed e.g. "MIDJOURNEY").
# .copy() makes mid_df an independent frame, so columns added to it later do
# not trigger pandas' SettingWithCopyWarning.
mid_df = df[df['text'].str.contains('midjourney', case=False, na=False)].copy()
len(mid_df)

  and should_run_async(code)


5636

In [None]:
def sentiment_classifier(text,model,tokenizer):
    """Score one text with a sequence-classification model.

    Args:
        text: The text to score.
        model: Model that is called with ``input_ids`` and ``token_type_ids``
            and returns an output exposing ``logits`` and ``hidden_states``.
        tokenizer: Tokenizer matching ``model``; called to produce PyTorch
            tensors containing ``input_ids`` and ``token_type_ids``.

    Returns:
        ``(sentiment, embedding)`` where ``sentiment`` is the probability-
        weighted mean class rank (classes numbered from 1) and ``embedding``
        is the last hidden layer's vector for the first token, as a NumPy array.
    """
    import torch  # local import: this cell's module scope only guarantees numpy

    # truncation=True keeps over-long texts within the model's maximum input
    # length instead of failing inside the model.
    inputs = tokenizer(text, return_tensors='pt', add_special_tokens=True, truncation=True)

    # Inference only: no_grad avoids building the autograd graph for every text.
    with torch.no_grad():
        output = model(inputs['input_ids'],
                       token_type_ids=inputs['token_type_ids'],
                       return_dict=True,
                       output_hidden_states=True)

    logits = np.array(output.logits.tolist()[0])
    # Softmax with the maximum subtracted first, for numerical stability
    exp_logits = np.exp(logits - logits.max())
    prob = exp_logits / exp_logits.sum()

    # Expected class rank: sum over classes of (rank * probability)
    sentiment = np.sum(np.arange(1, len(prob) + 1) * prob)

    # Last hidden layer (the original hard-coded index 12, which is the last
    # entry for a 12-layer encoder), first sequence in the batch, first token.
    embedding = output.hidden_states[-1][0, 0].detach().cpu().numpy()

    return sentiment,embedding

# Score every ChatGPT tweet; Sentiment and Embedding hold the per-tweet
# results in the same order as the rows of gpt_df.
Sentiment = []
Embedding = []
c = 0
for c, tweet_text in enumerate(gpt_df['text'], start=1):
    if c % 1000 == 0:
        print(f"Tweet {c}/{len(gpt_df)}")  # progress message every 1000 tweets

    sentiment, embedding = sentiment_classifier(tweet_text, model, tokenizer)
    Sentiment.append(sentiment)
    Embedding.append(embedding)

# gpt_df was produced by boolean-filtering df, so writing a column into it in
# place triggers pandas' SettingWithCopyWarning; assign() builds a new frame.
gpt_df = gpt_df.assign(sentiment=Sentiment)
gpt_df.head()

In [None]:
# Average sentiment score over all ChatGPT tweets
mean_sentiment = gpt_df['sentiment'].mean()
print(f"The mean opinion is {mean_sentiment}")

The mean opinion is 3.1319764752572326


  and should_run_async(code)


In [None]:
# sentiment_classifier is already defined in an earlier cell; the identical
# copy that used to be redefined here is dropped and the existing one reused.

# Score every DALL-E tweet; Sentiment and Embedding hold the per-tweet
# results in the same order as the rows of dalle_df.
Sentiment = []
Embedding = []
c = 0
for c, tweet_text in enumerate(dalle_df['text'], start=1):
    if c % 1000 == 0:
        print(f"Tweet {c}/{len(dalle_df)}")  # progress message every 1000 tweets

    sentiment, embedding = sentiment_classifier(tweet_text, model, tokenizer)
    Sentiment.append(sentiment)
    Embedding.append(embedding)

# dalle_df was produced by boolean-filtering df, so writing a column into it in
# place triggers pandas' SettingWithCopyWarning; assign() builds a new frame.
dalle_df = dalle_df.assign(sentiment=Sentiment)
dalle_df.head()

In [None]:
# Average sentiment score over all DALL-E tweets
mean_sentiment = dalle_df['sentiment'].mean()
print(f"The mean opinion is {mean_sentiment}")

The mean opinion is 3.1411879942290373


  and should_run_async(code)


In [None]:
# sentiment_classifier is already defined in an earlier cell; the identical
# copy that used to be redefined here is dropped and the existing one reused.

# Score every Midjourney tweet; Sentiment and Embedding hold the per-tweet
# results in the same order as the rows of mid_df.
Sentiment = []
Embedding = []
c = 0
for c, tweet_text in enumerate(mid_df['text'], start=1):
    if c % 1000 == 0:
        print(f"Tweet {c}/{len(mid_df)}")  # progress message every 1000 tweets

    sentiment, embedding = sentiment_classifier(tweet_text, model, tokenizer)
    Sentiment.append(sentiment)
    Embedding.append(embedding)

# mid_df was produced by boolean-filtering df, so writing a column into it in
# place triggers pandas' SettingWithCopyWarning; assign() builds a new frame.
mid_df = mid_df.assign(sentiment=Sentiment)
mid_df.head()

  and should_run_async(code)


Tweet 1000/5636
Tweet 2000/5636
Tweet 3000/5636
Tweet 4000/5636
Tweet 5000/5636


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mid_df['sentiment'] = Sentiment


Unnamed: 0,created_at,screen_name,text,lang,retweet_count,reply_count,like_count,quote_count,impression_count,id,author_id,conversation_id,in_reply_to_user_id,geo,entities,text_clean,sentiment
321,2023-04-25T20:33:16.000Z,,Check out my latest article: Designing T-Shirts with #ChatGPT and #Midjourney \nhttps://t.co/JYU3shbrYT via @LinkedIn,en,0,0,1,0,16,1650961193065119745,6875652,1650961193065119745,,,"{'urls': [{'start': 79, 'end': 102, 'url': 'https://t.co/JYU3shbrYT', 'expanded_url': 'https://www.linkedin.com/pulse/designing-t-shirts-chatgpt-midjourney-marco-van-hylckama-vlieg', 'display_url': 'linkedin.com/pulse/designin…', 'images': [{'url': 'https://pbs.twimg.com/news_img/1650961194671538176/4OjLiuqn?format=png&name=orig', 'width': 1400, 'height': 800}, {'url': 'https://pbs.twimg.com/news_img/1650961194671538176/4OjLiuqn?format=png&name=150x150', 'width': 150, 'height': 150}], 'status': 200, 'title': 'Designing T-Shirts with ChatGPT and Midjourney', 'description': ""I recently performed an experiment that's way too cool not to at least write a little article about. I am an avid T-shirt designer as a fun side gig besides my day job as Sr."", 'unwound_url': 'https://www.linkedin.com/pulse/designing-t-shirts-chatgpt-midjourney-marco-van-hylckama-vlieg'}], 'hashtags': [{'start': 53, 'end': 61, 'tag': 'ChatGPT'}, {'start': 66, 'end': 77, 'tag': 'Midjourney'}], 'mentions': [{'start': 107, 'end': 116, 'username': 'LinkedIn', 'id': '13058772'}]}",check out my latest article designing tshirts with chatgpt and midjourney via linkedin,3.45968
463,2023-04-25T20:23:52.000Z,,"💥 Captains, the moment has come! 🦸🏻🦹🏼 \n\n🦋 ""The Metamorphosis"" is LIVE!\n\nExplore your heroes' stunning transformations and dive into new adventures 🌌 \n\nJoin us now to check the beta👷🏽at: https://t.co/7p15zt8o4y \n\n#AIU #NFT #Metamorphosis #AIArt #Midjourney #ChatGPT https://t.co/f3wscifSeM",en,4,4,7,0,88,1650958826449825804,1518932514244599809,1650958826449825804,,,"{'hashtags': [{'start': 212, 'end': 216, 'tag': 'AIU'}, {'start': 217, 'end': 221, 'tag': 'NFT'}, {'start': 222, 'end': 236, 'tag': 'Metamorphosis'}, {'start': 237, 'end': 243, 'tag': 'AIArt'}, {'start': 244, 'end': 255, 'tag': 'Midjourney'}, {'start': 256, 'end': 264, 'tag': 'ChatGPT'}], 'urls': [{'start': 186, 'end': 209, 'url': 'https://t.co/7p15zt8o4y', 'expanded_url': 'https://warp-drive-eight.vercel.app/', 'display_url': 'warp-drive-eight.vercel.app', 'status': 200, 'title': 'ESTABLISHED CONNECTION WITH:', 'unwound_url': 'https://warp-drive-eight.vercel.app/'}, {'start': 265, 'end': 288, 'url': 'https://t.co/f3wscifSeM', 'expanded_url': 'https://twitter.com/AI_UniverseNFT/status/1650958826449825804/photo/1', 'display_url': 'pic.twitter.com/f3wscifSeM', 'media_key': '3_1650958819768299553'}], 'annotations': [{'start': 43, 'end': 59, 'probability': 0.948, 'type': 'Other', 'normalized_text': 'The Metamorphosis'}, {'start': 213, 'end': 215, 'probability': 0.3448, 'type': 'Organization', 'normalized_text': 'AIU'}, {'start': 223, 'end': 235, 'probability': 0.5315, 'type': 'Other', 'normalized_text': 'Metamorphosis'}, {'start': 238, 'end': 242, 'probability': 0.4861, 'type': 'Other', 'normalized_text': 'AIArt'}]}",captains the moment has come the metamorphosis is liveexplore your heroes stunning transformations and dive into new adventures join us now to check the betaat aiu nft metamorphosis aiart midjourney chatgpt,4.439003
664,2023-04-25T20:11:06.000Z,,"#MidJourney #OpenAi #AiArt #Art #OpenAi #StableDiffusion2 #DallE #ChatGPT #AiArtworks\n\n#imagine \n🌺 DETAILED PROMPT 👇🏼| CREATE YOUR OWN \n\nsamsara wallpaper, in the style of lit kid, time-lapse photography, moody figurative, forced perspective, mysterious backdrops, patience… https://t.co/vuFo11Sicg",en,1,0,0,0,6,1650955615823360011,1563511792915316738,1650955615823360011,,,"{'annotations': [{'start': 59, 'end': 63, 'probability': 0.4554, 'type': 'Other', 'normalized_text': 'DallE'}], 'hashtags': [{'start': 0, 'end': 11, 'tag': 'MidJourney'}, {'start': 12, 'end': 19, 'tag': 'OpenAi'}, {'start': 20, 'end': 26, 'tag': 'AiArt'}, {'start': 27, 'end': 31, 'tag': 'Art'}, {'start': 32, 'end': 39, 'tag': 'OpenAi'}, {'start': 40, 'end': 57, 'tag': 'StableDiffusion2'}, {'start': 58, 'end': 64, 'tag': 'DallE'}, {'start': 65, 'end': 73, 'tag': 'ChatGPT'}, {'start': 74, 'end': 85, 'tag': 'AiArtworks'}, {'start': 87, 'end': 95, 'tag': 'imagine'}], 'urls': [{'start': 277, 'end': 300, 'url': 'https://t.co/vuFo11Sicg', 'expanded_url': 'https://twitter.com/MidJourneyAI_/status/1650955615823360011/photo/1', 'display_url': 'pic.twitter.com/vuFo11Sicg', 'media_key': '3_1650955613852037138'}]}",midjourney openai aiart art openai stablediffusion2 dalle chatgpt aiartworksimagine detailed prompt create your own samsara wallpaper in the style of lit kid timelapse photography moody figurative forced perspective mysterious backdrops patience,3.949779
702,2023-04-25T20:08:48.000Z,,Almost perfect. Sun is going down like in the Elton John song. \n\nI would understand chatGPT having these songs in its lyrics. Midjourney is supposed to be just images https://t.co/W4zRqjEENc,en,0,1,0,0,51,1650955037856653312,476490904,1650953744723374083,476490904.0,,"{'annotations': [{'start': 16, 'end': 18, 'probability': 0.449, 'type': 'Organization', 'normalized_text': 'Sun'}, {'start': 46, 'end': 55, 'probability': 0.9214, 'type': 'Person', 'normalized_text': 'Elton John'}, {'start': 84, 'end': 90, 'probability': 0.7246, 'type': 'Other', 'normalized_text': 'chatGPT'}, {'start': 126, 'end': 135, 'probability': 0.6149, 'type': 'Other', 'normalized_text': 'Midjourney'}], 'urls': [{'start': 167, 'end': 190, 'url': 'https://t.co/W4zRqjEENc', 'expanded_url': 'https://twitter.com/quipsy/status/1650955037856653312/photo/1', 'display_url': 'pic.twitter.com/W4zRqjEENc', 'media_key': '3_1650955024564908066'}]}",almost perfect sun is going down like in the elton john song i would understand chatgpt having these songs in its lyrics midjourney is supposed to be just images,3.822272
721,2023-04-25T20:07:33.000Z,,"@TomBilyeu Check this one out from @mreflow \n\nGenerated with:\nChatGPT, ElevenLabs, Mubert, Gen2, MidJourney, LeiaPix, &amp; Genmo\nhttps://t.co/5LtaJujnOd",en,1,0,1,0,80,1650954723178971136,19281485,1650944450116608000,1087646485.0,,"{'mentions': [{'start': 0, 'end': 10, 'username': 'TomBilyeu', 'id': '1087646485'}, {'start': 35, 'end': 43, 'username': 'mreflow', 'id': '1544387652811493377'}], 'annotations': [{'start': 62, 'end': 68, 'probability': 0.9169, 'type': 'Other', 'normalized_text': 'ChatGPT'}, {'start': 71, 'end': 80, 'probability': 0.8462, 'type': 'Other', 'normalized_text': 'ElevenLabs'}, {'start': 83, 'end': 88, 'probability': 0.5968, 'type': 'Other', 'normalized_text': 'Mubert'}, {'start': 91, 'end': 94, 'probability': 0.8015, 'type': 'Other', 'normalized_text': 'Gen2'}, {'start': 97, 'end': 106, 'probability': 0.8167, 'type': 'Other', 'normalized_text': 'MidJourney'}, {'start': 109, 'end': 115, 'probability': 0.8468, 'type': 'Other', 'normalized_text': 'LeiaPix'}, {'start': 124, 'end': 128, 'probability': 0.6171, 'type': 'Other', 'normalized_text': 'Genmo'}], 'urls': [{'start': 130, 'end': 153, 'url': 'https://t.co/5LtaJujnOd', 'expanded_url': 'https://twitter.com/mreflow/status/1650669361311674368/video/1', 'display_url': 'pic.twitter.com/5LtaJujnOd', 'media_key': '7_1650668361532198912'}]}",tombilyeu check this one out from mreflow generated withchatgpt elevenlabs mubert gen2 midjourney leiapix genmo,2.878978


In [None]:
# Average sentiment score over all Midjourney tweets
mean_sentiment = mid_df['sentiment'].mean()
print(f"The mean opinion is {mean_sentiment}")

The mean opinion is 3.2872162117108004


  and should_run_async(code)


In [None]:
from scipy import stats
from statsmodels.stats.weightstats import ztest
from statsmodels.stats.multitest import multipletests

  and should_run_async(code)


In [None]:
# Two-sample t-test without the equal-variance assumption:
# ChatGPT vs DALL-E tweet sentiment
X0, X1 = gpt_df['sentiment'], dalle_df['sentiment']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: -0.4734
P-Value: 0.6360
Not significant at a 1% level.


  and should_run_async(code)


In [None]:
# Two-sample t-test without the equal-variance assumption:
# Midjourney vs DALL-E tweet sentiment
X0, X1 = mid_df['sentiment'], dalle_df['sentiment']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: 7.1621
P-Value: 0.0000
Significant at a 1% level.


  and should_run_async(code)


In [None]:
# Two-sample t-test without the equal-variance assumption:
# Midjourney vs ChatGPT tweet sentiment
X0, X1 = mid_df['sentiment'], gpt_df['sentiment']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: 11.7391
P-Value: 0.0000
Significant at a 1% level.


  and should_run_async(code)


In [None]:
# Mean sentiment per keyword group
mean_gpt_sentiment = gpt_df['sentiment'].mean()
mean_dalle_sentiment = dalle_df['sentiment'].mean()
mean_mid_sentiment = mid_df['sentiment'].mean()

keyword_labels = ['ChatGPT', 'Dall-E', 'Midjourney']
keyword_means = [mean_gpt_sentiment, mean_dalle_sentiment, mean_mid_sentiment]
plt.bar(keyword_labels, keyword_means, color=['red', 'blue', 'green'])

plt.title('Mean of Tweet Sentiment by Keywords')
plt.xlabel('Keywords')
plt.ylabel('Mean of Sentiment')

# The y axis is zoomed to 3.10-3.30, so bar heights visually exaggerate the
# differences between the means.
plt.ylim(3.10, 3.30)

plt.show()

# Engagement Analysis

In [None]:
# Engagement rate = interaction count / (impressions + 1); the +1 avoids a
# division by zero for tweets with no recorded impressions.
# assign() returns a new frame, so nothing is written into a filtered slice
# of df (which would raise SettingWithCopyWarning).
gpt_df = gpt_df.assign(
    like_engagement=gpt_df.like_count / (gpt_df.impression_count + 1),
    retweet_engagement=gpt_df.retweet_count / (gpt_df.impression_count + 1),
    reply_engagement=gpt_df.reply_count / (gpt_df.impression_count + 1),
)

dalle_df = dalle_df.assign(
    like_engagement=dalle_df.like_count / (dalle_df.impression_count + 1),
    retweet_engagement=dalle_df.retweet_count / (dalle_df.impression_count + 1),
    reply_engagement=dalle_df.reply_count / (dalle_df.impression_count + 1),
)

# The Midjourney rows previously divided by df.impression_count (the full
# dataset), which only produced the right values through index alignment;
# use mid_df's own impressions like the other two groups.
mid_df = mid_df.assign(
    like_engagement=mid_df.like_count / (mid_df.impression_count + 1),
    retweet_engagement=mid_df.retweet_count / (mid_df.impression_count + 1),
    reply_engagement=mid_df.reply_count / (mid_df.impression_count + 1),
)

In [None]:
# Two-sample t-test without the equal-variance assumption:
# DALL-E vs ChatGPT like engagement
X0, X1 = dalle_df['like_engagement'], gpt_df['like_engagement']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: 2.8102
P-Value: 0.0050
Significant at a 1% level.


  and should_run_async(code)


In [None]:
# Two-sample t-test without the equal-variance assumption:
# DALL-E vs ChatGPT retweet engagement
X0, X1 = dalle_df['retweet_engagement'], gpt_df['retweet_engagement']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: -1.8001
P-Value: 0.0719
Not significant at a 1% level.


  and should_run_async(code)


In [None]:
# Two-sample t-test without the equal-variance assumption:
# DALL-E vs ChatGPT reply engagement
X0, X1 = dalle_df['reply_engagement'], gpt_df['reply_engagement']
alpha = 0.01  # significance threshold

tstat, pval = stats.ttest_ind(X0, X1, equal_var=False)
print(f'T-Statistic: {tstat:.4f}\nP-Value: {pval:.4f}')
verdict = "Significant at a 1% level." if pval < alpha else "Not significant at a 1% level."
print(verdict)

T-Statistic: -2.6797
P-Value: 0.0074
Significant at a 1% level.


  and should_run_async(code)


In [None]:
# Mean like engagement per keyword group
mean_gpt_like = gpt_df['like_engagement'].mean()
mean_dalle_like = dalle_df['like_engagement'].mean()
mean_mid_like = mid_df['like_engagement'].mean()

keyword_labels = ['ChatGPT', 'Dall-E', 'Midjourney']
keyword_means = [mean_gpt_like, mean_dalle_like, mean_mid_like]
plt.bar(keyword_labels, keyword_means, color=['red', 'blue', 'green'])

plt.title('Mean of Like Engagement by Tweet Keywords')
plt.xlabel('Keywords')
plt.ylabel('Mean of Like Engagement')

plt.show()

In [None]:
# Mean retweet engagement per keyword group
mean_gpt_retweet = gpt_df['retweet_engagement'].mean()
mean_dalle_retweet = dalle_df['retweet_engagement'].mean()
mean_mid_retweet = mid_df['retweet_engagement'].mean()

keyword_labels = ['ChatGPT', 'Dall-E', 'Midjourney']
keyword_means = [mean_gpt_retweet, mean_dalle_retweet, mean_mid_retweet]
plt.bar(keyword_labels, keyword_means, color=['red', 'blue', 'green'])

plt.title('Mean of Retweet Engagement by Tweet Keywords')
plt.xlabel('Keywords')
plt.ylabel('Mean of Retweet Engagement')

plt.show()

In [None]:
# Mean reply engagement per keyword group
mean_gpt_reply = gpt_df['reply_engagement'].mean()
mean_dalle_reply = dalle_df['reply_engagement'].mean()
mean_mid_reply = mid_df['reply_engagement'].mean()

keyword_labels = ['ChatGPT', 'Dall-E', 'Midjourney']
keyword_means = [mean_gpt_reply, mean_dalle_reply, mean_mid_reply]
plt.bar(keyword_labels, keyword_means, color=['red', 'blue', 'green'])

plt.title('Mean of Reply Engagement by Tweet Keywords')
plt.xlabel('Keywords')
plt.ylabel('Mean of Reply Engagement')

plt.show()

# Centrality measurements

In [None]:
# Load the keyword tweets for the network analysis.
# NOTE(review): this is a relative path, unlike the Google Drive paths used by
# the other sections — confirm the file exists in the working directory.
fname_db = "AItweets_v2"
df = DB.fetch(table_name='keyword_tweets', path=fname_db)
df.head()

In [None]:
# Add a cleaned copy of each tweet, then drop tweets whose cleaned text is empty
df['text_clean'] = df['text'].map(ta.clean_tweet)
has_text = df['text_clean'].str.len() > 0
df = df[has_text]
nclean = df.shape[0]
print(f" {nclean} clean tweets")

In [None]:
# Build the interaction network from the tweets and report its size
G = interaction_network_from_tweets(df)

nv, ne = G.number_of_nodes(), G.number_of_edges()
print(f"Network has {nv} nodes and {ne} edges")

In [None]:
# Greedy modularity communities on the undirected version of the network
Comm = nx_comm.greedy_modularity_communities(G.to_undirected())

# Flatten to one (node, community number) pair per node; communities are
# numbered in the order returned above.
membership = [(node, number) for number, members in enumerate(Comm) for node in members]
V = [node for node, _ in membership]
C = [number for _, number in membership]

df_mod = pd.DataFrame({'screen_name':V, 'community':C})

ncomm_mod = df_mod['community'].nunique()
print(f"{ncomm_mod} modularity communities")

In [None]:
# Spectral communities; k is the argument passed to spectral_communities
k = 5
df_spec = spectral_communities(G, k)

# Count how many distinct community labels were produced
ncomm_spec = len(df_spec['community'].unique())
print(f"{ncomm_spec} spectral communities")

In [None]:
# Side-by-side bar charts of community sizes: modularity (left), spectral (right)
fig = plt.figure(figsize = (16,6))

panels = [
    (df_mod, f"{ncomm_mod} modularity communities"),
    (df_spec, f"{ncomm_spec} spectral communities"),
]
for position, (frame, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    ax = sns.countplot(data=frame, x="community")
    plt.xlabel("Community", fontsize = 14)
    plt.ylabel("Number of nodes", fontsize = 14)
    plt.title(panel_title, fontsize = 18)
    plt.xticks(fontsize = 12)
    plt.yticks(fontsize = 12)
    plt.grid()

plt.show()

In [None]:
# Per spectral community, count the non-null entries of every other column (presumably one row per node, i.e. the community size — verify against spectral_communities)
df_spec.groupby(by='community',as_index=False).count()

In [None]:
# Size of each modularity community = number of screen names assigned to it
df_mod_size = (
    df_mod
    .groupby(by='community', as_index=False)
    .count()
    .rename(columns={"screen_name": 'size'})
)

sizes = df_mod_size["size"]
size_min = sizes.min()
size_max = sizes.max()
size_mean = sizes.mean()

print(f"Minimum community size = {size_min:.3f} nodes")
print(f"Maximum community size = {size_max:.3f} nodes")
print(f"Mean community size = {size_mean:.3f} nodes")

In [None]:
# Node centralities, each a dict keyed by node id
Din = nx.in_degree_centrality(G)
Dout = nx.out_degree_centrality(G)
# Edges are reversed before computing eigenvector centrality to match the networkx convention
EC = nx.eigenvector_centrality(G.reverse(), max_iter=1000)

# One record per node, combining its username with the three centralities
dictionary_list = [
    {
        'author_id': author_id,
        'screen_name': G.nodes[author_id]['username'],
        'out_degree_centrality': Dout[author_id],
        'eigenvector_centrality': EC[author_id],
        'in_degree_centrality': Din[author_id],
    }
    for author_id in Din
]
df_centrality = pd.DataFrame(dictionary_list)

In [None]:
# Top screen names by the chosen centrality measure
centrality = "eigenvector_centrality"
nmax = 10  # how many screen names to show
ranked = df_centrality.sort_values(by=centrality, ascending=False)
ranked[['screen_name', centrality]].head(nmax)

In [None]:
# Preview the first rows of the centrality table
df_centrality.head()

In [None]:
# Every column after the first two (author_id, screen_name) is a centrality measure
Centrality_names = list(df_centrality.columns)[2:]
nmax = 10  # number of top screen names to plot

fig = plt.figure(figsize = (20,14))

# One bar chart per centrality measure, placed in successive cells of a 2 x 3 grid
for panel, centrality_name in enumerate(Centrality_names, start=1):
    df_plot = df_centrality.sort_values(by=centrality_name, ascending=False)
    plt.subplot(2, 3, panel)

    ax = sns.barplot(data=df_plot.iloc[:nmax], x='screen_name', y=centrality_name)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_ylabel(f"{centrality_name}")
    ax.set_xlabel('Screen name')
    ax.grid()

# Extra vertical spacing so the rotated screen names have room
plt.subplots_adjust(hspace=0.55)

plt.show()

# Keyword analysis for AI tweets

## Load up tweets for analysis

In [None]:
# Read the keyword_tweets table from the project database on the mounted Drive.
fname_db = '/content/drive/MyDrive/Social Media Analytics/AItweets_v2'
df = DB.fetch(path=fname_db, table_name="keyword_tweets")

# Row count before any de-duplication or cleaning.
n = len(df)
print(f"\n {n} tweets originally")

Remove duplicates

In [None]:
# n was set by the load cell and is the pre-deduplication row count.
print(f"{n} tweets originally")

# Keep the first row seen for each tweet id; .copy() gives an independent frame.
df = df.drop_duplicates(subset=['id'], keep='first').copy()

n = len(df)
print(f"\n {n} tweets after removing duplicates")

Clean text, remove bots

In [None]:
# Clean every tweet, then drop rows whose cleaned text is empty.
df['text_clean'] = df.text.apply(ta.clean_tweet)
df = df[df.text_clean.str.len() > 0]
nclean = len(df)
print(f"{n} tweets, {nclean} clean tweets")

# Phrases judged on inspection to be bot activity / spam. Any tweet whose
# cleaned text contains one of them is removed. To exclude another campaign,
# append its phrase to this list.
spam_phrases = [
    'health consultations we are revolutionizing the way people access medical',
    'ai revolution download to participate in the ai2earn economy',
    'total maximum supply only 10 units a great artificial intelligence project dont miss',
    'blockchain based virtual world that allows users to create build buy and sell',
    'neurobayes fairlaunch starts at',
    'live happily togesssa',
]

# Build one boolean mask over all phrases. regex=False matches the phrases
# literally (no character is treated as a regex operator) and na=False keeps
# the mask strictly boolean.
is_spam = np.zeros(len(df), dtype=bool)
for phrase in spam_phrases:
    is_spam |= df['text_clean'].str.contains(phrase, regex=False, na=False).to_numpy()

# .copy() makes df an independent frame so later cells can add columns to it
# without writing into a slice of the earlier frame.
df = df[~is_spam].copy()

Filter tweets that mention "AI" or "artificial intelligence" out of the larger dataset into a new DataFrame, `df_ai`

In [None]:
# Flag tweets that mention "artificial intelligence" or the standalone word "AI".
# assign() returns a new frame, so the flags are never written into a slice.
df = df.assign(
    # Literal, case-insensitive phrase match.
    keyword_artificial=df['text_clean'].str.contains(
        'artificial intelligence', case=False, regex=False),
    # \b word boundaries restrict the match to "ai" as a whole word. A plain
    # substring search would also flag words such as "said", "again" or
    # "email", which merely contain the letters "ai".
    keyword_ai=df['text_clean'].str.contains(r'\bai\b', case=False, regex=True),
)

print(f"'Artificial Intelligence': {df['keyword_artificial'].sum()} tweets \n")
print(f"'AI': {df['keyword_ai'].sum()} tweets ")

# Keep tweets matching either keyword, as an independent frame.
df_ai = df[df['keyword_ai'] | df['keyword_artificial']].copy()

## Engagement scores

In [None]:
# Engagement = interaction count / (impressions + 1). The +1 keeps the
# denominator non-zero for tweets with no recorded impressions.
for metric in ('like', 'retweet', 'reply'):
    df_ai[f'{metric}_engagement'] = df_ai[f'{metric}_count'] / (df_ai.impression_count + 1)

# Keep only rows where all three engagement scores are at most 1.
within_bounds = (
    (df_ai.like_engagement <= 1)
    & (df_ai.retweet_engagement <= 1)
    & (df_ai.reply_engagement <= 1)
)
df_ai = df_ai[within_bounds]

print(f"{len(df_ai)} total tweets")

## Identify and track keywords for engagement

In [None]:
# Keywords whose presence is compared against engagement in the t-test cells.
keywords2 = ['elon', 'economy', 'health', 'gpt', 'data', 'revolution', 'prompt', 'midjourney', 'artist', 'google']

# Work on an independent copy: plain assignment would only alias df_ai, so the
# keyword_* columns below would be written into df_ai itself (a filtered slice).
df_ai_copy = df_ai.copy()

for keyword in keywords2:
  # Case-insensitive substring match on the cleaned text
  df_ai_copy[f'keyword_{keyword}'] = df_ai_copy.text_clean.str.contains(keyword, case=False)
  n_matches = len(df_ai_copy[df_ai_copy[f'keyword_{keyword}']==True])
  print(f"\n {keyword}: {n_matches} tweets")

print('\n')
df_ai_copy.sample(n=2)

## Run t-tests

### Like engagement analysis

In [None]:
# Welch's t-test (unequal variances) on like engagement, comparing tweets
# without each keyword (X0) against tweets with it (X1).
alpha = 0.01  #significance level, the same for every keyword

for keyword in keywords2:
  has_keyword = df_ai_copy[f"keyword_{keyword}"]
  X0 = df_ai_copy.like_engagement[has_keyword==False].dropna().tolist()
  X1 = df_ai_copy.like_engagement[has_keyword==True].dropna().tolist()

  n0 = len(X0)
  n1 = len(X1)
  # An empty group has no mean; report NaN instead of triggering a numpy warning
  mu0 = np.mean(X0) if n0 else float("nan")
  mu1 = np.mean(X1) if n1 else float("nan")

  print(f"\n\nKeyword: {keyword}\nPresent? \tNumber of tweets\tMean like engagement")
  print(f"False\t\t{n0}\t\t\t{mu0:.3f}")
  print(f"True\t\t{n1}\t\t\t{mu1:.3f}")

  print('T-test')
  # With fewer than two observations in a group the test statistic is NaN,
  # which would otherwise fall through to "Not significant" below.
  if min(n0, n1) < 2:
    print(f"{keyword}: skipped, each group needs at least 2 tweets\n-----------------")
    continue

  (tstat, pval) = stats.ttest_ind(X0, X1, equal_var=False)

  print(f"{keyword}: t-stat = {tstat:.3f} ({pval:.3f})\n")
  if pval <= alpha:
    print("Significant at 1% level\n-----------------")
  else:
    print("Not significant at 1% level\n-----------------")

### Retweet engagement

In [None]:
# Welch's t-test (unequal variances) on retweet engagement, comparing tweets
# without each keyword (X0) against tweets with it (X1).
alpha = 0.01  #significance level, the same for every keyword

for keyword in keywords2:
  has_keyword = df_ai_copy[f"keyword_{keyword}"]
  X0 = df_ai_copy.retweet_engagement[has_keyword==False].dropna().tolist()
  X1 = df_ai_copy.retweet_engagement[has_keyword==True].dropna().tolist()

  n0 = len(X0)
  n1 = len(X1)
  # An empty group has no mean; report NaN instead of triggering a numpy warning
  mu0 = np.mean(X0) if n0 else float("nan")
  mu1 = np.mean(X1) if n1 else float("nan")

  print(f"\n\nKeyword: {keyword}\nPresent? \tNumber of tweets\tMean retweet engagement")
  print(f"False\t\t{n0}\t\t\t{mu0:.3f}")
  print(f"True\t\t{n1}\t\t\t{mu1:.3f}")

  print('T-test')
  # With fewer than two observations in a group the test statistic is NaN,
  # which would otherwise fall through to "Not significant" below.
  if min(n0, n1) < 2:
    print(f"{keyword}: skipped, each group needs at least 2 tweets\n-----------------")
    continue

  (tstat, pval) = stats.ttest_ind(X0, X1, equal_var=False)

  print(f"{keyword}: t-stat = {tstat:.3f} ({pval:.3f})\n")
  if pval <= alpha:
    print("Significant at 1% level\n-----------------")
  else:
    print("Not significant at 1% level\n-----------------")

### Reply engagement

In [None]:
# Welch's t-test (unequal variances) on reply engagement, comparing tweets
# without each keyword (X0) against tweets with it (X1).
alpha = 0.01  #significance level, the same for every keyword

for keyword in keywords2:
  has_keyword = df_ai_copy[f"keyword_{keyword}"]
  X0 = df_ai_copy.reply_engagement[has_keyword==False].dropna().tolist()
  X1 = df_ai_copy.reply_engagement[has_keyword==True].dropna().tolist()

  n0 = len(X0)
  n1 = len(X1)
  # An empty group has no mean; report NaN instead of triggering a numpy warning
  mu0 = np.mean(X0) if n0 else float("nan")
  mu1 = np.mean(X1) if n1 else float("nan")

  print(f"\n\nKeyword: {keyword}\nPresent? \tNumber of tweets\tMean reply engagement")
  print(f"False\t\t{n0}\t\t\t{mu0:.3f}")
  print(f"True\t\t{n1}\t\t\t{mu1:.3f}")

  print('T-test')
  # With fewer than two observations in a group the test statistic is NaN,
  # which would otherwise fall through to "Not significant" below.
  if min(n0, n1) < 2:
    print(f"{keyword}: skipped, each group needs at least 2 tweets\n-----------------")
    continue

  (tstat, pval) = stats.ttest_ind(X0, X1, equal_var=False)

  print(f"{keyword}: t-stat = {tstat:.3f} ({pval:.3f})\n")
  if pval <= alpha:
    print("Significant at 1% level\n-----------------")
  else:
    print("Not significant at 1% level\n-----------------")

In [None]:
# Convert the notebook at the path below to HTML using nbconvert.
# NOTE(review): the notebook path is hard-coded; confirm that this file exists
# in the runtime before running the cell.
!jupyter nbconvert --to html '/content/social_media_analytics/SMA_Project_Team6.ipynb'