# Import Necessary Libraries

In [None]:
!pip install seaborn

In [None]:
!pip install detoxify

In [None]:
!pip install transformers

In [None]:
!pip install wordcloud

In [None]:
import re
import pandas as pd
import seaborn as sns
import tensorflow as tf
from datetime import datetime
from detoxify import Detoxify
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None
from wordcloud import WordCloud, STOPWORDS
from pandas.core.groupby.groupby import DataFrame
from transformers import TFAutoModelForTokenClassification, AutoTokenizer

In [None]:
# Ask TensorFlow which GPU devices it can see (an empty list means none).
tf.config.list_physical_devices("GPU")

In [None]:
# Use seaborn's "whitegrid" style for the plots drawn after this call.
sns.set(style="whitegrid")

# Twitter Tweets Dataset

In [None]:
# Load the tweets CSV and turn the 'time_stamp' strings into datetimes.
timestamp_format = "%Y-%m-%dT%H:%M:%S"
twitter_data = pd.read_csv('twitter.csv')
twitter_data['time_stamp'] = pd.to_datetime(
    twitter_data['time_stamp'],
    format=timestamp_format,
)

In [None]:
# Preview the first five tweets.
twitter_data.head(5)

## Tweets Toxicity Scores

In [None]:
# Keep tweets from 2022-11-01 (inclusive) up to 2022-11-15 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 15).isoformat()
mask = (
    (twitter_data['time_stamp'] >= window_start)
    & (twitter_data['time_stamp'] < window_end)
)
t_data = twitter_data.loc[mask]

In [None]:
def toxicityScore(row):
    """Score one row's text with Detoxify and print the scores as percentages.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the text to score under the ``'text'`` key.

    Returns
    -------
    dict or None
        Detoxify's label -> score mapping with every score multiplied by
        100, or ``None`` when the cell does not hold a string.
    """
    text = row['text']
    if not isinstance(text, str):
        # e.g. pandas stores an empty CSV cell as float NaN, which has no
        # .lower(); skip such rows instead of aborting the whole apply().
        return None

    # Build the Detoxify model once and reuse it on later calls; creating it
    # inside every call repeats the model construction for each row.
    model = getattr(toxicityScore, '_model', None)
    if model is None:
        model = toxicityScore._model = Detoxify('original')

    # Build a fresh dict rather than updating the prediction dict while it
    # is being iterated.
    results = {label: score * 100
               for label, score in model.predict(text.lower()).items()}
    print(str(results) + '\n')
    return results

In [None]:
# Run the toxicity scorer on each selected tweet (one call per row).
out = t_data.apply(toxicityScore, axis=1)

## Tweets Context

In [None]:
# Select tweets with 2022-11-01 <= time_stamp < 2022-11-15.
first_day = datetime(2022, 11, 1).isoformat()
cutoff_day = datetime(2022, 11, 15).isoformat()
on_or_after_start = twitter_data['time_stamp'] >= first_day
before_cutoff = twitter_data['time_stamp'] < cutoff_day
mask = on_or_after_start & before_cutoff
t_data = twitter_data.loc[mask]

In [None]:
# Tally tweets per context into a two-column frame for plotting.
context_counts = t_data['context'].value_counts()
context_counts = context_counts.rename_axis('Context')
t_data = context_counts.reset_index(name='Number of Tweets')

In [None]:
# Horizontal bar chart of tweet counts per context, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [9.00, 3.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
context_plot = sns.barplot(data=t_data, x="Number of Tweets", y="Context", ax=ax)
context_plot.set(title="Number of tweets per context")
fig.savefig('number-of-tweets-per-context.pdf', dpi=1200)

## Top Hashtags

In [None]:
# Tweets posted on 2022-11-01 (midnight to midnight, end exclusive).
day_start = datetime(2022, 11, 1).isoformat()
day_end = datetime(2022, 11, 2).isoformat()
mask = (
    (twitter_data['time_stamp'] >= day_start)
    & (twitter_data['time_stamp'] < day_end)
)
t_data = twitter_data.loc[mask]

In [None]:
# Randomly keep at most 45000 tweets. The sample size is capped at the number
# of available rows because DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling is without replacement by default).
t_data = t_data.sample(n=min(45000, len(t_data)))

In [None]:
# Count hashtag occurrences (case-insensitively) over the selected tweets.
# All tags are extracted with the vectorised string methods and counted once,
# instead of building one value_counts Series per tweet and summing a wide,
# mostly-NaN frame. This also uses a raw-string regex and skips NaN text.
hashtag_counts = (t_data['text'].str.lower()
                  .str.findall(r'#\w+')
                  .explode()
                  .value_counts())
# value_counts already orders by descending count; expose it as a frame with
# the column names the plotting cell expects.
hashtags = hashtag_counts.rename_axis('Hashtag').reset_index(name='Occurences')
# Ranks 1-3 and 5-6: the fourth-ranked hashtag is left out, as before.
hashtags_1 = pd.concat([hashtags[:3], hashtags[4:6]])

In [None]:
# Tweets posted on 2022-11-02 (midnight to midnight, end exclusive).
day_start = datetime(2022, 11, 2).isoformat()
day_end = datetime(2022, 11, 3).isoformat()
not_too_early = twitter_data['time_stamp'] >= day_start
not_too_late = twitter_data['time_stamp'] < day_end
mask = not_too_early & not_too_late
t_data = twitter_data.loc[mask]

In [None]:
# Count hashtag occurrences (case-insensitively) over the selected tweets.
# Extract every tag with the vectorised string methods and count them in one
# pass rather than creating a value_counts Series per tweet; the regex is a
# raw string and rows whose text is NaN are ignored instead of raising.
hashtag_counts = (t_data['text'].str.lower()
                  .str.findall(r'#\w+')
                  .explode()
                  .value_counts())
# value_counts is already sorted by descending count.
hashtags = hashtag_counts.rename_axis('Hashtag').reset_index(name='Occurences')
# Five most frequent hashtags.
hashtags_2 = hashtags[:5]

In [None]:
# Side-by-side bar charts of the top hashtags for the two days, saved as PDF.
plt.rcParams.update({
    "figure.figsize": [9.00, 3.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots(1, 2)
panels = [
    (hashtags_1, "Top Hashtags on 11/01/2022"),
    (hashtags_2, "Top Hashtags on 11/02/2022"),
]
for panel_ax, (frame, panel_title) in zip(ax, panels):
    sns.barplot(x="Occurences", y="Hashtag", data=frame,
                ax=panel_ax).set(title=panel_title)
fig.savefig('top-hashtags.pdf', dpi=1200)

## Entity Recognition

In [None]:
# Tweets from 2022-11-01 (inclusive) up to 2022-11-03 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 3).isoformat()
mask = (
    (twitter_data['time_stamp'] >= window_start)
    & (twitter_data['time_stamp'] < window_end)
)
t_data = twitter_data.loc[mask]

In [None]:
# Load the pretrained token-classification model and a tokenizer.
# NOTE(review): the tokenizer is loaded from "bert-base-cased" while the model
# comes from a different checkpoint name; presumably both share the same
# vocabulary -- confirm.
model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Tag names indexed by the class id that the entity-recognition loops obtain
# from argmax over the model output.
# NOTE(review): the order is assumed to match the checkpoint's own id-to-label
# mapping -- verify against the model config.
label_list = ["O", "B-MISC", "I-MISC", "B-PER",
              "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

In [None]:
# Fresh state for the entity-recognition loop below.
entity_buckets = ('I-LOCX', 'I-ORGX', 'I-PERX', 'B-LOCX', 'B-ORGX', 'B-PERX')
master_dict = {bucket: [] for bucket in entity_buckets}  # tag + 'X' -> collected words
output = {}                                              # final lists per entity kind
word_temp = current_tag = old_tag = ''                   # word being assembled and its tags
count = 0

In [None]:
# Run the token-classification model over every selected tweet and collect the
# words tagged as location / organization / person into master_dict.
# Relies on model, tokenizer, label_list and the state variables (master_dict,
# word_temp, current_tag, old_tag, count, output) set up in earlier cells.
# NOTE(review): word_temp / current_tag / old_tag are not reset between rows,
# so a word still pending at the end of one text carries over into the next.
for ind in t_data.index:
    # NOTE(review): the text is lower-cased before being fed to a checkpoint
    # whose name says "cased" -- confirm this is intended.
    sequence = t_data['text'][ind].lower()
    # Skip long texts; this limit counts characters, not tokens.
    if len(sequence) > 512:
        continue
    # Token strings obtained by encoding, decoding and re-tokenizing the text;
    # presumably so the list includes the special tokens added by encode()
    # and lines up one-to-one with the predictions below -- confirm.
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="tf")
    outputs = model(inputs)[0]
    # Highest-scoring class id for each token position.
    predictions = tf.argmax(outputs, axis=2)
    # (token, tag name) pairs for this text.
    list_bert = [(token, label_list[prediction])
                 for token, prediction in zip(tokens, predictions[0].numpy())]
    for i in list_bert:
        if i[1] in ['O', 'B-MISC', 'I-MISC']:
            # Token is not a LOC/ORG/PER entity: store the pending word (if it
            # is longer than one character) under its tag and clear the state.
            if len(current_tag) > 0:
                without_space_word = word_temp.strip()
                if len(without_space_word) > 1:
                    master_dict[current_tag + 'X'].append(without_space_word)
            # count is reset/incremented in this loop but never read here.
            count = 0
            word_temp = ''
            current_tag = ''
            continue
        else:
            current_tag = i[1]

            # Tag changed since the previous entity token: store the word
            # built so far under the previous tag and start over.
            if old_tag != current_tag and len(old_tag) > 0:
                without_space_word = word_temp.strip()
                if len(without_space_word) > 1:
                    master_dict[old_tag + 'X'].append(without_space_word)
                count = 0
                word_temp = ''
                current_tag = ''

            # Tokens starting with '##' are glued onto the pending word
            # without a space; other entity tokens are added after a space.
            if i[0].startswith('##'):
                word_temp += i[0][2:].upper()
            elif i[1] in ['I-PER', 'I-ORG', 'I-LOC', 'B-LOC', 'B-ORG', 'B-PER']:
                word_temp += " " + i[0].upper()
                current_tag = i[1]
                count += 1
            old_tag = current_tag

    # Rebuild the per-kind lists from everything collected so far.
    # NOTE(review): this runs (and prints the cumulative output) once per
    # processed row because it sits inside the outer loop -- confirm intent.
    output['Location'] = list(master_dict['I-LOCX'] + master_dict['B-LOCX'])
    output['Organization'] = list(
        master_dict['I-ORGX'] + master_dict['B-ORGX'])
    output['Person Name'] = list(master_dict['I-PERX'] + master_dict['B-PERX'])
    print(output)

In [None]:
# Word cloud of the entities collected in `output`, saved as a PDF.
comment_words = ''
stopwords = set(STOPWORDS)
stopwords.update(['https', 't', 'co', 'rt'])

# Lower-case every collected entity and join them into one text blob.
collected_entities = (output['Location']
                      + output['Organization']
                      + output['Person Name'])
tokens = [entity.lower() for entity in collected_entities]
comment_words = comment_words + " ".join(tokens) + " "

cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(comment_words)

plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('tweets-wordcloud.pdf', dpi=1200)

# Subreddit Posts Comments Dataset

In [None]:
# Load the reddit CSV and turn the 'time_stamp' strings into datetimes.
timestamp_format = "%Y-%m-%dT%H:%M:%S"
reddit_data = pd.read_csv('reddit.csv')
reddit_data['time_stamp'] = pd.to_datetime(
    reddit_data['time_stamp'],
    format=timestamp_format,
)

In [None]:
# Preview the first five reddit rows.
reddit_data.head(5)

## Subreddit Comments Toxicity Scores

In [None]:
# Keep reddit rows from 2022-11-01 (inclusive) up to 2022-11-15 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 15).isoformat()
mask = (
    (reddit_data['time_stamp'] >= window_start)
    & (reddit_data['time_stamp'] < window_end)
)
r_data = reddit_data.loc[mask]

In [None]:
def toxicityScore(row):
    """Score one row's text with Detoxify and print the scores as percentages.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the text to score under the ``'text'`` key.

    Returns
    -------
    dict or None
        Detoxify's label -> score mapping with every score multiplied by
        100, or ``None`` when the cell does not hold a string.
    """
    text = row['text']
    if not isinstance(text, str):
        # e.g. pandas stores an empty CSV cell as float NaN, which has no
        # .lower(); skip such rows instead of aborting the whole apply().
        return None

    # Build the Detoxify model once and reuse it on later calls; creating it
    # inside every call repeats the model construction for each row.
    model = getattr(toxicityScore, '_model', None)
    if model is None:
        model = toxicityScore._model = Detoxify('original')

    # Build a fresh dict rather than updating the prediction dict while it
    # is being iterated.
    results = {label: score * 100
               for label, score in model.predict(text.lower()).items()}
    print(str(results) + '\n')
    return results

In [None]:
# Run the toxicity scorer on each selected reddit row (one call per row).
out = r_data.apply(toxicityScore, axis=1)

## Total Comments Per Subreddit

In [None]:
# Select reddit rows with 2022-11-01 <= time_stamp < 2022-11-15.
first_day = datetime(2022, 11, 1).isoformat()
cutoff_day = datetime(2022, 11, 15).isoformat()
on_or_after_start = reddit_data['time_stamp'] >= first_day
before_cutoff = reddit_data['time_stamp'] < cutoff_day
mask = on_or_after_start & before_cutoff
r_data = reddit_data.loc[mask]

In [None]:
# Tally comments per subreddit into a two-column frame for plotting.
subreddit_counts = r_data['subreddit'].value_counts()
subreddit_counts = subreddit_counts.rename_axis('Subreddit')
r_data = subreddit_counts.reset_index(name='Number of Comments')

In [None]:
# Scatter plot of comment counts per subreddit (marker size follows the
# count), saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [9.00, 3.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
subreddit_plot = sns.scatterplot(
    data=r_data,
    x="Subreddit",
    y="Number of Comments",
    size="Number of Comments",
    ax=ax,
)
subreddit_plot.set(title="Number of comments per Subreddit")
fig.savefig('total-comments-per-subreddit.pdf', dpi=1200)

## Entity Recognition

In [None]:
# Reddit rows from 2022-11-01 (inclusive) up to 2022-11-03 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 3).isoformat()
mask = (
    (reddit_data['time_stamp'] >= window_start)
    & (reddit_data['time_stamp'] < window_end)
)
r_data = reddit_data.loc[mask]

In [None]:
# Fresh state for the entity-recognition loop below.
entity_buckets = ('I-LOCX', 'I-ORGX', 'I-PERX', 'B-LOCX', 'B-ORGX', 'B-PERX')
master_dict = {bucket: [] for bucket in entity_buckets}  # tag + 'X' -> collected words
output = {}                                              # final lists per entity kind
word_temp = current_tag = old_tag = ''                   # word being assembled and its tags
count = 0

In [None]:
# Run the token-classification model over every selected reddit text and
# collect the words tagged as location / organization / person into
# master_dict.
# Relies on model, tokenizer, label_list and the state variables (master_dict,
# word_temp, current_tag, old_tag, count, output) set up in earlier cells.
# NOTE(review): word_temp / current_tag / old_tag are not reset between rows,
# so a word still pending at the end of one text carries over into the next.
for ind in r_data.index:
    # NOTE(review): the text is lower-cased before being fed to a checkpoint
    # whose name says "cased" -- confirm this is intended.
    sequence = r_data['text'][ind].lower()
    # Skip long texts; this limit counts characters, not tokens.
    if len(sequence) > 512:
        continue
    # Token strings obtained by encoding, decoding and re-tokenizing the text;
    # presumably so the list includes the special tokens added by encode()
    # and lines up one-to-one with the predictions below -- confirm.
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="tf")
    outputs = model(inputs)[0]
    # Highest-scoring class id for each token position.
    predictions = tf.argmax(outputs, axis=2)
    # (token, tag name) pairs for this text.
    list_bert = [(token, label_list[prediction])
                 for token, prediction in zip(tokens, predictions[0].numpy())]
    for i in list_bert:
        if i[1] in ['O', 'B-MISC', 'I-MISC']:
            # Token is not a LOC/ORG/PER entity: store the pending word (if it
            # is longer than one character) under its tag and clear the state.
            if len(current_tag) > 0:
                without_space_word = word_temp.strip()
                if len(without_space_word) > 1:
                    master_dict[current_tag + 'X'].append(without_space_word)
            # count is reset/incremented in this loop but never read here.
            count = 0
            word_temp = ''
            current_tag = ''
            continue
        else:
            current_tag = i[1]

            # Tag changed since the previous entity token: store the word
            # built so far under the previous tag and start over.
            if old_tag != current_tag and len(old_tag) > 0:
                without_space_word = word_temp.strip()
                if len(without_space_word) > 1:
                    master_dict[old_tag + 'X'].append(without_space_word)
                count = 0
                word_temp = ''
                current_tag = ''

            # Tokens starting with '##' are glued onto the pending word
            # without a space; other entity tokens are added after a space.
            if i[0].startswith('##'):
                word_temp += i[0][2:].upper()
            elif i[1] in ['I-PER', 'I-ORG', 'I-LOC', 'B-LOC', 'B-ORG', 'B-PER']:
                word_temp += " " + i[0].upper()
                current_tag = i[1]
                count += 1
            old_tag = current_tag

    # Rebuild the per-kind lists from everything collected so far.
    # NOTE(review): this runs (and prints the cumulative output) once per
    # processed row because it sits inside the outer loop -- confirm intent.
    output['Location'] = list(master_dict['I-LOCX'] + master_dict['B-LOCX'])
    output['Organization'] = list(
        master_dict['I-ORGX'] + master_dict['B-ORGX'])
    output['Person Name'] = list(master_dict['I-PERX'] + master_dict['B-PERX'])
    print(output)

In [None]:
# Word cloud of the entities collected in `output`, saved as a PDF.
comment_words = ''
stopwords = set(STOPWORDS)
stopwords.update(['https', 't', 'co', 'rt'])

# Lower-case every collected entity and join them into one text blob.
collected_entities = (output['Location']
                      + output['Organization']
                      + output['Person Name'])
tokens = [entity.lower() for entity in collected_entities]
comment_words = comment_words + " ".join(tokens) + " "

cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(comment_words)

plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('reddit-wordcloud.pdf', dpi=1200)

# YouTube Videos Comments Dataset

In [None]:
# Load the YouTube CSV and turn 'comment_timestamp' strings into datetimes.
timestamp_format = "%Y-%m-%dT%H:%M:%S"
youtube_data = pd.read_csv('youtube.csv')
youtube_data['comment_timestamp'] = pd.to_datetime(
    youtube_data['comment_timestamp'],
    format=timestamp_format,
)

In [None]:
# Preview the first five YouTube comment rows.
youtube_data.head(5)

## YouTube Comments Toxicity Scores

In [None]:
# Keep comments from 2022-11-01 (inclusive) up to 2022-11-15 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 15).isoformat()
mask = (
    (youtube_data['comment_timestamp'] >= window_start)
    & (youtube_data['comment_timestamp'] < window_end)
)
y_data = youtube_data.loc[mask]

In [None]:
def toxicityScore(row):
    """Score one comment with Detoxify and print the scores as percentages.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the text to score under the ``'comment_text'`` key.

    Returns
    -------
    dict or None
        Detoxify's label -> score mapping with every score multiplied by
        100, or ``None`` when the cell does not hold a string.
    """
    text = row['comment_text']
    if not isinstance(text, str):
        # e.g. pandas stores an empty CSV cell as float NaN, which has no
        # .lower(); skip such rows instead of aborting the whole apply().
        return None

    # Build the Detoxify model once and reuse it on later calls; creating it
    # inside every call repeats the model construction for each row.
    model = getattr(toxicityScore, '_model', None)
    if model is None:
        model = toxicityScore._model = Detoxify('original')

    # Build a fresh dict rather than updating the prediction dict while it
    # is being iterated.
    results = {label: score * 100
               for label, score in model.predict(text.lower()).items()}
    print(str(results) + '\n')
    return results

In [None]:
# Run the toxicity scorer on each selected comment (one call per row).
out = y_data.apply(toxicityScore, axis=1)

## Videos With Most Comments

In [None]:
# Select comments with 2022-11-01 <= comment_timestamp < 2022-11-15.
first_day = datetime(2022, 11, 1).isoformat()
cutoff_day = datetime(2022, 11, 15).isoformat()
on_or_after_start = youtube_data['comment_timestamp'] >= first_day
before_cutoff = youtube_data['comment_timestamp'] < cutoff_day
mask = on_or_after_start & before_cutoff
y_data = youtube_data.loc[mask]

In [None]:
# Ten video titles with the most comments, as a two-column frame.
comments_per_video = y_data['video_title'].value_counts()
most_commented = comments_per_video.head(10).rename_axis('Video')
y_data = most_commented.reset_index(name='Number of Comments')

In [None]:
# Horizontal bar chart of the most-commented videos, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [15.00, 3.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
video_plot = sns.barplot(data=y_data, x="Number of Comments", y="Video", ax=ax)
video_plot.set(title="Videos with most comments")
fig.savefig('videos-with-most-comments.pdf', dpi=1200)

## Authors With Most Comments

In [None]:
# Comments from 2022-11-01 (inclusive) up to 2022-11-15 (exclusive).
window_start = datetime(2022, 11, 1).isoformat()
window_end = datetime(2022, 11, 15).isoformat()
mask = (
    (youtube_data['comment_timestamp'] >= window_start)
    & (youtube_data['comment_timestamp'] < window_end)
)
y_data = youtube_data.loc[mask]

In [None]:
# Ten authors with the most comments, as a two-column frame.
comments_per_author = y_data['author'].value_counts()
most_active = comments_per_author.head(10).rename_axis('Author')
y_data = most_active.reset_index(name='Number of Comments')

In [None]:
# Horizontal bar chart of the most active authors, saved as a PDF.
# Font settings are switched to STIX for this and later figures.
plt.rcParams.update({
    'text.usetex': False,
    'font.family': 'stixgeneral',
    'mathtext.fontset': 'stix',
    "figure.figsize": [15.00, 3.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
author_plot = sns.barplot(data=y_data, x="Number of Comments", y="Author", ax=ax)
author_plot.set(title="Authors with most comments")
fig.savefig('authors-with-most-comments.pdf', dpi=1200)

## Common Words In Comments Related To Scam

In [None]:
# Select comments with 2022-11-01 <= comment_timestamp < 2022-11-15.
first_day = datetime(2022, 11, 1).isoformat()
cutoff_day = datetime(2022, 11, 15).isoformat()
on_or_after_start = youtube_data['comment_timestamp'] >= first_day
before_cutoff = youtube_data['comment_timestamp'] < cutoff_day
mask = on_or_after_start & before_cutoff
y_data = youtube_data.loc[mask]

In [None]:
# Word cloud of comments whose author name mentions "telegram" or "whatsapp"
# (case-insensitive), saved as a PDF.
stopwords = set(STOPWORDS)
stopwords.update(('b', 'br'))

# Select the matching rows in one vectorised pass instead of looking up each
# cell by index label inside a Python loop. astype(str) mirrors the str()
# calls previously applied to every cell (so NaN still becomes 'nan').
authors = y_data['author'].astype(str).str.lower()
author_matches = (authors.str.contains('telegram', regex=False)
                  | authors.str.contains('whatsapp', regex=False))
selected_comments = (y_data.loc[author_matches, 'comment_text']
                     .astype(str)
                     .str.lower())

# One join builds the text blob; each comment is followed by a single space.
# The earlier split(" ") / " ".join round trip left the text unchanged and
# the repeated += re-copied the growing string.
comment_words = "".join(comment + " " for comment in selected_comments)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig('youtube-wordcloud.pdf', dpi=1200)

# Common Plots

In [None]:
# Tweets per hour of day, summed over 2022-11-01 .. 2022-11-04 (the end
# bound 2022-11-05 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
twitter_data['time_stamp'] = pd.to_datetime(
    twitter_data['time_stamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((twitter_data['time_stamp'] >= datetime(2022, 11, 1))
        & (twitter_data['time_stamp'] < datetime(2022, 11, 5)))
# Count rows per hour directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
hourly_counts = (twitter_data.loc[mask, 'time_stamp']
                 .dt.hour
                 .value_counts()
                 .sort_index())
t_data = hourly_counts.rename_axis('Hours').reset_index(
    name='Number of Submissions')
t_data['Platform'] = 'twitter'
# Shift hour labels from 0-23 to 1-24.
t_data['Hours'] = t_data['Hours'] + 1

In [None]:
# Reddit submissions per hour of day, summed over 2022-11-01 .. 2022-11-04
# (the end bound 2022-11-05 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
reddit_data['time_stamp'] = pd.to_datetime(
    reddit_data['time_stamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((reddit_data['time_stamp'] >= datetime(2022, 11, 1))
        & (reddit_data['time_stamp'] < datetime(2022, 11, 5)))
# Count rows per hour directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
hourly_counts = (reddit_data.loc[mask, 'time_stamp']
                 .dt.hour
                 .value_counts()
                 .sort_index())
r_data = hourly_counts.rename_axis('Hours').reset_index(
    name='Number of Submissions')
r_data['Platform'] = 'reddit'
# Shift hour labels from 0-23 to 1-24.
r_data['Hours'] = r_data['Hours'] + 1

In [None]:
# YouTube comments per hour of day, summed over 2022-11-01 .. 2022-11-04
# (the end bound 2022-11-05 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
youtube_data['comment_timestamp'] = pd.to_datetime(
    youtube_data['comment_timestamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((youtube_data['comment_timestamp'] >= datetime(2022, 11, 1))
        & (youtube_data['comment_timestamp'] < datetime(2022, 11, 5)))
# Count rows per hour directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
hourly_counts = (youtube_data.loc[mask, 'comment_timestamp']
                 .dt.hour
                 .value_counts()
                 .sort_index())
y_data = hourly_counts.rename_axis('Hours').reset_index(
    name='Number of Submissions')
y_data['Platform'] = 'youtube'
# Shift hour labels from 0-23 to 1-24.
y_data['Hours'] = y_data['Hours'] + 1

In [None]:
# Stack the three per-platform frames. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it every platform contributes the same
# index labels, and plotting code that aligns on the index can fail on the
# duplicates.
final = pd.concat([t_data, r_data, y_data], ignore_index=True)

In [None]:
# One line per platform: submissions by hour of day, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [10.00, 5.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
hourly_plot = sns.lineplot(
    data=final,
    x="Hours",
    y="Number of Submissions",
    hue="Platform",
    ax=ax,
)
hourly_plot.set(title="Hourly Submissions Trend")
fig.savefig('hourly-submissions-trend.pdf', dpi=1200)

In [None]:
# Tweets per calendar day for 2022-11-01 .. 2022-11-20 (the end bound
# 2022-11-21 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
twitter_data['time_stamp'] = pd.to_datetime(
    twitter_data['time_stamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((twitter_data['time_stamp'] >= datetime(2022, 11, 1))
        & (twitter_data['time_stamp'] < datetime(2022, 11, 21)))
# Count rows per day directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
daily_counts = (twitter_data.loc[mask, 'time_stamp']
                .dt.floor('D')
                .value_counts()
                .sort_index())
t_data = daily_counts.rename_axis('Date').reset_index(
    name='Number of Submissions')
t_data['Platform'] = 'twitter'

In [None]:
# Reddit submissions per calendar day for 2022-11-01 .. 2022-11-20 (the end
# bound 2022-11-21 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
reddit_data['time_stamp'] = pd.to_datetime(
    reddit_data['time_stamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((reddit_data['time_stamp'] >= datetime(2022, 11, 1))
        & (reddit_data['time_stamp'] < datetime(2022, 11, 21)))
# Count rows per day directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
daily_counts = (reddit_data.loc[mask, 'time_stamp']
                .dt.floor('D')
                .value_counts()
                .sort_index())
r_data = daily_counts.rename_axis('Date').reset_index(
    name='Number of Submissions')
r_data['Platform'] = 'reddit'

In [None]:
# YouTube comments per calendar day for 2022-11-01 .. 2022-11-20 (the end
# bound 2022-11-21 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
youtube_data['comment_timestamp'] = pd.to_datetime(
    youtube_data['comment_timestamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((youtube_data['comment_timestamp'] >= datetime(2022, 11, 1))
        & (youtube_data['comment_timestamp'] < datetime(2022, 11, 21)))
# Count rows per day directly. This replaces the value_counts -> sort ->
# groupby -> sum chain, drops a reset_index() whose result was discarded, and
# no longer needs DataFrame imported from a private pandas module.
daily_counts = (youtube_data.loc[mask, 'comment_timestamp']
                .dt.floor('D')
                .value_counts()
                .sort_index())
y_data = daily_counts.rename_axis('Date').reset_index(
    name='Number of Submissions')
y_data['Platform'] = 'youtube'

In [None]:
# Stack the three per-platform frames. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it every platform contributes the same
# index labels, and plotting code that aligns on the index can fail on the
# duplicates.
final = pd.concat([t_data, r_data, y_data], ignore_index=True)

In [None]:
# One line per platform: submissions per day, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [10.00, 5.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
daily_plot = sns.lineplot(
    data=final,
    x="Date",
    y="Number of Submissions",
    hue="Platform",
    ax=ax,
)
daily_plot.set(title="Daily Submissions Trend")
fig.savefig('daily-submissions-trend.pdf', dpi=1200)

# Required Plots

In [None]:
# Load the tweet totals; the plot below reads its "Date" and
# "Number of Tweets" columns.
total_tweets_count = pd.read_csv('tweets_count.csv')

In [None]:
# Line chart of the total number of tweets per date, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [10.00, 5.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
plt.xticks(rotation=45)
totals_plot = sns.lineplot(
    data=total_tweets_count,
    x="Date",
    y="Number of Tweets",
    ax=ax,
)
totals_plot.set(title="Total Number of Tweets")
fig.savefig('total-tweets.pdf', dpi=1200)

In [None]:
# Reddit submissions per clock hour for 2022-11-04 .. 2022-11-14 (the end
# bound 2022-11-15 is exclusive).
# Parsing again is harmless when the column already holds datetimes.
reddit_data['time_stamp'] = pd.to_datetime(
    reddit_data['time_stamp'], format="%Y-%m-%dT%H:%M:%S")
mask = ((reddit_data['time_stamp'] >= datetime(2022, 11, 4))
        & (reddit_data['time_stamp'] < datetime(2022, 11, 15)))
# Count rows per hour bucket directly. This replaces the value_counts ->
# sort -> groupby -> sum chain, drops a reset_index() whose result was
# discarded, and no longer needs DataFrame imported from a private pandas
# module. The lowercase 'h' alias is used because the uppercase 'H' spelling
# is deprecated in recent pandas releases.
per_hour = (reddit_data.loc[mask, 'time_stamp']
            .dt.floor('h')
            .value_counts()
            .sort_index())
r_data = per_hour.rename_axis('Date').reset_index(
    name='Number of Submissions')

In [None]:
# Line chart of reddit submissions over time, saved as a PDF.
plt.rcParams.update({
    "figure.figsize": [10.00, 5.50],
    "figure.autolayout": True,
})
fig, ax = plt.subplots()
submissions_plot = sns.lineplot(
    data=r_data,
    x="Date",
    y="Number of Submissions",
    ax=ax,
)
submissions_plot.set(title="Total Number of Submissions")
fig.savefig('total-subreddit-submissions.pdf', dpi=1200)