In [1]:
from collect_data import *
from analysis import *
from datasets import *
from consts import *
import pandas as pd


### Data collection

In [2]:
# Create the video database object for the CSV path in consts.VIDEOS_CSV and
# (presumably) populate it from that file.
# NOTE(review): `datasets` and `consts` are used here as module names, but the
# import cell only has `from datasets import *` / `from consts import *`, which
# by themselves do not bind those names. Presumably they arrive through one of
# the other star imports — confirm, or add explicit `import datasets` /
# `import consts`.
vdb = datasets.VideoDatabase(consts.VIDEOS_CSV)
vdb.load_from_csv()

### Download data from YouTube

In [3]:
# Downloading from YouTube is opt-in: leave FETCH_NEW_DATA as False to work
# with the data already on disk, or set it to True and re-run this cell to
# fetch new videos and comments. Using a flag instead of commented-out code
# keeps "Restart & Run All" safe while making the download path runnable.
FETCH_NEW_DATA = False

if FETCH_NEW_DATA:
    channel_ids = consts.CHANNEL_IDS

    # How much to download per channel / per video.
    videos_per_channel = 20
    comment_pages_per_video = 10

    for channel in channel_ids:
        videos = get_videos_from_channel(channel, videos_per_channel)
        for video in videos:
            get_comments_from_video(video, comment_pages_per_video)

### Synchronize old data with the new

In [4]:
# Synchronise the database with the comments file at consts.COMMENTS_CSV
# (presumably reconciling previously stored comments with newly downloaded
# ones — see VideoDatabase.sync_comments_file to confirm).
vdb.sync_comments_file(consts.COMMENTS_CSV)

## Analysis

#### Prepare the data frame for further analysis

In [5]:
# Rebuild and re-sync the video database. These are the same three calls made
# in the data-collection cells; they are repeated here presumably so that the
# analysis section can be run on its own.
vdb = datasets.VideoDatabase(consts.VIDEOS_CSV)
vdb.load_from_csv()
vdb.sync_comments_file(consts.COMMENTS_CSV)

# Read the comments from the same file that was just synchronised, rather than
# a hard-coded "comments.csv", so the two paths cannot drift apart.
# Only the columns listed in consts.COMMENT_COLS are loaded.
df = pd.read_csv(consts.COMMENTS_CSV, usecols=consts.COMMENT_COLS, lineterminator="\n")

# Annotate every comment with its channel: videoId -> channelName -> channelId,
# using the lookup methods of the video database.
df["channelName"] = df["videoId"].apply(vdb.videoId_to_channelName)
df["channelId"] = df["channelName"].apply(vdb.channelName_to_channelId)

#### Plot the most prolific commenters from selected channels

In [6]:
# Draw the plot of the most prolific commenters from the annotated comments
# frame (plot_most_prolific comes from one of the star imports above).
plot_most_prolific(df)


#### Get the list of authors by number of written comments

In [7]:
# get_top_commenters returns an indexable collection; its first element is a
# data frame with authorDisplayName, authorChannelId and count columns (see the
# output below). Presumably there is one frame per channel — TODO confirm.
top_commenters = get_top_commenters(df)
# Show the ten top rows of the first frame.
top_commenters[0].head(10)

Unnamed: 0,authorDisplayName,authorChannelId,count
0,Wajiha Sethi,UC0UbVaelFT_Z3Yh8D9ofxJQ,76
1,RR 56,UCSJ_tarrKDCILo1xZBH9T6g,58
2,Mine-Finder,UCP0u_9DjnlbR9lTBvmS2OrA,36
3,Zak,UC8OprD40tUc9AyL9NDmOv0w,28
4,iva taiwan,UCfA9LJ0pW7sV-phB2jEq6Fg,24
5,JourneyMan Smitty,UCzLjekZQGmOT7t32oDUvJTQ,21
6,DRIZZLE,UCIkCDg1IZYneowG47-0wjhQ,20
7,Todd Birman,UCIB_mMnkzAesPlGoeavA5cA,19
8,Joseph Sonora,UC3XEpDanrCsH_k3yxkq_hZg,19
9,Mark Lasky,UCBBvZ9jWvpU4An_-1FgOyng,19


#### Get list of data frames with the comments of $n$ most prolific authors in each channel

In [8]:
# Collect the comments of the 5 most prolific authors; channelName and
# authorChannelId are left as None (presumably meaning "no filter", i.e. all
# channels and authors are considered — confirm against get_comments).
comments = get_comments(df, channelName=None, authorChannelId=None, top_authors=5)

In [9]:
# Number of data frames returned by get_comments.
len(comments)

15

In [10]:
# Preview the first five rows of the second data frame in the list.
comments[1].head(5)

Unnamed: 0,authorChannelId,authorDisplayName,channelId,textOriginal
3781,UCSJ_tarrKDCILo1xZBH9T6g,RR 56,UCupvZG-5ko_eiXAupbDfxWw,AMERICA 🇺🇸 IS WAITING FOR THE LAPTOP FROM HELL...
3782,UCSJ_tarrKDCILo1xZBH9T6g,RR 56,UCupvZG-5ko_eiXAupbDfxWw,HILLARY CLINTON AND THE FBI PAID FOR THE RUSSI...
3797,UCSJ_tarrKDCILo1xZBH9T6g,RR 56,UCupvZG-5ko_eiXAupbDfxWw,10% FOR THE BIG GUY CHINA 🇨🇳 JOE
3798,UCSJ_tarrKDCILo1xZBH9T6g,RR 56,UCupvZG-5ko_eiXAupbDfxWw,C-SPAN SWAMP RIGGED CALLS
3799,UCSJ_tarrKDCILo1xZBH9T6g,RR 56,UCupvZG-5ko_eiXAupbDfxWw,NEVER SAY HUNTER BIDEN LAPTOP FROM HELL ON C-S...


### Run our sentiment analysis model on the comments and inspect the results

In [11]:
# Pull the raw comment texts of the first data frame out as a plain list and
# run them through the sentiment model.
texts = comments[0]["textOriginal"].tolist()
sentiment = get_sentiment(data_list=texts)

In [13]:
# Inspect the first five predictions; each is a dict with a 'label' and a
# 'score' entry (see the output below).
sentiment[:5]

[{'label': 'LABEL_1', 'score': 0.999957799911499},
 {'label': 'LABEL_1', 'score': 0.9999663829803467},
 {'label': 'LABEL_1', 'score': 0.9999551773071289},
 {'label': 'LABEL_1', 'score': 0.989024817943573},
 {'label': 'LABEL_1', 'score': 0.9995189905166626}]

### Combine the previous three functions to flag possible trolls

In [16]:
# Run the full pipeline on the comments frame. flag_trolls prints a per-author
# summary as it goes and returns the display names it flagged (see the outputs
# below).
# NOTE(review): the execution counts jump from In [13] to In [16], so this
# output may come from an out-of-order session — re-check with
# "Restart & Run All".
trolls = flag_trolls(df)

Channel 1 has 16 negative comments out of 76 total comments
Channel 2 has 39 negative comments out of 58 total comments
Channel has been flagged as a possible troll channel
Channel name:  RR 56
Number of comments classified as negative:  39
Total number of comments:  58
Percentage of negative comments:  67.24137931034483 %
Channel 3 has 32 negative comments out of 40 total comments
Channel has been flagged as a possible troll channel
Channel name:  Mine-Finder
Number of comments classified as negative:  32
Total number of comments:  40
Percentage of negative comments:  80.0 %
Channel 4 has 58 negative comments out of 69 total comments
Channel has been flagged as a possible troll channel
Channel name:  Zak
Number of comments classified as negative:  58
Total number of comments:  69
Percentage of negative comments:  84.05797101449275 %
Channel 5 has 11 negative comments out of 24 total comments
Channel 6 has 25 negative comments out of 33 total comments
Channel has been flagged as a poss

In [17]:
# Show the flagged display names.
# NOTE(review): 'Zak' appears twice in the recorded output — presumably the
# same name was flagged for more than one channel; confirm whether the list
# should be de-duplicated or keyed by authorChannelId instead.
print(trolls)

['RR 56', 'Mine-Finder', 'Zak', 'Dino Steaks', 'Tyson Ballard', 'antonio ravello', 'Oppressed Speaker of truth', 'Douglas Reagan', 'Zak', 'Divided WEFALL', 'TONYTOM LEONARDOVACA', 'ASHLEY JANSIXMARTYR', 'Matt Foley']
