In [24]:
import os
import json
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [25]:
df = pd.read_csv("US_youtube_trending_data.csv")

In [26]:
category_ids = dict()
with open('US_category_id.json', 'r') as f:
    category_data = json.load(f)

for category in category_data['items']:
    category_ids[category['id']] = category['snippet']['title']


In [27]:
df['category_id_str'] = df['categoryId'].astype('string')
df['category_name'] = df['category_id_str'].map(category_ids)
df.drop('category_id_str', axis=1, inplace=True)

In [28]:
df[['publishedAt', 'trending_date']] = df[['publishedAt', 'trending_date']].apply(pd.to_datetime)
df['days_to_trend'] = (df['trending_date'] - df['publishedAt']).dt.days

In [29]:
df = df.reindex(columns=['video_id', 'title', 'channelId', 'channelTitle', 'categoryId', 'category_name',
        'publishedAt', 'trending_date', 'days_to_trend','comment_count', 'view_count', 'likes', 'dislikes', 
        'comments_disabled', 'thumbnail_link', 'ratings_disabled', 'description', 'tags'])
df.drop('thumbnail_link', axis=1, inplace=True)
df.rename(columns = {'channelId':'channel_id', 'channelTitle':'channel_title', 'publishedAt': 'published_at',
                     'categoryId': 'category_id'}, inplace = True)

In [33]:
def get_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_dict = analyzer.polarity_scores(text)
    if sentiment_dict['compound'] >= 0.05:
        return 'positive'
    elif sentiment_dict['compound'] <= - 0.05:
        return 'negative'
    else:
        return 'neutral'
    
df = df.assign(desc_sentiment = lambda x: get_sentiment(x))

print('Count - unique description sentiments:\t{count}'.format(count=len(pd.unique(df['desc_sentiment']))))

df.drop('desc_sentiment', axis=1, inplace=True)

Count - unique description sentiments:	1


In [101]:
df2 = df.copy()
df2.sort_values(by = 'trending_date', ascending=False, inplace=True)
df2.reset_index(inplace=True)
df2.drop(df2[df2['comments_disabled'] == True].index, inplace = True)
n = len(df2.axes[0]) - 120
df2.drop(df2.tail(n).index, inplace=True)

In [10]:
def write_comments_to_json(video_id):
    result = {'video_id': video_id}
    
    driver = webdriver.Chrome()
    driver.get('https://www.youtube.com/watch?v={id}'.format(id=video_id))
    
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'description')))

    html = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'html')))
    html.send_keys(Keys.PAGE_DOWN)
    html.send_keys(Keys.PAGE_DOWN)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'contents')))
    for i in range(10):
        html.send_keys(Keys.END)
        time.sleep(2)
    
    comment_elems = driver.find_elements(By.XPATH, '//*[@id="content-text"]')
    
    comments = [elem.text for elem in comment_elems][0:100]
    if len(comments) < 100:
        pass
    else:
        result['comments'] = comments
        with open("comments/{filename}.json".format(filename=video_id), 'w') as f:
            json.dump(result, f)
        
    driver.close()

video_ids = df2["video_id"].to_list()

for video_id in video_ids:
    write_comments_to_json(video_id)


In [106]:
results = list()
analyzer = SentimentIntensityAnalyzer()
directory = 'comments'
 
for filename in os.listdir(directory):
    fp = os.path.join(directory, filename)
    if os.path.isfile(fp) and fp.endswith('.json'):
        with open(fp, 'r') as f:
            data = json.load(f)
            result = {'video_id': data['video_id'], 'positive': 0, 'negative': 0, 'neutral': 0}
            for comment in data["comments"]:
                sentiment_dict = analyzer.polarity_scores(comment)
                if sentiment_dict['compound'] >= 0.05:
                    result['positive'] += 1
                elif sentiment_dict['compound'] <= - 0.05:
                    result['negative'] += 1
                else:
                    result['neutral'] += 1
            results.append(result)

df3 = pd.DataFrame.from_dict(results)


In [107]:
df4 = pd.merge(df3, df2,  on='video_id', how='left')
df4 = df4.drop(columns=['index'])
df4.to_csv('output/ytsentiment.csv', index=False)