In [85]:
import glob
import os
import csv
import json
import textstat
import pandas as pd
from langdetect import detect
from scipy.stats import ttest_ind
from datetime import datetime

In [50]:
# calculate time since posted in hours
def get_time_since_posted(time_posted, time_now):
    """Return the number of hours elapsed between two timestamps.

    Args:
        time_posted: Start timestamp as a 'YYYY-MM-DD HH:MM:SS' string.
        time_now: End timestamp in the same format.

    Returns:
        float: ``time_now - time_posted`` in hours; negative when
        ``time_now`` is earlier than ``time_posted``.

    Raises:
        ValueError: If either string does not match the expected format.
    """
    timestamp_layout = '%Y-%m-%d %H:%M:%S'
    posted_at, sampled_at = (
        datetime.strptime(stamp, timestamp_layout)
        for stamp in (time_posted, time_now)
    )
    seconds_per_hour = 3600
    return (sampled_at - posted_at).total_seconds() / seconds_per_hour

In [51]:
# clean text
def clean_text(text):
    """Normalize a comment string before language detection and scoring.

    The text is lowercased, every character that cannot be encoded as
    ASCII is dropped, and runs of whitespace (including leading/trailing
    whitespace) are collapsed to single spaces.

    Args:
        text: The raw comment text.

    Returns:
        str: The normalized, ASCII-only text.
    """
    # Lowercase first, then round-trip through ASCII to discard anything
    # non-ASCII (errors='ignore' drops the character instead of raising).
    ascii_only = text.lower().encode(encoding='ascii', errors='ignore').decode()
    # split() with no argument splits on any whitespace run, so joining the
    # pieces with a single space removes the extra whitespace.
    return " ".join(ascii_only.split())

In [52]:
# store categories as lists of urls
# Every category starts out empty so that a line missing from subsets.txt
# cannot leave a name undefined (or holding a stale value from an earlier
# run of this cell in the same kernel).
reddit_music = []
reddit_gaming = []
reddit_politics = []
reddit_lifeprotips = []
youtube_music = []
youtube_gaming = []
youtube_news = []
youtube_howto = []

# `with` guarantees the file is closed even if parsing a line raises.
with open('data-with-sentiment/subsets.txt', 'r') as subsets:
    for line in subsets:
        # Drop spaces and single quotes so the bracketed list can be split
        # on commas directly.
        line = line.strip().replace(' ', '').replace('\'', '')
        # Everything between the first '[' and the first ']' on the line.
        urls = line[line.find('[') + 1:line.find(']')]
        # The first matching label wins; lines with no known label are ignored.
        if 'Reddit-Music' in line:
            reddit_music = urls.split(',')
        elif 'Reddit-gaming' in line:
            reddit_gaming = urls.split(',')
        elif 'Reddit-politics' in line:
            reddit_politics = urls.split(',')
        elif 'Reddit-LifeProTips' in line:
            reddit_lifeprotips = urls.split(',')
        elif 'YouTube-music' in line:
            youtube_music = urls.split(',')
        elif 'YouTube-gaming' in line:
            youtube_gaming = urls.split(',')
        elif 'YouTube-news' in line:
            youtube_news = urls.split(',')
        elif 'YouTube-howto' in line:
            youtube_howto = urls.split(',')

In [95]:
# Metrics averaged per sample. Each one is written to the CSV twice: once
# with a 'top_' prefix and once with a 'latest_' prefix.
METRICS = ('length', 'flesch', 'fog', 'coleman_liau', 'dale_chall')
YOUTUBE_WATCH_PREFIX = 'https://www.youtube.com/watch?v='


def _mean_readability(comments, text_key):
    """Average word count and readability scores over the English comments.

    Args:
        comments: Iterable of comment dicts.
        text_key: Key under which each dict stores the comment text.

    Returns:
        dict mapping each name in METRICS to its mean over the comments
        detected as English, or None when no comment qualified.
    """
    totals = dict.fromkeys(METRICS, 0)
    count = 0
    for comment in comments:
        text = clean_text(comment[text_key])
        try:
            language = detect(text)
        except Exception:
            # Best effort: a comment whose language cannot be detected is
            # skipped. `Exception` (not a bare except) so Ctrl-C still
            # interrupts a long run.
            continue
        if language != 'en':
            continue
        count += 1
        totals['length'] += textstat.lexicon_count(text, removepunct=True)
        totals['flesch'] += textstat.flesch_reading_ease(text)
        totals['fog'] += textstat.gunning_fog(text)
        totals['coleman_liau'] += textstat.coleman_liau_index(text)
        totals['dale_chall'] += textstat.dale_chall_readability_score(text)
    if count == 0:
        return None
    return {metric: total / count for metric, total in totals.items()}


with open('comments_analysis.csv', 'w', newline='') as csvfile:
    fieldnames = ['category', 'top_length', 'latest_length', 'top_flesch', 'latest_flesch', 'top_fog',
                  'latest_fog', 'top_coleman_liau', 'latest_coleman_liau', 'top_dale_chall',
                  'latest_dale_chall']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
    writer.writeheader()

    # Checked in this order; the first list containing the video ID wins.
    youtube_categories = (
        ('YouTube_music', youtube_music),
        ('YouTube_gaming', youtube_gaming),
        ('YouTube_news', youtube_news),
        ('YouTube_howto', youtube_howto),
    )

    # iterate through JSON files in folder
    # (sorted so the CSV row order is the same on every run)
    path = 'data-with-sentiment'
    for filename in sorted(glob.glob(os.path.join(path, '*.json'))):
        with open(filename, 'r') as json_file:
            data = json_file.read().replace('\n', '')
        data_dict = json.loads(data)

        # sample is from Reddit
        # NOTE(review): this tests the raw JSON text, not the parsed keys, so
        # a sample that merely mentions "post_url" anywhere is skipped too.
        # Kept as-is because the Reddit schema is not visible here - confirm
        # and switch to a key check if 'post_url' is a top-level key.
        if 'post_url' in data:
            continue
        # sample is from YouTube: needs a url plus both comment lists
        if ('url' not in data_dict
                or 'comments_since_last_sample' not in data_dict
                or 'top_comments' not in data_dict):
            continue

        url = data_dict['url']
        # find ID from url; URLs without the watch prefix carry no usable ID
        if YOUTUBE_WATCH_PREFIX not in url:
            continue
        video_id = url.split(YOUTUBE_WATCH_PREFIX, 1)[1]

        # get category; a video in none of the subsets is skipped instead of
        # silently inheriting the category of the previously processed file
        category = next(
            (label for label, ids in youtube_categories if video_id in ids),
            None,
        )
        if category is None:
            continue

        top = _mean_readability(data_dict['top_comments'], 'comment')
        if top is None:
            continue
        latest = _mean_readability(data_dict['comments_since_last_sample'], 'body')
        if latest is None:
            continue

        row = {'category': category}
        for metric in METRICS:
            row['top_' + metric] = top[metric]
            row['latest_' + metric] = latest[metric]
        writer.writerow(row)

In [96]:
# Print the per-category mean of every metric, 'top' before 'latest' for
# each metric, one small table per column.
df = pd.read_csv('comments_analysis.csv')
for metric in ('length', 'flesch', 'fog', 'coleman_liau', 'dale_chall'):
    for prefix in ('top', 'latest'):
        column = prefix + '_' + metric
        print(df.groupby('category', as_index=False)[column].mean())

         category  top_length
0  YouTube_gaming   19.111413
1   YouTube_howto   26.576713
2   YouTube_music   19.492736
3    YouTube_news   23.835697
         category  latest_length
0  YouTube_gaming      15.589032
1   YouTube_howto      21.105324
2   YouTube_music      15.871391
3    YouTube_news      24.198925
         category  top_flesch
0  YouTube_gaming   70.498404
1   YouTube_howto   54.066630
2   YouTube_music   71.039381
3    YouTube_news   62.843058
         category  latest_flesch
0  YouTube_gaming      78.782295
1   YouTube_howto      72.800227
2   YouTube_music      75.933806
3    YouTube_news      63.666325
         category    top_fog
0  YouTube_gaming   9.541218
1   YouTube_howto  13.164492
2   YouTube_music   9.721054
3    YouTube_news  11.935566
         category  latest_fog
0  YouTube_gaming    7.858483
1   YouTube_howto   10.218728
2   YouTube_music    8.342286
3    YouTube_news   12.335335
         category  top_coleman_liau
0  YouTube_gaming          7.001274
1  

In [102]:
# Welch's t-test (equal_var=False) of the gaming category against each of
# the other three categories, for a selected set of metric columns.
df = pd.read_csv('comments_analysis.csv')
gaming = df[df['category'] == 'YouTube_gaming']
music = df[df['category'] == 'YouTube_music']
howto = df[df['category'] == 'YouTube_howto']
news = df[df['category'] == 'YouTube_news']

# Comparison order within every section: howto, music, news.
comparisons = (
    ('gaming vs howto', howto),
    ('gaming vs music', music),
    ('gaming vs news', news),
)
tested_columns = (
    'top_length',
    'latest_length',
    'latest_flesch',
    'latest_coleman_liau',
    'top_dale_chall',
)

for column in tested_columns:
    # Section heading is the column name in upper case, e.g. TOP_LENGTH.
    print(column.upper())
    for label, other in comparisons:
        print(label)
        print(ttest_ind(gaming[column], other[column], equal_var = False))

TOP_LENGTH
gaming vs howto
Ttest_indResult(statistic=-6.629038046360958, pvalue=4.6143921530015244e-11)
gaming vs music
Ttest_indResult(statistic=-0.3306597154575763, pvalue=0.7409444328435829)
gaming vs news
Ttest_indResult(statistic=-4.332661828201152, pvalue=1.5703916485282754e-05)
LATEST_LENGTH
gaming vs howto
Ttest_indResult(statistic=-5.108773257799327, pvalue=3.61086751379626e-07)
gaming vs music
Ttest_indResult(statistic=-0.2841798236924197, pvalue=0.7763088682921617)
gaming vs news
Ttest_indResult(statistic=-7.366457920867803, pvalue=2.8351853536862405e-13)
LATEST_FLESCH
gaming vs howto
Ttest_indResult(statistic=4.307884275558191, pvalue=1.7438817092726638e-05)
gaming vs music
Ttest_indResult(statistic=2.1887547326663492, pvalue=0.028749145400807023)
gaming vs news
Ttest_indResult(statistic=10.75581862439065, pvalue=4.435675983080056e-26)
LATEST_COLEMAN_LIAU
gaming vs howto
Ttest_indResult(statistic=-3.3203525397553406, pvalue=0.000920696724646803)
gaming vs music
Ttest_indRes