In [24]:
import json
import re
import pickle
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
#for YouTube video scraping
import googleapiclient.discovery
import urllib3
from bs4 import BeautifulSoup
import requests
import unicodedata
# NOTE(review): this key is committed in plain text and should be treated as
# compromised -- rotate it. Supply the real key through the YOUTUBE_API_KEY
# environment variable; the literal is kept only as a fallback so existing
# runs keep working until the key is rotated.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "AIzaSyA2l1Gs_fWKE8-UVWhMgVPmF3Bo2-Sci7U")
#for SVD
from sklearn.decomposition import TruncatedSVD
# Sentiment analysis: fetch the VADER lexicon, then build one shared analyzer.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared analyzer; its polarity_scores() is applied to every comment further down.
sid = SentimentIntensityAnalyzer()


# General-purpose Treebank tokenizer shared by all text inputs.
tokenizer = TreebankWordTokenizer()


def tokenize(text):
    """Lower-case ``text`` and return its Treebank tokens as a list."""
    return tokenizer.tokenize(text.lower())


def claps_to_nums(claps):
    """Convert a clap label such as ``"2.3K claps"`` into a number.

    Parameters
    ----------
    claps : str or number
        Label whose first whitespace-separated token is the count, optionally
        suffixed with ``K`` (thousands) or ``M`` (millions). Falsy values
        (``0``, ``""``, ``None``) and whitespace-only strings mean "no claps".

    Returns
    -------
    int or float
        ``0`` when there is no count, otherwise the count as a float.

    Raises
    ------
    ValueError
        If the numeric part of the label cannot be parsed.
    """
    if not claps:
        return 0
    tokens = str(claps).split()
    if not tokens:
        return 0
    # Drop thousands separators such as "1,234".
    num = tokens[0].replace(",", "")
    multipliers = {"K": 1000, "M": 1000000}
    factor = multipliers.get(num[-1].upper())
    if factor is None:
        return float(num)
    # Round after scaling: 2.3 * 1000 evaluates to 2299.9999999999995 in
    # binary floating point, which a later int() would truncate to 2299.
    return float(round(float(num[:-1]) * factor))

# Text corpora for the vectorizer: Medium article text and YouTube video
# titles, each with the item's tags appended when it has tags.
med_text_tag = []
yt_title_tag = []

# Maps article index -> info dict for the Medium data set.
medium_ind_to_art_info = {}

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open('./data/medium/deduped-medium-comments-list.json', encoding='utf-8') as f:
    medium_data = json.load(f)

for i, article in enumerate(medium_data):
    tmp = {}
    tmp["title"] = article["title"]
    tmp["link"] = article["link"]
    # Round before int(): a label like "2.3K" scales to 2299.9999999999995 as
    # a float, and a bare int() would truncate that to 2299.
    tmp["claps"] = int(round(claps_to_nums(article["claps"])))
    tmp["reading_time"] = article["reading_time"]
    # Comment-derived fields are only recorded for articles that have comments.
    if len(article["comments"]) > 0:
        tmp["comments"] = article["comments"]
        comment_toks = set()
        sentiments = []
        for comment in article["comments"]:
            sentiments.append(sid.polarity_scores(comment.lower()))
            comment_toks.update(tokenize(comment))
        tmp["sentiments"] = sentiments
        tmp["comment_toks"] = comment_toks
    # Corpus entry: the article text followed by each tag, space separated.
    art_text_tag = article["text"]
    if "tags" in article:
        tags = set()
        for tag in article["tags"]:
            art_text_tag += " " + tag
            tags.add(tag)
        tmp["tags"] = tags
    med_text_tag.append(art_text_tag)
    medium_ind_to_art_info[i] = tmp

med_data_len = len(medium_ind_to_art_info)

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open('./data/reddit/youtube_comment_data.json', encoding='utf-8') as f:
    yt_comment_data = json.load(f)

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it was produced by a trusted source.
with open('./data/reddit/youtube_video_lengths.pickle', 'rb') as f:
    yt_id_to_length = pickle.load(f)

# Dictionaries for referencing the YouTube videos data set:
# row index -> video id, and video id -> info dict.
yt_index_to_id = {}
yt_id_to_vid_info = {}
with open('./data/reddit/youtube_video_data.json', encoding='utf-8') as f:
    yt_data = json.load(f)

# Index every video and add its title (plus tags, when present) to the corpus.
for i, youtube in enumerate(yt_data):
    snippet = youtube["snippet"]
    yt_id = youtube['id']
    yt_index_to_id[i] = yt_id

    # Likes default to 0 when the video carries no like count.
    vid_info = {"title": snippet["title"], "likes": 0}
    if 'statistics' in youtube and 'likeCount' in youtube['statistics']:
        vid_info["likes"] = int(youtube['statistics']['likeCount'])

    # Corpus entry: the title followed by each tag, space separated.
    title_parts = [snippet["title"]]
    if 'tags' in snippet:
        title_parts.extend(snippet["tags"])
        vid_info["tags"] = set(snippet["tags"])

    yt_id_to_vid_info[yt_id] = vid_info
    yt_title_tag.append(" ".join(title_parts))



# Attach each video's comments, their token set and their sentiment scores
# to the video's info dict.
for vid_comments in yt_comment_data:
    # Each "text_likes" entry holds the comment text at position 0 and its
    # like count at position 1; only the text is used here.
    comment_texts = [entry[0] for entry in vid_comments["text_likes"]]
    yt_id = vid_comments["id"]
    vid_info = yt_id_to_vid_info[yt_id]
    vid_info["comments"] = comment_texts
    vid_info["comment_toks"] = {
        tok for text in comment_texts for tok in tokenize(text)
    }
    vid_info["sentiments"] = [
        sid.polarity_scores(text.lower()) for text in comment_texts
    ]

yt_data_len = len(yt_index_to_id)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anjelikalynne/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [25]:
# Serialize the Medium index -> article info dictionary.
with open('./data/medium/medium-data.pickle', 'wb') as out_file:
    pickle.dump(medium_ind_to_art_info, out_file)

In [26]:
# Serialize the video id -> video info dictionary.
with open('./data/reddit/youtube-vid-info.pickle', 'wb') as out_file:
    pickle.dump(yt_id_to_vid_info, out_file)

In [27]:
# Serialize the row index -> video id dictionary.
with open('./data/reddit/youtube-index-id.pickle', 'wb') as out_file:
    pickle.dump(yt_index_to_id, out_file)

In [12]:
# Combined corpus used to train the vectorizer: the Medium article entries
# followed by the YouTube video entries.
data = list(med_text_tag)
data.extend(yt_title_tag)

In [13]:
# Maximum number of features (vocabulary size) the vectorizer may keep.
# The document-by-vocabulary matrices are created directly by
# tfidf_vec.transform(...) further down in this cell, so nothing is
# pre-allocated here.
n_feats = 5000

def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Create an unfitted ``TfidfVectorizer``.

    Every argument is forwarded unchanged to the ``TfidfVectorizer`` keyword
    argument of the same name.
    """
    settings = dict(
        stop_words=stop_words,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
        norm=norm,
    )
    return TfidfVectorizer(**settings)

# Fit a single vectorizer on the combined corpus, then project each data set
# with it so both matrices share the same vocabulary columns.
tfidf_vec = build_vectorizer(n_feats, "english")
tfidf_vec.fit(data)
medium_articles_by_vocab = tfidf_vec.transform(med_text_tag).toarray()
yt_vids_by_vocab = tfidf_vec.transform(yt_title_tag).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vec, "get_feature_names_out"):
    feature_names = tfidf_vec.get_feature_names_out()
else:
    feature_names = tfidf_vec.get_feature_names()
# Column index -> vocabulary term.
index_to_vocab = {i: v for i, v in enumerate(feature_names)}

In [33]:
# Inspect the vocabulary terms kept by the fitted vectorizer.
index_to_vocab.values()



In [14]:
# Serialize the Medium article-by-vocabulary TF-IDF matrix.
with open('./data/medium/medium-matrix.pickle', 'wb') as out_file:
    pickle.dump(medium_articles_by_vocab, out_file)

In [15]:
# Serialize the YouTube video-by-vocabulary TF-IDF matrix.
with open('./data/reddit/youtube-matrix.pickle', 'wb') as out_file:
    pickle.dump(yt_vids_by_vocab, out_file)

In [16]:
# Serialize the fitted TF-IDF vectorizer.
with open('./data/vectorizer.pickle', 'wb') as out_file:
    pickle.dump(tfidf_vec, out_file)

In [18]:
def SVD(k_val, random_state=0):
    """Create an unfitted ``TruncatedSVD`` with ``k_val`` components.

    Parameters
    ----------
    k_val : int
        Number of components, passed as ``n_components``.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``TruncatedSVD``. A fixed default keeps the fitted
        model and the projected documents identical between notebook runs;
        pass ``None`` for an unseeded model.
    """
    return TruncatedSVD(n_components=k_val, random_state=random_state)

# Number of SVD components used for each data set.
med_k_val, yt_k_val = 100, 200

# Each data set gets its own SVD model, fitted on its own TF-IDF matrix.
svd_med, svd_yt = SVD(med_k_val), SVD(yt_k_val)
svd_med_docs = svd_med.fit_transform(medium_articles_by_vocab)
svd_yt_docs = svd_yt.fit_transform(yt_vids_by_vocab)

In [20]:
# Serialize the SVD model fitted on the Medium matrix.
with open('./data/SVD-med-model.pickle', 'wb') as out_file:
    pickle.dump(svd_med, out_file)

In [21]:
# Serialize the SVD model fitted on the YouTube matrix.
with open('./data/SVD-yt-model.pickle', 'wb') as out_file:
    pickle.dump(svd_yt, out_file)

In [22]:
# Serialize the SVD-projected Medium articles.
with open('./data/SVD-med-docs.pickle', 'wb') as out_file:
    pickle.dump(svd_med_docs, out_file)

In [23]:
# Serialize the SVD-projected YouTube videos.
with open('./data/SVD-yt-docs.pickle', 'wb') as out_file:
    pickle.dump(svd_yt_docs, out_file)

In [43]:
# Show which fields are stored for a YouTube video, using the video at index 1.
sample_yt_id = yt_index_to_id[1]
print(yt_id_to_vid_info[sample_yt_id].keys())

dict_keys(['comment_toks', 'title', 'likes', 'comments', 'sentiments', 'tags'])


In [47]:
# Show which fields are stored for a Medium article, using the article at index 290.
sample_article_info = medium_ind_to_art_info[290]
print(sample_article_info.keys())

dict_keys(['sentiments', 'link', 'comment_toks', 'title', 'comments', 'tags', 'reading_time', 'claps'])


In [35]:
# Like count per video, positioned by the video's row index.
likes_arr = np.zeros(yt_data_len)
for index, yt_id in yt_index_to_id.items():
    likes_arr[index] = yt_id_to_vid_info[yt_id]["likes"]

In [36]:
# Display the like counts; NumPy abbreviates the printed array.
likes_arr

array([ 3011.,  6061.,  3288., ...,  1284.,  2501., 23292.])

In [38]:
# Clap count per Medium article, positioned by the article's index.
claps_arr = np.zeros(med_data_len)
for index, art_info in medium_ind_to_art_info.items():
    claps_arr[index] = art_info["claps"]

In [39]:
# Display the clap counts; NumPy abbreviates the printed array.
claps_arr

array([1100., 1300.,  997., ...,   54.,  905.,   57.])

In [41]:
# Serialize the per-video like counts.
with open('./data/likes-array.pickle', 'wb') as out_file:
    pickle.dump(likes_arr, out_file)

In [42]:
# Serialize the per-article clap counts.
with open('./data/claps-array.pickle', 'wb') as out_file:
    pickle.dump(claps_arr, out_file)

In [50]:
# Sentiment score per video: the sum of the 'compound' values of its comment
# sentiments. Videos without recorded sentiments keep a score of 0.
yt_sentiment_scores = np.zeros(yt_data_len)

for index, yt_id in yt_index_to_id.items():
    vid_info = yt_id_to_vid_info[yt_id]
    if 'sentiments' in vid_info:
        yt_sentiment_scores[index] = sum(
            comm_sent['compound'] for comm_sent in vid_info['sentiments']
        )

# Sentiment score per Medium article: the sum of the 'compound' values of its
# comment sentiments. Articles without recorded sentiments keep a score of 0.
medium_sentiment_scores = np.zeros(med_data_len)

for index, art_info in medium_ind_to_art_info.items():
    if 'sentiments' in art_info:
        medium_sentiment_scores[index] = sum(
            comm_sent['compound'] for comm_sent in art_info['sentiments']
        )

In [52]:
# Serialize the per-video sentiment scores.
with open('./data/yt-sentiment.pickle', 'wb') as out_file:
    pickle.dump(yt_sentiment_scores, out_file)

In [53]:
# Serialize the per-article sentiment scores.
with open('./data/med-sentiment.pickle', 'wb') as out_file:
    pickle.dump(medium_sentiment_scores, out_file)