In [1]:
import spacy
import pandas as pd
import numpy as np
from functools import partial
from collections import defaultdict
from typing import List, Set
from pathlib import Path
from pytrends.request import TrendReq
from sklearn.feature_extraction.text import CountVectorizer

### Get Google trending keywords

In [2]:
# read more about pytrends API here: https://pypi.org/project/pytrends/
pytrend = TrendReq()
trending_topics = (
    pytrend
    .trending_searches(pn='united_states')
    .rename(columns={0: 'topic'})
    # only use trending topics of at most 3 words
    .query('topic.str.split().str.len()<=3')
    .topic
    .str.lower()
    # Collapse runs of whitespace (and trim the ends) so that a topic such as
    # 'athletic  real madrid' can equal an n-gram, whose words are joined by
    # exactly one space.
    .str.split()
    .str.join(' ')
    .reset_index(drop=True)
)
# Downstream cells intersect this with per-title n-gram sets, so keep it a set.
google_topics = set(trending_topics)

In [3]:
# Inspect the set of lower-cased trending topics built in the previous cell.
google_topics

{'amazon stock',
 'athletic  real madrid',
 'cameroon vs egypt',
 'celtics',
 'doug pederson',
 'geoffrey paschel',
 'johnny weir',
 'keanu reeves',
 'kim cattrall',
 'lakers vs clippers',
 'moonfall',
 'murderville',
 'pro bowl 2022',
 'raised by wolves',
 'real madrid',
 'snap stock',
 'trae young',
 'winter olympics'}

### Get Reddit post title n-grams (uni- to tri-grams)

In [4]:
# List the CSV files available in the local ./data directory.
!ls ./data

cats_comments.csv      russia_comments.csv    worldnews_comments.csv
cats_posts.csv         russia_posts.csv       worldnews_posts.csv


In [5]:
path = Path('./data/worldnews_posts.csv')
reddit_posts = (
    pd.read_csv(path)
    # the first title appears to be a pinned post, so skip that row
    .iloc[1:]
    # renumber the remaining rows from 0
    .reset_index(drop=True)
)

In [6]:
# Preview the loaded posts (the rendered table below is truncated in the middle).
reddit_posts

Unnamed: 0,post_id,title,score,subreddit,url,num_comments,body,created
0,sjpwpi,"Trudeau rules out negotiating with protesters,...",4454,worldnews,https://www.cbc.ca/news/politics/trudeau-prote...,1311,,1.643912e+09
1,sju2s5,Canadian MPs vote to call GoFundMe to testify ...,588,worldnews,https://www.cbc.ca/news/politics/convoy-protes...,191,,1.643922e+09
2,sjos5s,Putin heads to China to bolster ties amid Ukra...,1028,worldnews,https://apnews.com/article/russia-ukraine-spor...,209,,1.643909e+09
3,sjrmgq,"Police watchdog finds misogynistic, racist, di...",621,worldnews,https://thehill.com/policy/international/59263...,65,,1.643916e+09
4,sjhegj,Civilians reported dead after US conducts coun...,1564,worldnews,https://www.cnn.com/2022/02/03/world/syria-us-...,1227,,1.643889e+09
...,...,...,...,...,...,...,...,...
94,sjwgd3,South Shore school apologizes after using home...,7,worldnews,https://www.cbc.ca/news/canada/montreal/rivers...,5,,1.643928e+09
95,sjq22e,U.S. Helicopter Downed in Military Operation t...,16,worldnews,https://beckernews.com/breaking-u-s-helicopter...,17,,1.643912e+09
96,sjpbzc,Lithuanian president calls for more German tro...,18,worldnews,https://www.lrt.lt/en/news-in-english/19/16058...,4,,1.643910e+09
97,sjftpz,"Russia has sent some 30,000 combat troops, mod...",62,worldnews,https://www.reuters.com/world/europe/russia-ha...,7,,1.643884e+09


In [7]:
# Load the 'en_core_web_sm' spaCy pipeline; it is passed to text_processing as the processor.
nlp=spacy.load('en_core_web_sm')
# NOTE(review): this module-level vectorizer is not referenced by any code visible in
# this notebook — get_ngram constructs its own local CountVectorizer. Confirm before removing.
trigram_vectorizer=CountVectorizer(ngram_range=(1,3))

In [8]:
def text_processing(text: str, processor: 'spacy.lang.en.English') -> str:
    """Reduce ``text`` to a space-separated string of lower-cased lemmas.

    Args:
        text: The raw text to normalize (here, a post title).
        processor: A callable that turns ``text`` into an iterable of tokens
            exposing ``lemma_``, ``pos_``, ``is_punct``, ``is_stop`` and
            ``is_space`` (for example a loaded spaCy pipeline).

    Returns:
        The lemmas of the kept tokens, lower-cased and joined by single
        spaces. Punctuation, stop words, verbs and whitespace-only tokens are
        dropped. An empty string is returned when nothing is kept.
    """
    # The annotation is a forward reference so that defining this function does
    # not depend on the ``spacy.lang.en`` attribute chain being resolvable.
    kept_lemmas = [
        token.lemma_.lower()
        for token in processor(text)
        # Whitespace tokens are neither punctuation nor stop words, so they must
        # be excluded explicitly or they end up as stray blanks in the output.
        if not (
            token.is_space
            or token.is_punct
            or token.is_stop
            or token.pos_ == 'VERB'
        )
    ]
    return ' '.join(kept_lemmas)

In [9]:
def get_ngram(text:str,n_gram_min:int=1, n_gram_max:int=3) -> Set[str]:
    """Return the distinct word n-grams contained in ``text``.

    Args:
        text: The (already normalized) text to split into n-grams.
        n_gram_min: Smallest n-gram size to produce.
        n_gram_max: Largest n-gram size to produce.

    Returns:
        The set of n-gram strings of sizes ``n_gram_min`` through
        ``n_gram_max``, tokenized with ``CountVectorizer``'s default settings.
        The set is empty when ``text`` yields no tokens.
    """
    # Use the vectorizer's analyzer directly instead of fitting on a one-document
    # corpus: fitting raises ValueError ("empty vocabulary") when the text has no
    # usable tokens, and it also builds a count matrix that was never used.
    analyzer = CountVectorizer(ngram_range=(n_gram_min, n_gram_max)).build_analyzer()
    return set(analyzer(text))

In [10]:
# Normalize every title with spaCy, then expand it into its set of n-grams.
reddit_posts['ngram'] = reddit_posts['title'].apply(
    lambda title: get_ngram(text_processing(title, nlp))
)

### Find which trending topics are mentioned in Reddit post titles

In [11]:
# Peek at the n-gram set of the first post (row label 0 after the index reset).
reddit_posts.loc[0, 'ngram']

{'card',
 'deployment',
 'deployment card',
 'military',
 'military deployment',
 'military deployment card',
 'protester',
 'protester military',
 'protester military deployment',
 'trudeau',
 'trudeau protester',
 'trudeau protester military'}

In [12]:
# Callable returning the trending topics that also occur in a given n-gram set;
# the bound method captures the current google_topics set object.
topic_intersect = google_topics.intersection

In [13]:
# Tally, for each trending topic, how many post titles contain it as an n-gram.
counter = defaultdict(int)
for matched_topics in reddit_posts.ngram.apply(topic_intersect):
    if not matched_topics:
        # This title mentions no trending topic.
        continue
    for topic in matched_topics:
        counter[topic] += 1

# Share of posts mentioning each matched topic, rounded to three decimals.
{
    topic: np.round(count / len(reddit_posts), 3)
    for topic, count in counter.items()
}

{'winter olympics': 0.02}