In [4]:
import numpy as np
import pandas as pd
from collections import defaultdict
from gensim import corpora, models

In [5]:
def TokenizedTexts(filename, min_freq):
    """Load article texts from a CSV and return them as filtered token lists.

    The CSV is read with its first column as the index, and the
    ``article_text_stopped`` column is split on whitespace to obtain one
    token list per row.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.
    min_freq : int
        Frequency threshold. A token is kept only if it occurs strictly
        more than ``min_freq`` times across the whole corpus.

    Returns
    -------
    list of list of str
        One token list per CSV row, in row order. Rows whose text cell is
        missing yield an empty list, so the result stays aligned with the
        rows of the CSV.
    """
    df = pd.read_csv(filename, index_col=0)

    # Empty cells are loaded as NaN (a float), which cannot be iterated as a
    # token list. Tokenize real strings only and treat anything else as an
    # empty document instead of crashing in the loops below.
    texts_tokenized = [
        text.split() if isinstance(text, str) else []
        for text in df['article_text_stopped']
    ]

    # Corpus-wide occurrence count of every token.
    frequency = defaultdict(int)
    for text in texts_tokenized:
        for token in text:
            frequency[token] += 1

    # Strict comparison: tokens seen exactly `min_freq` times are dropped.
    texts = [
        [token for token in text if frequency[token] > min_freq]
        for text in texts_tokenized
    ]

    return texts

In [6]:
def Dictionary(tokenized_texts):
    """Build a gensim ``corpora.Dictionary`` from tokenized documents.

    Parameters
    ----------
    tokenized_texts : iterable of list of str
        The documents, each given as a list of tokens.

    Returns
    -------
    gensim.corpora.Dictionary
        The dictionary constructed from ``tokenized_texts``.
    """
    vocabulary = corpora.Dictionary(tokenized_texts)
    return vocabulary

In [7]:
def Corpus(dictionary, texts, tfidf=True):
    """Convert tokenized documents into a bag-of-words or TF-IDF corpus.

    Parameters
    ----------
    dictionary : object with a ``doc2bow`` method
        Used to turn each token list into its bag-of-words form.
    texts : iterable of list of str
        The tokenized documents.
    tfidf : bool, default True
        When truthy, a ``models.TfidfModel`` is fitted on the bag-of-words
        corpus and the corpus transformed by that model is returned.
        When falsy, the plain bag-of-words list is returned.

    Returns
    -------
    The list of ``dictionary.doc2bow(text)`` results, or the result of
    applying the fitted TF-IDF model to that list.
    """
    bow_corpus = []
    for document in texts:
        bow_corpus.append(dictionary.doc2bow(document))

    if tfidf:
        # Fit the weighting on the same corpus it is then applied to.
        weighting = models.TfidfModel(bow_corpus)
        return weighting[bow_corpus]

    return bow_corpus

In [8]:
# Path of the article CSV handed to TokenizedTexts.
ARTICLES_CSV = '~/Data/socc_articles_stopped.csv'
# Tokens are kept only if they occur strictly more than this many times
# across the whole corpus (TokenizedTexts compares with `>`).
MIN_TOKEN_FREQ = 2

# Build the pipeline: token lists -> gensim dictionary -> corpus.
# Corpus is called with its default tfidf=True, so `corpus` is the
# TF-IDF-transformed corpus rather than the raw bag-of-words list.
tokenized_texts = TokenizedTexts(ARTICLES_CSV, MIN_TOKEN_FREQ)
dictionary = Dictionary(tokenized_texts)
corpus = Corpus(dictionary, tokenized_texts)

In [18]:
def getTextsContaining(tokenized_texts, word):
    """Return the documents that contain ``word`` as an exact token.

    Parameters
    ----------
    tokenized_texts : iterable of list of str
        The documents, each given as a list of tokens.
    word : str
        Token to look for. Matching is list membership, so it is
        case-sensitive and never matches substrings of longer tokens.

    Returns
    -------
    list of str
        Each matching document re-joined with single spaces, in input order.
    """
    return [' '.join(tokens) for tokens in tokenized_texts if word in tokens]

In [10]:
def getCounts(tokenized_texts, word):
    """Count how many documents contain ``word`` as an exact token.

    This is a document count, not an occurrence count: a document adds
    one to the total no matter how many times ``word`` appears in it.

    Parameters
    ----------
    tokenized_texts : iterable of list of str
        The documents, each given as a list of tokens.
    word : str
        Token to look for (exact, case-sensitive list membership).

    Returns
    -------
    int
        Number of documents in which ``word`` occurs at least once.
    """
    return sum(1 for tokens in tokenized_texts if word in tokens)

In [19]:
# Display the space-joined text of every document that contains the exact
# (case-sensitive) token 'Catholicism'.
getTextsContaining(tokenized_texts, 'Catholicism')

["Daphne Gilbert associate professor law University Ottawa Canadians grappling difficult legal issues faced decades collective responsibility facilitate medically assisted death choose satisfy legal criteria Supreme Court decided Canadians Charter-protected dignified death choosing governments doctors hospitals citizens struggled accept move forward workable regime biggest impediments institutional resistance Hospitals claim conscientious objection prove Achilles heel government efforts breathe life die Catholic hospitals publicly funded position institutions religious rights Charter Rights Freedoms position recognized Supreme Court judges case Loyola High School Quebec judges concluded religious institution collective claim freedom religion Section Charter judges added key caveat conclusion organization meets requirements protection constituted primarily religious purposes operation accords religious purposes Publicly funded hospitals satisfy test claim freedom religion Publicly funde

In [11]:
# Number of documents (not total occurrences) containing the token 'Catholicism'.
getCounts(tokenized_texts, 'Catholicism')

27

In [12]:
# Number of documents containing the possessive token "church's"
# (the apostrophe is escaped because the literal is single-quoted).
getCounts(tokenized_texts, 'church\'s')

27

In [20]:
# Display the document(s) containing the hyphenated token 'rare-earths'.
getTextsContaining(tokenized_texts, 'rare-earths')

["confusingly named substances rare blame ultimately Greek philosopher theory elements good subject World Trade Organization lawsuit United States European Union Japan commenced action Chinese export restriction prices rare falling China held obligations comply international trading system rare-earths action proper make point market working counteract China's rare-earths export quota China obtained dominant position low prices monopoly rare-earth deposits due high prices Chinese rare-earths exports lowered demand encouraged non-Chinese firms produce sell commodities process Canadian companies Avalon Rare Great Western Minerals Group Material Technologies Quest Rare Minerals taking active part China's rare-earths quota direct contradiction WTO rule Lawrence Herman Cassels Brock LLP points back postwar General Agreement Tariffs Trade member-state imposes export restriction limit domestic supply China case discriminate fact China keeping internal rare-earth prices low supply corresponding

In [13]:
# Number of documents containing 'rare-earths'; repeated use within a single
# document still counts once.
getCounts(tokenized_texts, 'rare-earths')

1

In [14]:
# Number of documents containing the token 'basketball'.
getCounts(tokenized_texts, 'basketball')

52

In [22]:
# Display the document(s) containing the token 'Zoricic'.
getTextsContaining(tokenized_texts, 'Zoricic')

["Officials fans mourning death Canadian freestyle skier Nik Zoricic weekend freak accident end race Switzerland video accident evidence common-sense easily prevented tragedy ski cross competitors race narrow features high banked turns perilous jumps finish line winner tough dangerous sensational event Alpine Canada country's governing body downhill ski racing sells winter sport's answer roller derby video Internet Zoricic skiers travelling extreme speeds approach final jump race jump virtually top narrow finish line bounded side large pillars appears snow fencing banks snow Zoricic offline degrees hits final jump room error right-hand pillar crashes airborne fencing clear landing area Zoricic disqualified offline survived race Nik Zoricic Canadian freestyle skier killed Sarah Burke died January fall training run Utah cases term freak accident bandied officials Alpine Canada Canadian Ski Association justified admiration courage determination extreme athletes push human physical boundar

In [15]:
# Number of documents containing 'Zoricic'; repeated use within a single
# document still counts once.
getCounts(tokenized_texts, 'Zoricic')

1

In [21]:
# Total number of documents: TokenizedTexts returns one token list per CSV row.
len(tokenized_texts)

10339