In [1]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from IPython.display import clear_output
import inflect

In [3]:
# Read only the 'content' column, 1000 rows at a time, to bound parser memory.
# The chunks are collected first and concatenated ONCE: calling pd.concat
# inside the loop re-copies the whole accumulated frame for every chunk
# (quadratic in the number of rows).
chunks = list(pd.read_csv('./news_dataset.csv', usecols=['content'], chunksize=1000))
# Fall back to an empty frame when the reader yields no chunks, matching what
# the incremental version produced in that case.
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

In [4]:
# Keep only the first 10000 articles, then preview the result.
df = df.head(10000)
df.head()

Unnamed: 0,content
0,WASHINGTON — Congressional Republicans have...
1,"After the bullet shells get counted, the blood..."
2,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Death may be the great equalizer, but it isn’t..."
4,"SEOUL, South Korea — North Korea’s leader, ..."


In [5]:
import re

# Characters deleted outright (no space is inserted, so "disney's" -> "disneys").
# The original class contained the unescaped sequence `+-@`, which the regex
# engine reads as a RANGE from '+' to '@'; besides + , - . ; @ that range also
# covers the digits 0-9 and / : < = > ?.  The same set is spelled out
# explicitly here so the behaviour is unchanged but no longer accidental.
_STRIP_CHARS = re.compile(r'[,!.;+\-/0-9:<=>?@%^&*)(_\\\'\"“”’—]')
# Any remaining run of non-word characters (whitespace included) becomes one space.
_NON_WORD_RUN = re.compile(r'\W+')
# A single letter standing alone next to a space.
_SINGLE_LETTER = re.compile('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)')


def clean_text(text):
    """Normalise one lower-cased article string.

    Applies, in order: deletion of the punctuation/digit set, collapsing of
    non-word runs to a single space, and removal of isolated single letters.
    """
    text = _STRIP_CHARS.sub('', text)
    text = _NON_WORD_RUN.sub(' ', text)
    return _SINGLE_LETTER.sub('', text)


# Missing articles become empty strings up front; previously str(NaN) leaked
# the literal token "nan" into those documents.
df["content"] = df["content"].fillna('').astype(str).str.lower().map(clean_text)

In [None]:
## TAKES REALLY LONG

# p = inflect.engine()

# def to_singular(word):
#     sing = p.singular_noun(word)
#     if not sing:
#         return word
#     return sing

# df['content'] = df['content'].map(lambda x: ' '.join([to_singular(word) for word in x.split()]))



In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words("english")
# Corpus-specific filler words on top of the NLTK English list.
stop += ['mr', 'said', 'ms', 'would', 'mrs', 'mss']
# Membership is tested once per token, so use a set (constant-time lookup)
# instead of scanning the list each time.  `stop` itself stays a list.
stop_set = set(stop)
# Rebuild each article from the words that are not stop words.
df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stijn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Inspect the cleaned text column (pandas abbreviates the display).
df["content"]


0       washington congressional republicans new fear ...
1       bullet shells get counted blood dries votive c...
2       walt disneys bambi opened critics praised spar...
3       death may great equalizer isnt necessarily eve...
4       seoul south korea north koreas leader kim sund...
                              ...                        
9995    tuesday msnbcs record greta discussing todays ...
9996    president donald trump informed multiple indiv...
9997    former cia operative presidential candidate ev...
9998    ynetnews reports prime minister benjamin netan...
9999    tribal leader threatening protests department ...
Name: content, Length: 10000, dtype: object

In [9]:
# Number of topics to fit, and the list of topic ids 0 .. nr_topics-1
# that the sampler draws from.
nr_topics = 50
topics = [*range(nr_topics)]
topics

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49]

In [10]:
def defaultvalue():
    """Return a fresh list of nr_topics zero counts (default factory for word_topics)."""
    return [0] * nr_topics


doc_topics = []                          # doc_topics[d][k]: tokens of document d assigned to topic k
word_topics = defaultdict(defaultvalue)  # word_topics[w][k]: occurrences of word w assigned to topic k
assigned = []                            # assigned[d][i]: topic of the i-th token of document d

# Tokenise the column explicitly.  The previous version wrote the token list
# back into the row handed out by iterrows(); pandas documents that such a row
# may be a copy, in which case the write is lost and later cells would iterate
# over characters instead of words.  Values that are already lists (cell
# re-run) are left untouched.
df["content"] = df["content"].map(
    lambda content: content if isinstance(content, list) else content.split()
)

for index, words in df["content"].items():
    if index % 1000 == 0:
        # Lightweight progress indicator: overwrite the previous counter.
        clear_output(wait=True)
        print(index)

    topic_counts = [0] * nr_topics
    assigned_doc = []
    for word in words:
        # Start every token on a uniformly random topic.
        word_topic = random.choice(topics)
        word_topics[word][word_topic] += 1
        assigned_doc.append(word_topic)
        topic_counts[word_topic] += 1
    assigned.append(assigned_doc)
    doc_topics.append(topic_counts)
        



9000


In [11]:
# Smoothing constants used by the sampler: alpha is added to the
# document-topic counts, gamma to the word-topic counts.
alpha = 10
gamma = 4


def sum_doc(doc_index):
    """Return the token count of document `doc_index` plus nr_topics * alpha."""
    return nr_topics * alpha + len(assigned[doc_index])


def sum_topic(topic):
    """Return the sum over all documents of (tokens assigned to `topic` + gamma).

    NOTE(review): gamma is added once per document here; standard LDA
    smoothing would add it once per vocabulary word - confirm which is meant.
    """
    return sum(counts[topic] + gamma for counts in doc_topics)


def alltopics():
    """Return sum_topic(t) for every topic id in `topics`, in order."""
    return [sum_topic(topic_id) for topic_id in topics]


def alldocslength():
    """Return, per document, its token count plus nr_topics * alpha."""
    return [len(doc_assignment) + nr_topics * alpha for doc_assignment in assigned]


# Normalisers cached once; gibbs_iter updates sum_topics incrementally.
sum_topics = alltopics()
doc_len = alldocslength()

In [None]:
def gibbs_iter():
    """Run one Gibbs sweep, resampling the topic of every token in the corpus.

    For each token the current assignment is removed from the count tables
    (word_topics, doc_topics, sum_topics), a new topic is drawn with weight
    ((n_dk + alpha) / doc_len[d]) * ((v_kw + gamma) / sum_topics[k]), and the
    tables and `assigned` are updated with the new topic.  All state lives in
    module-level variables and is mutated in place.

    Returns:
        0, always.
    """
    for index, content in df["content"].items():
        # The initialisation cell tokenised by writing into an iterrows() row,
        # which is not guaranteed to reach the DataFrame.  Split here if the
        # column still holds raw strings so we never iterate over characters.
        words = content if isinstance(content, list) else content.split()

        # Per-document lookups hoisted out of the token loop.
        doc_counts = doc_topics[index]
        doc_assignments = assigned[index]
        s1 = doc_len[index]

        for i, word in enumerate(words):
            word_counts = word_topics[word]

            # Take the token out of the counts before scoring.
            t = doc_assignments[i]
            word_counts[t] -= 1
            doc_counts[t] -= 1
            sum_topics[t] -= 1

            topic_scores = [
                ((doc_counts[top] + alpha) / s1)
                * ((word_counts[top] + gamma) / sum_topics[top])
                for top in range(nr_topics)
            ]

            # random.choices normalises the weights itself.
            z_new = random.choices(topics, weights=topic_scores, k=1)[0]
            word_counts[z_new] += 1
            doc_counts[z_new] += 1
            sum_topics[z_new] += 1
            doc_assignments[i] = z_new

    return 0

In [None]:
# Run 10 full Gibbs sweeps over the corpus, printing the sweep number
# before each one as a progress indicator.
for i in range(10):
    print(i)
    gibbs_iter()


In [40]:
def get_top_topics(amount):
    """Return, for each topic, its `amount` highest-count [word, count] pairs.

    The result is a list with one entry per topic; each entry is a list of
    [word, count] pairs sorted by count, largest first.
    """
    per_topic = [[] for _ in range(nr_topics)]

    # Regroup the word -> per-topic-counts table by topic.
    for word, counts in word_topics.items():
        for top, value in enumerate(counts):
            per_topic[top].append([word, value])

    return [
        sorted(pairs, key=lambda pair: pair[1], reverse=True)[:amount]
        for pairs in per_topic
    ]


get_top_topics(10)


Error compiling Cython file:
------------------------------------------------------------
...
cdef int gibbs_iter():
    for index, row in df.iterrows():
        # print(index)

        for i, word in enumerate(row["content"]):
            cdef int t = assigned[index][i]
                ^
------------------------------------------------------------

C:\Users\stijn\.ipython\cython\_cython_magic_938f6d132e5aa7aa4963d0916a41333a.pyx:9:17: cdef statement not allowed here


In [14]:
def get_top_topics(amount):
    """Build per-topic word counts and the `amount` top words of each topic.

    Returns:
        A pair (result, topics_word): `topics_word` is a list with one
        {word: count} dict per topic covering every word in word_topics;
        `result` holds, per topic, a dict of the `amount` highest-count
        words ordered from largest count to smallest.
    """
    topics_word = [dict() for _ in range(nr_topics)]

    # Regroup the word -> per-topic-counts table by topic.
    for word, counts in word_topics.items():
        for top, value in enumerate(counts):
            topics_word[top][word] = value

    result = [
        dict(sorted(scores.items(), key=lambda item: item[1], reverse=True)[:amount])
        for scores in topics_word
    ]
    return result, topics_word


topic_best, best_topic_all = get_top_topics(10)
topic_best

[{'trump': 28318,
  'trumps': 8318,
  'president': 7997,
  'clinton': 7122,
  'campaign': 6752,
  'house': 5763,
  'republican': 5578,
  'white': 4030,
  'obama': 4028,
  'election': 3915},
 {'people': 1100,
  'new': 847,
  'times': 532,
  'could': 377,
  'year': 366,
  'two': 322,
  'one': 295,
  'public': 294,
  'according': 264,
  'last': 254},
 {'new': 649,
  'one': 611,
  'back': 509,
  'also': 429,
  'last': 395,
  'two': 365,
  'year': 328,
  'work': 326,
  'made': 303,
  'still': 292},
 {'think': 4474,
  'dont': 4363,
  'going': 3821,
  'know': 3422,
  'im': 3406,
  'like': 3351,
  'thats': 2688,
  'people': 2596,
  'get': 2371,
  'really': 1902},
 {'two': 985,
  'one': 911,
  'first': 898,
  'like': 669,
  'many': 621,
  'years': 580,
  'time': 518,
  'could': 309,
  'way': 272,
  'new': 258},
 {'first': 594,
  'may': 483,
  'even': 403,
  'people': 392,
  'years': 339,
  'many': 317,
  'last': 309,
  'new': 287,
  'one': 258,
  'also': 249},
 {'also': 929,
  'new': 638,
  'ev

In [20]:
def word_in_topic_score(topic, word):
    """Return the count of `word` in `topic` from best_topic_all, or 0 if absent."""
    return best_topic_all[topic].get(word, 0)


# Total count of each word over all topics.  It does not depend on the topic
# being ranked, so it is computed once per distinct word and cached instead of
# being re-summed for every (topic, document, token) combination.
word_total_cache = {}

topic_top_lists = []
for topic in range(nr_topics):
    print(topic)  # progress indicator
    top_list = []
    for index, content in df["content"].items():
        # Tolerate a column that still holds raw strings rather than token lists.
        words = content if isinstance(content, list) else content.split()
        nr_words = len(words)
        if nr_words == 0:
            continue

        top_doc_score = 0
        for word in words:
            word_topic_score = word_in_topic_score(topic, word)

            word_all_topics_score = word_total_cache.get(word)
            if word_all_topics_score is None:
                word_all_topics_score = sum(
                    word_in_topic_score(topic2, word) for topic2 in range(nr_topics)
                )
                word_total_cache[word] = word_all_topics_score

            if word_all_topics_score == 0:
                # Word unknown to every topic: contributes nothing
                # (previously this raised ZeroDivisionError).
                continue

            # Share of the word's occurrences that fall in this topic,
            # weighted by the word's count in the topic.
            top_doc_score += (word_topic_score / word_all_topics_score) * word_topic_score

        # Average over the document length.
        top_doc_score /= nr_words
        top_list.append([index, top_doc_score])

    # Keep the 100 best-scoring documents for this topic.
    topic_top_lists.append(sorted(top_list, key=lambda x: x[1], reverse=True)[:100])

topic_top_lists
                    
            

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


[[[203, 82.76677823640573],
  [3830, 81.97040712941994],
  [2605, 81.8062308201681],
  [2082, 81.80542276767946],
  [235, 77.32130123422755],
  [4888, 77.01223696964374],
  [3448, 76.13204878423745],
  [6256, 74.5788958737216],
  [6420, 74.41466614682328],
  [6823, 74.41087101702142],
  [4338, 73.6292555447157],
  [5400, 72.95644801006173],
  [3223, 72.75691549426598],
  [303, 72.68715621885245],
  [6203, 72.62934643645019],
  [2746, 71.76908784026176],
  [7429, 69.96348705236147],
  [5264, 69.80133991867908],
  [863, 69.61382215758753],
  [3074, 69.51437543702322],
  [5044, 69.44302904155212],
  [7155, 69.27679970354959],
  [3038, 69.26643729357102],
  [5300, 69.2361328081101],
  [3705, 69.08161239184025],
  [1702, 68.8588930060346],
  [6115, 68.10766195475462],
  [5059, 67.99219621455823],
  [4087, 67.95409838810812],
  [6794, 67.77411975614721],
  [4934, 67.54215485078804],
  [7283, 67.47298844673406],
  [116, 67.299286816239],
  [6946, 67.16435846177927],
  [7009, 67.14452212220937

In [21]:
topic_top_lists

[[[203, 82.76677823640573],
  [3830, 81.97040712941994],
  [2605, 81.8062308201681],
  [2082, 81.80542276767946],
  [235, 77.32130123422755],
  [4888, 77.01223696964374],
  [3448, 76.13204878423745],
  [6256, 74.5788958737216],
  [6420, 74.41466614682328],
  [6823, 74.41087101702142],
  [4338, 73.6292555447157],
  [5400, 72.95644801006173],
  [3223, 72.75691549426598],
  [303, 72.68715621885245],
  [6203, 72.62934643645019],
  [2746, 71.76908784026176],
  [7429, 69.96348705236147],
  [5264, 69.80133991867908],
  [863, 69.61382215758753],
  [3074, 69.51437543702322],
  [5044, 69.44302904155212],
  [7155, 69.27679970354959],
  [3038, 69.26643729357102],
  [5300, 69.2361328081101],
  [3705, 69.08161239184025],
  [1702, 68.8588930060346],
  [6115, 68.10766195475462],
  [5059, 67.99219621455823],
  [4087, 67.95409838810812],
  [6794, 67.77411975614721],
  [4934, 67.54215485078804],
  [7283, 67.47298844673406],
  [116, 67.299286816239],
  [6946, 67.16435846177927],
  [7009, 67.14452212220937

In [22]:
import csv
from itertools import zip_longest

m = topic_top_lists
# Transpose from "one list of [doc_index, score] per topic" to "one row per
# rank, one column per topic", keeping only the document index.
# zip_longest copes with an empty or ragged input (topics with fewer ranked
# documents get blank cells), where indexing by m[0]'s length raised IndexError.
trans = [
    ["" if entry is None else entry[0] for entry in rank_row]
    for rank_row in zip_longest(*m)
]


with open("topic_document_rank.csv", 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(range(nr_topics))  # header: topic ids
    wr.writerows(trans)