In [1]:
import numpy as np
import pandas as pd
import spacy
import re
from pathlib import Path
import hu_core_ud_lg
from collections import Counter
# Load the Hungarian spaCy model once; it is passed as `model` to tokenize_1gram further down
nlp = hu_core_ud_lg.load()

In [2]:
# Read the raw comments workbook. The path is built from the current working
# directory, so the notebook must be run from the folder that contains Data/.
df = pd.read_excel(Path.cwd() / 'Data' / 'WJ-comments' / 'comments.xlsx')

In [3]:
def extract_comments(df):
    '''
    Concatenates the non-null cells of every column into one string per column

    Parameters:
    ----------
    df : pandas DataFrame, each column is handled independently

    Returns:
    ----------
    dict mapping column label -> that column's non-null values joined by single spaces
    (an all-null column maps to the empty string)
    '''
    res = {}
    for topic in df:
        present = df[topic].dropna()
        # Cast every cell to str: a spreadsheet column can hold numbers or dates
        # next to free text, and str.join raises TypeError on non-string items.
        res[topic] = ' '.join(str(cell) for cell in present)
    return res


def clean(text):
    '''
    Strips every character that is neither a word character nor whitespace,
    then collapses each run of whitespace into one space (no leading/trailing space)
    '''
    only_words = re.sub(r'[^\w\s]', '', text)
    # str.split() without arguments already trims and splits on whitespace runs
    return ' '.join(only_words.split())


def tokenize_1gram(text, model, ents = None):
    '''
    Returns lowercase lemma for
        -non-stop words,
        -non-numbers,
        -non-punct
        and drops
        -lemma of length 1
        -entities defined by ents

    Parameters:
    ----------
    text : str, handed to model unchanged
    model : callable returning an iterable of tokens that expose is_stop, is_punct,
        like_num, lemma_ and ent_type_ (e.g. a loaded spaCy pipeline)
    ents : collection of entity type labels whose tokens are dropped;
        None (default) drops no entities

    Returns:
    ----------
    list of lowercase lemmas in document order
    '''
    # None replaces the mutable [] default so no list object is shared between calls
    excluded_ents = () if ents is None else ents

    return [
        word.lemma_.lower()
        for word in model(text)
        if not (word.is_stop or word.is_punct or word.like_num)
        and len(word.lemma_) > 1
        and word.ent_type_ not in excluded_ents
    ]


def remove_single(df, tokens_col):
    '''
    Removes words that only appear once in corpus

    Parameters:
    ----------
    df : pandas DataFrame with one tokenized document per row
    tokens_col : label of the column whose cells are lists of tokens

    Returns:
    ----------
    Cleaned docs as list of lists (one inner list per row, original token order kept)
    '''
    docs = df[tokens_col]
    # Single counting pass over the whole corpus; concatenating the lists with
    # sum(..., []) and calling list.count per distinct word is quadratic.
    corpus_counts = Counter(token for doc in docs for token in doc)
    return [[token for token in doc if corpus_counts[token] > 1] for doc in docs]


def get_word_counts(df, tokens_col):
    '''
    Counts token occurrences separately for every row

    Parameters:
    ----------
    df : pandas DataFrame
    tokens_col : label of the column holding each row's tokens

    Returns:
    ----------
    dict mapping row index label -> Counter of that row's tokens
    '''
    return {label: Counter(record[tokens_col]) for label, record in df.iterrows()}

In [4]:
# One row per topic: the topic's concatenated comments in 'text', their lemmas in 'tokenized'
topic_comment = pd.DataFrame.from_dict(
    extract_comments(df), orient = 'index', columns = ['text']
)
topic_comment['tokenized'] = topic_comment['text'].apply(
    # tokens whose entity type is 'PER' or 'LOC' are dropped by tokenize_1gram
    lambda raw: tokenize_1gram(clean(raw), model = nlp, ents = ['PER', 'LOC'])
)

In [5]:
# Per-topic token counts, keyed by the index labels of topic_comment
wc = get_word_counts(topic_comment, 'tokenized')

# Stack one (word -> count) frame per topic; reset_index() turns the resulting
# (topic, word) index levels into ordinary columns of the long-format table.
topic_wc = pd.concat(
    {
        topic: pd.DataFrame.from_dict(counts, orient = 'index')
        for topic, counts in wc.items()
    },
    axis = 0,
).reset_index()
topic_wc.columns = ['topic', 'word', 'count']

In [6]:
# Save the (topic, word, count) table beside the input data; index = False leaves out the row index
topic_wc.to_excel(Path.cwd() / 'Data' / 'WJ-comments' / 'topic_word_counts.xlsx', index = False)