In [3]:
# Import main libraries
import re
import pandas as pd
import numpy as np

In [4]:
# Load the comment dump, convert the epoch-second timestamps to datetimes,
# and discard every row that has a missing value in any column.
all_comments = (
    pd.read_csv('2021-2022/new/comments.csv', index_col=0)
    .assign(created_utc=lambda frame: pd.to_datetime(frame['created_utc'], unit='s'))
    .dropna()
)

In [5]:
# Sanity check: how many comment bodies are still missing?
all_comments['body'].isnull().sum()

0

In [6]:
# Before we do anything we should also convert Markdown to plain text
from markdown import Markdown
from io import StringIO

def unmark_element(element, stream=None):
    """Flatten an element tree into plain text.

    Writes the element's own text, then recursively every child, then the
    element's tail into ``stream`` and returns everything the stream holds.

    Parameters
    ----------
    element
        Node exposing ``.text``, ``.tail`` and iteration over its children
        (e.g. an ``xml.etree.ElementTree.Element``).
    stream : io.StringIO, optional
        Buffer to append to; a fresh one is created when omitted.

    Returns
    -------
    str
        The full contents of the buffer after this element was written.
    """
    out = StringIO() if stream is None else stream
    out.write(element.text or '')
    for child in element:
        unmark_element(child, out)
    out.write(element.tail or '')
    return out.getvalue()


# patching Markdown
# Register unmark_element as the serializer behind a new "plain" output format.
# This has to happen before the instance below is created, because the
# instance is constructed with output_format="plain".
Markdown.output_formats["plain"] = unmark_element
# Shared converter instance used by unmark().
__md = Markdown(output_format="plain")
# NOTE(review): presumably disabled because the plain-text output has no
# wrapping top-level tags to strip — confirm against the Python-Markdown docs.
__md.stripTopLevelTags = False


def unmark(text):
    """Convert a Markdown string to plain text.

    Uses the module-level ``__md`` converter, which is configured with the
    "plain" output format.

    Parameters
    ----------
    text : str
        Markdown source.

    Returns
    -------
    str
        Whatever ``__md.convert`` produces for ``text``.
    """
    # One Markdown instance is shared by every call, and it keeps
    # per-document state (e.g. collected link reference definitions).
    # Reset it first so nothing leaks from one comment into the next.
    __md.reset()
    return __md.convert(text)


# Remove all emojis (I might need to keep that data in the future for something, but not this project)
# Compiled once here rather than inside the function, which is called once per comment.
# NOTE(review): the ranges \U000024C2-\U0001F251 and \U00010000-\U0010ffff are far
# wider than "emoji" (the first one also spans the CJK blocks, for example).
# Cyrillic and Latin text lie below U+24C2 and are not affected.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of characters matched by the emoji pattern removed.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The text with all matched characters deleted (nothing is inserted in
        their place, so text on both sides of a removed run is joined).
    """
    return _EMOJI_PATTERN.sub('', string)

In [7]:
# Remove everything that we don't need
# 1. URLs: anything starting with "http" or "www" up to the next whitespace
all_comments['body'] = all_comments['body'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
# 2. Markdown -> plain text (applied to the column directly instead of row by row)
all_comments['body'] = all_comments['body'].apply(unmark)
# 3. Literal "&#x200B;>" leftovers; regex=False keeps this a plain-text replace
#    regardless of the pandas version's default
all_comments['body'] = all_comments['body'].str.replace('&#x200B;>', ' ', regex=False)
# 4. Emojis
all_comments['body'] = all_comments['body'].apply(remove_emoji)
# 5. Literal backslash-n sequences.
# NOTE(review): this pattern matches the two characters "\" + "n", not a real
# newline character — confirm that is what the data contains.
all_comments['body'] = all_comments['body'].str.replace(r"\\n", "", regex=True)
# Also remove any comment that has been deleted.
# Match the placeholder bodies exactly: a substring test on "deleted" would
# also throw away real comments that merely mention the word.
DELETED_PLACEHOLDERS = ['[deleted]', '[removed]']
all_comments = all_comments[~all_comments['body'].str.strip().isin(DELETED_PLACEHOLDERS)]


In [9]:
# Import Bulgarian stop words
from spacy.lang.bg.stop_words import STOP_WORDS as BG_STOPWORDS
import unicodedata
from nltk.corpus import stopwords
# Modules for word2vec
from nltk.tokenize import sent_tokenize, word_tokenize 
import nltk
nltk.download('punkt')
import string

## First quick and dirty attempt at getting n-grams
# Hand-written, whitespace-separated list of Bulgarian stop words
stopwords_bg = 'на от за да се по са ще че не това си като до през които най при но има след който към бъде той още може му което много със която или само тази те обаче във вече около както над така между ако лв им тези преди млн бе също пред ни когато защото кв би пък тъй ги ли пак според този все някои'
stopwords_custom = [*stopwords_bg.split(), 'не']

# Words ignored in the analysis: the imported BG_STOPWORDS followed by the custom list
ADDITIONAL_STOPWORDS = [*BG_STOPWORDS, *stopwords_custom]

def prepare_ngram(text, ngrams):
    """Count the n-grams of ``text`` after removing punctuation, digits and stop words.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    ngrams : int
        Size of the n-grams to build (1 for unigrams, 2 for bigrams, ...).

    Returns
    -------
    pandas.Series
        ``value_counts()`` of the n-grams produced by ``nltk.ngrams`` over the
        filtered tokens.
    """
    # Typographic quotes are not part of string.punctuation (ASCII only), so
    # they are deleted explicitly.
    cleaned_string = re.sub(r'[“”]', '', text)
    # The ellipsis character is not ASCII punctuation either. The earlier
    # pattern meant to catch it had its brackets escaped, so it was never a
    # character class. Turn it into a separator so "word…word" is split.
    cleaned_string = cleaned_string.replace('…', ' ')

    # Remove any digits
    cleaned_string = ''.join(ch for ch in cleaned_string if not ch.isdigit())

    # Replace ASCII punctuation with spaces so it acts as a token boundary
    cleaned_string = re.sub('[%s]' % re.escape(string.punctuation), ' ', cleaned_string)

    # Lowercasing is needed for the stop-word filter below: the hand-written
    # stop-word list is all lower case. Note that this differs from Google
    # Ngram, which is case-sensitive.
    cleaned_string = cleaned_string.lower()
    tokens = word_tokenize(cleaned_string)

    # Filter the stop words out; a set makes each membership test O(1)
    stop_words = set(ADDITIONAL_STOPWORDS)
    filtered_sentence = [w for w in tokens if w not in stop_words]

    # Count phrases
    return pd.Series(nltk.ngrams(filtered_sentence, ngrams)).value_counts()


[nltk_data] Downloading package punkt to /Users/ivo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Create a full dataset for unigrams: one row per (timestamp, unigram) with its count.
# All comment bodies sharing a created_utc value are joined with ', ' before counting.
full_df_unigram = (
    all_comments
    .groupby('created_utc')
    .apply(lambda group: prepare_ngram(group['body'].str.cat(sep=', '), 1))
    .reset_index()
    .rename(columns={'created_utc': 'date', 'level_1': 'unigram', 0: 'count'})
    # presumably unpacks the 1-element n-gram tuples into plain strings
    .explode('unigram')
)

# Remove all strings that are single characters and find out why they are there in the first place?!
def remove_single_char(string):
    """Tell whether ``string`` should be dropped.

    Returns True when the string is empty or consists of a single character
    repeated any number of times ("a", "aaa"), and False otherwise.
    """
    # At most one distinct character; unlike indexing string[0], this also
    # covers the empty string instead of raising IndexError.
    return len(set(string)) <= 1

# Keep only the unigrams that remove_single_char does not flag.
# Mapping over the column avoids a row-wise DataFrame.apply and a temporary
# helper column, and also works when the frame is empty. The .copy() makes
# the result an independent frame so that later column assignments are safe.
full_df_unigram = full_df_unigram[
    ~full_df_unigram['unigram'].map(remove_single_char).astype(bool)
].copy()

In [12]:
# Now do the same with full dataset for bigrams
full_df_bigram = (
    all_comments
    .groupby('created_utc')
    .apply(lambda group: prepare_ngram(group['body'].str.cat(sep=', '), 2))
    .reset_index()
    .rename(columns={'created_utc': 'date', 'level_1': 'bigram', 0: 'count'})
)
# Join the parts of every n-gram into one space-separated string
full_df_bigram['bigram'] = full_df_bigram['bigram'].map(
    lambda parts: ' '.join(str(part) for part in parts)
)

In [13]:
# Rename the bigram column so the two dataframes can be concatenated more easily
full_df_bigram = full_df_bigram.rename(columns={'bigram' : 'unigram'})

# Strip every non-word character from the unigrams, then drop the ones that
# end up empty. The pattern is a raw string: '\W' inside a plain literal is
# an invalid escape sequence and triggers a warning on recent Python versions.
full_df_unigram['unigram'] = full_df_unigram['unigram'].str.replace(r'\W+', '', regex=True)
full_df_unigram['unigram'] = full_df_unigram['unigram'].replace('', np.nan)
full_df_unigram = full_df_unigram.dropna(subset=['unigram'])

In [14]:
# Concatenate the unigram and bigram rows into a single frame
full_df = pd.concat((full_df_unigram, full_df_bigram), axis=0)

In [15]:
# Prepare monthly counts and proportions for the combined dataframe (we want to be able to compare
# unigrams to bigrams directly)
full_df = full_df.set_index('date')
full_df.index = pd.DatetimeIndex(full_df.index)
# Sum the counts per calendar month and phrase
monthly_full = full_df.groupby([pd.Grouper(freq="M"), "unigram"]).sum().reset_index()
# After the aggregation above every (date, unigram) pair occurs exactly once,
# so the row's own count is the numerator. The denominator is the month's
# total over unigrams and bigrams together. The 'sum' alias is used instead of
# the builtin sum, which newer pandas versions warn about.
monthly_full['ratio'] = monthly_full['count'] / monthly_full.groupby('date')['count'].transform('sum')

In [16]:
# Get only data with enough datapoints for plotting
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of monthly_full (which triggers SettingWithCopyWarning).
monthly_full_2016 = monthly_full[monthly_full['date'] >= "2016-01-31"].copy()

# Normalize dates to beginning of month
monthly_full_2016['date'] = monthly_full_2016['date'] - pd.offsets.MonthBegin(1, normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  monthly_full_2016['date'] = monthly_full_2016['date'] - pd.offsets.MonthBegin(1, normalize=True)


In [17]:
# Function that checks whether the string is 1 or 2-gram. 
# so that we load it from appropriate db. For the time being not planning to do more than bigram

def check_string(string):
    """Classify a phrase by its number of whitespace-separated words.

    Parameters
    ----------
    string : str
        The phrase to classify.

    Returns
    -------
    str or None
        'unigram' for one word, 'bigram' for two words. For any other word
        count (including an empty string) a notice is printed and None is
        returned, instead of failing on an unassigned local variable.
    """
    labels = {1: 'unigram', 2: 'bigram'}
    ngram = len(string.split())
    dataframe = labels.get(ngram)
    if dataframe is None:
        print("We don't support more than that currently.")
    return dataframe

In [18]:
monthly_full_2016 = monthly_full_2016.rename(columns={'unigram' : 'gram'})

# Let's break the big df into two smaller ones (should be faster in production)
# Label every row with the result of check_string for its phrase. Mapping over
# the column replaces the row-wise DataFrame.apply and also works on an empty frame.
monthly_full_2016['dataframe'] = monthly_full_2016['gram'].map(check_string)

In [19]:
# Split the dataframes back to separate unigram and bigram
# `a` is True for rows labelled 'unigram'; every other row goes to the bigram frame
a = monthly_full_2016['dataframe'].eq('unigram')
unigram_full_df = monthly_full_2016.loc[a]
bigram_full_df = monthly_full_2016.loc[~a]

In [None]:
# Write both frames to CSV files under data/
for frame, destination in (
    (unigram_full_df, "data/unigram_full_df.csv"),
    (bigram_full_df, "data/bigram_full_df.csv"),
):
    frame.to_csv(destination)