<b>Notebook Introduction:</b> This notebook contains the functions used to process the text of the extracted subreddit data: tokenization, lemmatization, bigram detection, and removal of hyperlinks and other noise. The cleaned text is then used for topic modeling in topic_modeling.ipynb.

In [None]:
#standard libraries
import warnings
warnings.filterwarnings('ignore')
import os
import time
import numpy as np
import pandas as pd

#for text processing
from string import punctuation
punc_list = list(punctuation)
punc_list.remove('/') #needed for references to subreddits

import regex as re
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.lang.en.stop_words import STOP_WORDS

# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #working with gensim for this project to gain exp w/ this library
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [6]:
def create_base_df(path, city):
    """Consolidate the text of one subreddit into a single dataframe.

    Every file in `path` whose name contains `city` is read as a pickled dataframe
    (these would be the (1) posts df, (2) comments df and (3) replies df) and its
    "text" column is stacked into one frame.

    Parameters
    ----------
    path : str
        directory that holds the pickled extracts
    city : str
        substring identifying the subreddit's files, e.g. 'del'

    Returns
    -------
    pd.DataFrame
        a single "text" column. Rows equal to "[deleted]" are excluded; the index is
        numbered before that exclusion, so it can contain gaps.

    Raises
    ------
    ValueError
        if no file name in `path` contains `city`
    """
    # sorted so the row order does not depend on the order in which the filesystem lists the files
    matching_files = sorted(file for file in os.listdir(path) if city in file)
    if not matching_files:
        raise ValueError(f"no files in {path!r} have {city!r} in their name")

    # full paths are used instead of os.chdir(path) so the caller's working directory is left untouched
    # NOTE: read_pickle can execute arbitrary code - only point this at files produced by this project
    frames = [pd.read_pickle(os.path.join(path, file))[["text"]] for file in matching_files]

    # a single concat instead of growing the dataframe inside the loop
    cons_df = pd.concat(frames, ignore_index=True)
    cons_df = cons_df[cons_df["text"] != "[deleted]"] #excluding deleted posts

    return cons_df

In [7]:
def text_processing(x):
    """Clean one piece of Reddit text and return its lemmatized tokens.

    Steps: strip gif embeds and hyperlinks, run the spaCy pipeline (`nlp`), drop stop words
    and punctuation, lower-case the lemmas, re-join subreddit mentions ("r / name" -> "r/name")
    and finally drop tokens of 3 characters or fewer and tokens made only of punctuation.

    Parameters
    ----------
    x : str
        raw text of a post, comment or reply; anything that is not a string
        (e.g. None / NaN for a missing body) is treated as empty text

    Returns
    -------
    list of str
        the cleaned tokens, possibly empty
    """
    # a missing body would otherwise fail inside re.sub with a TypeError
    if not isinstance(x, str):
        return []

    #using regex to remove (1) gifs or (2) hyperlinks
    x = re.sub(r"!\[gif\]\(emote\|free_emotes_pack\|\w+\)|!\[gif\]\(giphy\|\w+\)", "", x)
    # \S+ stops at the first whitespace; the previous greedy "http.+\b" also deleted
    # all of the text that followed a link on the same line
    x = re.sub(r"https?://\S+", "", x) #hyperlinks
    # x = re.sub(r"[^A-Za-z\ +]", "", x) #non-alphabetic character based words

    # nothing left to tokenize (empty text or link/gif-only text)
    if not x.strip():
        return []

    #tokenization and lemmatization
    doc = nlp(x)
    token_str = " ".join(
        word.lemma_.lower().strip()
        for word in doc
        if word.text.lower() not in STOP_WORDS and word.text not in punc_list
    )

    #the tokens need to be tweaked a bit to include mentions of subreddits
    # (?<!\S) requires the "r" to be a token of its own, so strings such as
    # 'sambhajinagar / aurangabad' are no longer merged into 'sambhajinagar/aurangabad'
    token_list = re.sub(r"(?<!\S)r / ", "r/", token_str).split(" ")

    #dropping anything that is punctuations only or short words
    token_list = [token for token in token_list if len(token) > 3 and not re.fullmatch(r"[\W_]+", token)]

    return token_list

In [8]:
## appending bigrams
def create_bigrams(df):
    """Append the common bigrams found by gensim to each record's tokens.

    Something similar can be done with sklearn CountVectorizer too, but gensim is used here
    to get some experience with that library.

    Parameters
    ----------
    df : pd.DataFrame
        must have a "processed_text" column holding a list of tokens per row

    Returns
    -------
    pd.DataFrame
        the same dataframe (modified in place) with two extra columns:
        "bigrams" - the tokens after the phraser was applied to them
        "processed_text_bigrams" - the original tokens followed by the phrased tokens
        that are not already among the original tokens
    """
    bigram = Phrases(df["processed_text"], min_count=10)
    bigram_phraser = Phraser(bigram)

    phrased = [list(bigram_phraser[tokens]) for tokens in df["processed_text"]]
    # wrapped in a Series on df.index so each list lands in its own row as a single cell
    df["bigrams"] = pd.Series(phrased, index=df.index, dtype=object)

    # iterating over the two columns directly avoids the row-wise apply and the positional
    # x[0] / x[1] lookups on a string-labelled row, which newer pandas versions deprecate
    combined = []
    for tokens, with_bigrams in zip(df["processed_text"], phrased):
        seen = set(tokens) # set lookup instead of scanning the token list for every word
        combined.append(tokens + [word for word in with_bigrams if word not in seen])
    df["processed_text_bigrams"] = pd.Series(combined, index=df.index, dtype=object)

    # for record keeping, neither the original processed_text nor the bigrams column is dropped;
    # the dataframe is not too large so this should not be a problem

    return df

In [9]:
def combined_func_call(city, raw_path='/home/prabhur/reddit_project/data/raw/', data_path='/home/prabhur/reddit_project/data/'):
    """Run the whole text-processing pipeline for one subreddit and report how long it took.

    Parameters
    ----------
    city : str
        substring identifying the subreddit's files in `raw_path`, e.g. 'del'
    raw_path : str, optional
        directory with the raw pickled extracts; defaults to the project's raw data folder
    data_path : str, optional
        directory that becomes the working directory once the raw data is loaded;
        defaults to the project's data folder

    Returns
    -------
    pd.DataFrame
        the consolidated text together with the "processed_text" column and the
        columns added by create_bigrams
    """
    start = time.time()

    #creating the base dataframe
    df = create_base_df(raw_path, city)
    print(f"{df.shape[0]} records from this subreddit were analyzed")

    # the calling cells save their pickles with relative file names, so the working directory is set here
    os.chdir(data_path)
    #text processing
    df["processed_text"] = df["text"].apply(text_processing)

    #creating the bigrams
    df = create_bigrams(df)

    end = time.time()
    print(f"Text processing takes ~{round((end-start)/60,2)} mins")

    return df

In [6]:
##version 1:
# runs the full pipeline for the files whose names contain 'del' and saves the result;
# the relative file name resolves against the working directory set inside combined_func_call
print("Delhi")
del_df = combined_func_call('del')
del_df.to_pickle("del_df.pkl")

49940 records from this subreddit were analyzed
Text processing takes ~5.61 mins


In [11]:
# same pipeline for the files whose names contain 'mum' (presumably the Mumbai subreddit - confirm)
mum_df = combined_func_call('mum')
mum_df.to_pickle("mum_df.pkl")

48551 records from this subreddit were analyzed
Text processing takes ~5.89 mins


In [2]:
# same pipeline for the files whose names contain 'ban' (presumably the Bangalore subreddit - confirm)
ban_df = combined_func_call('ban')
ban_df.to_pickle("ban_df.pkl")

47843 records from this subreddit were analyzed
Text processing takes ~6.76 mins


In [3]:
##version 2
# same pipeline for the files whose names contain 'nyc' (presumably the New York City subreddit - confirm)
nyc_df = combined_func_call('nyc')
nyc_df.to_pickle("nyc_df.pkl")

56433 records from this subreddit were analyzed
Text processing takes ~7.32 mins


In [10]:
# same pipeline for the files whose names contain 'chi' (presumably the Chicago subreddit - confirm)
chi_df = combined_func_call('chi')
chi_df.to_pickle("chi_df.pkl")

# ... and for the files whose names contain 'bos' (presumably the Boston subreddit - confirm)
bos_df = combined_func_call('bos')
bos_df.to_pickle("bos_df.pkl")

51503 records from this subreddit were analyzed
Text processing takes ~6.64 mins
51350 records from this subreddit were analyzed
Text processing takes ~6.91 mins


In [None]:
# !pip install langdetect  #to check for english language
# ##https://pypi.org/project/langdetect/

# from langdetect import detect
# test = del_df.iloc[:3,:]
# # test
# y = detect(test.loc[0,"text"])
# y

##tried langdetect on both the original text and the text after tokenization and cleanup, but the results were poor in both cases.
##upon further research, this is likely because langdetect needs longer sentences to detect the language correctly.
# test["lang"] = test["text"].apply(lambda x: detect(x)) 
# test
# del_df["lang"].nunique()
# del_df)

# from string import punctuation
# # import nltk #too slow with tokenization
# # test["text2"] = test["text"].apply(lambda x: " ".join([word.strip().lower() for word in x.split(" ") if word not in punctuation])) #langdetect alone did not have good results
# test["lang2"] = test["text2"].apply(lambda x: detect(x)) #langdetect alone did not have good results
# test