In [1]:
# Standard library and core data-handling imports.
import warnings
warnings.filterwarnings('ignore')  # suppresses every warning for the rest of the session
import os
# Every relative path used later (os.listdir(), pd.read_pickle, DataFrame.to_pickle)
# resolves against this directory.
# NOTE(review): hard-coded absolute, machine-specific path.
os.chdir('/home/prabhur/reddit_project/data')
import time
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Text processing.
import regex as re  # the third-party `regex` package, bound to the name `re`
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()  # module-level spaCy pipeline; text_processing() calls it
from spacy.lang.en.stop_words import STOP_WORDS

# gensim dictionary and LDA model classes.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [2]:
def create_base_df(city, exclude=None):
    """Collect the ``text`` column of every pickle for ``city`` into one DataFrame.

    Scans the current working directory for files whose name contains ``city``,
    reads each with ``pd.read_pickle`` and stacks their ``text`` columns.

    Parameters
    ----------
    city : str
        Substring a file name must contain to be loaded (e.g. ``'del'``).
    exclude : str or iterable of str, optional
        Exact file names to skip. Defaults to ``("<city>_df.pkl",)``: the
        notebook saves its processed frame under that name in this same
        directory, and re-reading it on a second run would duplicate every row.
        Pass ``exclude=()`` to load every matching file.

    Returns
    -------
    pandas.DataFrame
        A single ``"text"`` column with the rows equal to ``"[deleted]"`` removed.
        The index is the position in the concatenated data *before* that
        filter, so it may contain gaps. Empty (but with a ``"text"`` column)
        when no file matches.
    """
    if exclude is None:
        exclude = (f"{city}_df.pkl",)
    elif isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    # os.listdir() returns names in arbitrary order; sort so the row order (and
    # therefore the index) is the same on every run.
    matching_files = sorted(
        name for name in os.listdir() if city in name and name not in skipped
    )
    if not matching_files:
        return pd.DataFrame(columns=["text"])

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # Gather all pieces first and concatenate once instead of growing a frame
    # inside the loop.
    pieces = [pd.read_pickle(name)["text"] for name in matching_files]
    cons_df = pd.concat(pieces, ignore_index=True).to_frame(name="text")

    return cons_df[cons_df["text"] != "[deleted]"]  # deleted posts

In [4]:
def text_processing(x):
    """Return the lower-cased lemmas of the non-stop-word tokens in ``x``.

    Cleaning steps, in order:
      1. remove Reddit inline gif/emote markup (``![gif](giphy|...)`` and
         ``![gif](emote|free_emotes_pack|...)``);
      2. replace hyperlinks with a space;
      3. collapse every whitespace run (newlines and tabs included) to one space;
      4. delete every character that is not an ASCII letter or a space.

    The cleaned string is then tokenized by the module-level spaCy pipeline
    ``nlp``. Tokens whose lower-cased text is in ``STOP_WORDS``, or whose text
    is a single character, are dropped.

    Parameters
    ----------
    x : str
        Raw comment text. Any non-string value (``None``, ``NaN``, ...) is
        treated as empty.

    Returns
    -------
    list of str
        Lower-cased, stripped lemmas; ``[]`` when nothing usable remains.
    """
    # Missing values would make re.sub raise a TypeError.
    if not isinstance(x, str):
        return []

    # gifs or emoticons
    x = re.sub(r"!\[gif\]\(emote\|free_emotes_pack\|\w+\)|!\[gif\]\(giphy\|\w+\)", "", x)
    # Hyperlinks: stop at the first whitespace. The previous greedy `http.+\b`
    # also swallowed everything that followed the URL on the same line.
    x = re.sub(r"https?://\S+", " ", x)
    # Normalise whitespace first so words on adjacent lines are not glued
    # together when the non-letter characters are removed below.
    x = re.sub(r"\s+", " ", x)
    # Keep only letters and spaces. The old class `[^A-Za-z\ +]` whitelisted a
    # literal '+' as well, which looks like a misplaced quantifier.
    x = re.sub(r"[^A-Za-z ]+", "", x)

    # Nothing left to tokenize -- skip the spaCy call.
    if not x.strip():
        return []

    # tokenization and lemmatization
    doc = nlp(x)
    return [
        str(token.lemma_).lower().strip()
        for token in doc
        if token.text.lower() not in STOP_WORDS and len(token.text.strip()) > 1
    ]

In [18]:
# Load the raw comments for the 'del' key, then report how many rows were found.
del_df = create_base_df('del')
print(del_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
del_df["processed_text"] = del_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
del_df.to_pickle("del_df.pkl")
del_df.sample(3)

(49940, 1)
Text processing takes ~5.21 mins


Unnamed: 0,text,processed_text
6934,Chutiyap advice ghani,"[chutiyap, advice, ghani]"
11193,Congratulations bro but yaha Laxmi nagar se b ...,"[congratulation, bro, yaha, laxmi, nagar, se, ..."
11443,Dosto gaandhi ji ki ashtiya beh gyi iss pani m...,"[dosto, gaandhi, ji, ki, ashtiya, beh, gyi, is..."


In [29]:
# Load the raw comments for the 'mum' key, then report how many rows were found.
mum_df = create_base_df('mum')
print(mum_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
mum_df["processed_text"] = mum_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
mum_df.to_pickle("mum_df.pkl")
mum_df.sample(3)

(48551, 1)
Text processing takes ~5.5 mins


Unnamed: 0,text,processed_text
5825,Where are these recipes?,[recipe]
24436,"Can relate, have read the fountainhead by Ayn ...","[relate, read, fountainhead, ayn, rand]"
18160,Colaba is usually clean. You'll find people ap...,"[colaba, usually, clean, ll, find, people, apl..."


In [19]:
# Load the raw comments for the 'ban' key, then report how many rows were found.
ban_df = create_base_df('ban')
print(ban_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
ban_df["processed_text"] = ban_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
ban_df.to_pickle("ban_df.pkl")
ban_df.sample(3)

(47843, 1)
Text processing takes ~6.34 mins


Unnamed: 0,text,processed_text
13804,Bro metro is seriously one of the recent best ...,"[bro, metro, seriously, recent, good, investme..."
46507,Builder's in Bangalore are using 50% of water ...,"[builder, bangalore, water, guess, new, project]"
4511,Apply Kirchhoff's law.,"[apply, kirchhoffs, law]"


In [20]:
# Load the raw comments for the 'bos' key, then report how many rows were found.
bos_df = create_base_df('bos')
print(bos_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
bos_df["processed_text"] = bos_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
bos_df.to_pickle("bos_df.pkl")
bos_df.sample(3)

(51350, 1)
Text processing takes ~6.38 mins


Unnamed: 0,text,processed_text
13300,I'm doing OK on masks but I wanted to say you ...,"[ok, mask, want, awesome, holy, heck, honestly..."
31400,After overwhelmingly negative reaction from th...,"[overwhelmingly, negative, reaction, public, m..."
36805,"Preditors usually prey close to home, he may l...","[preditor, usually, prey, close, home, live, c..."


In [21]:
# Load the raw comments for the 'chi' key, then report how many rows were found.
chi_df = create_base_df('chi')
print(chi_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
chi_df["processed_text"] = chi_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
chi_df.to_pickle("chi_df.pkl")
chi_df.sample(3)

(51503, 1)
Text processing takes ~6.15 mins


Unnamed: 0,text,processed_text
10197,I wish I knew Karl Pilkington was in town,"[wish, know, karl, pilkington, town]"
37816,"Well, let’s not be too snide. There have been ...","[lets, snide, roam, band, hundred, teen, riot,..."
17795,Reported for sexual content,"[report, sexual, content]"


In [22]:
# Load the raw comments for the 'nyc' key, then report how many rows were found.
nyc_df = create_base_df('nyc')
print(nyc_df.shape)

# Tokenize/lemmatize every comment and time the whole pass.
start = time.time()
nyc_df["processed_text"] = nyc_df["text"].apply(text_processing)
end = time.time()
elapsed_mins = round((end - start) / 60, 2)
print(f"Text processing takes ~{elapsed_mins} mins")

# Cache the processed frame in the working directory and preview three random rows.
nyc_df.to_pickle("nyc_df.pkl")
nyc_df.sample(3)

(56433, 1)
Text processing takes ~6.84 mins


Unnamed: 0,text,processed_text
42273,:: nervously raises hand :: I live in a hewse,"[nervously, raise, hand, live, hewse]"
47984,Karate,[karate]
44984,I don't know if it was my luck about a decade ...,"[not, know, luck, decade, ago, certain, coin, ..."


#### Pending (TODO)
- topic modeling
- visualization (from assignment)
- pylab?

In [22]:
# !pip install langdetect  #to check for english language
# ##https://pypi.org/project/langdetect/

# from langdetect import detect
# test = del_df.iloc[:3,:]
# # test
# y = detect(test.loc[0,"text"])
# y

##Tried langdetect on both the original text and the tokenized/cleaned-up text, but it did not give good results.
##Further research suggests this may be because langdetect needs longer sentences to detect the language correctly.
# test["lang"] = test["text"].apply(lambda x: detect(x)) 
# test
# del_df["lang"].nunique()
# del_df)

# from string import punctuation
# # import nltk #too slow with tokenization
# # test["text2"] = test["text"].apply(lambda x: " ".join([word.strip().lower() for word in x.split(" ") if word not in punctuation])) #langdetect alone did not have good results
# test["lang2"] = test["text2"].apply(lambda x: detect(x)) #langdetect alone did not have good results
# test