## Data Processing (02)

**Import Libraries**

In [1]:
# standards
import pandas as pd
import numpy as np
from pprint import pprint
from sara import eda, eda_unique
import nltk as nltk

# model
from sklearn.feature_extraction.text import CountVectorizer

# nlp
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import regex as re

# Silence warnings for the rest of the session.
# NOTE: simplefilter("ignore") is called without a category, so it suppresses
# every warning class, not only FutureWarning.
import sys

# Skipped when the interpreter was started with explicit -W options
# (sys.warnoptions is then non-empty), so those options keep control.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

**Read in Cleaned Data**

In [2]:
# Load the cleaned posts (path is relative to the notebook's working directory).
# The cells below use its 'title', 'selftext' and 'subreddit' columns.
df_reddit_cleaned = pd.read_csv('data/reddit_cleaned.csv')

**Drop Nulls (Again)**

In [3]:
# Discard, in place, every row that is missing a title or a selftext
# (one call with both columns removes the same rows as two successive drops).
df_reddit_cleaned.dropna(subset=['title', 'selftext'], inplace=True)

**Review Top N Words in Corpus**

In [4]:
# word tokenizer: every maximal run of \w characters becomes one token
tokenizer = RegexpTokenizer(r'\w+')

# keep each post's token list next to its raw text
df_reddit_cleaned['title_tokens'] = df_reddit_cleaned['title'].apply(tokenizer.tokenize)
df_reddit_cleaned['selftext_tokens'] = df_reddit_cleaned['selftext'].apply(tokenizer.tokenize)

# titles: flat list of all tokens, token count per post, sorted unique vocabulary
title_words = [
    token
    for post_tokens in df_reddit_cleaned['title_tokens']
    for token in post_tokens
]
title_lengths = list(map(len, df_reddit_cleaned['title_tokens']))
title_vocab = sorted(set(title_words))

# selftexts: same three summaries
selftext_words = [
    token
    for post_tokens in df_reddit_cleaned['selftext_tokens']
    for token in post_tokens
]
selftext_lengths = list(map(len, df_reddit_cleaned['selftext_tokens']))
selftext_vocab = sorted(set(selftext_words))

# get top word frequencies from a collection of documents (titles or selftexts)
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent words in ``corpus`` with their counts.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count words in (e.g. a pandas Series of titles).
    n : int or None, default None
        Number of (word, count) pairs to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, int) tuples
        Pairs sorted by count, highest first. Words are whatever a
        default-configured ``CountVectorizer`` extracts from the documents.
    """
    vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and counts in a single pass over the
    # corpus instead of tokenizing every document twice (fit, then transform).
    bag_of_words = vectorizer.fit_transform(corpus)

    # Column sums = total occurrences of each vocabulary term. Flatten to 1-D so
    # the lookup below does not depend on the sum being returned as a 2-D matrix.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()

    words_freq = [(word, totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# per-subreddit subsets of the posts
# (despite the "mask_" prefix these are filtered DataFrames, not boolean masks)
mask_cooking = df_reddit_cleaned.loc[df_reddit_cleaned['subreddit'].eq('Cooking')]
mask_baking = df_reddit_cleaned.loc[df_reddit_cleaned['subreddit'].eq('Baking')]

# 20 most frequent title words: whole corpus, then each subreddit on its own
all_top_20 = get_top_n_words(df_reddit_cleaned['title'], n=20)
cooking_top_20 = get_top_n_words(mask_cooking['title'], n=20)
baking_top_20 = get_top_n_words(mask_baking['title'], n=20)

# reference: https://github.com/hundredblocks/concrete_NLP_tutorial/blob/master/NLP_notebook.ipynb

**Decide on Additional Stopwords**

In [5]:
# Base list: NLTK's English stopwords.
allstopwords = nltk.corpus.stopwords.words('english')

# Additional stopwords chosen for this corpus.
# NOTE(review): doc_processed reduces a "[removed]" placeholder to the word
# "removed", which 'remove' does not match -- confirm whether 'removed' was intended.
new_stops = ['recipe','cooking','make',
             'help','chicken','recipes',
             'cook','best','good',
             'cake','need','use',
             'food','question','sauce',
             'making','looking','ideas',
             'anyone','made','http','https','remove']

# extend() appends all additions at once and leaves no loop variable behind
# in the notebook namespace.
allstopwords.extend(new_stops)

**Process Words**

In [6]:
def doc_processed(doc_raw):
    """Clean one raw Reddit text field (title or selftext) and return it as a string.

    Steps, in order: strip HTML markup, replace every character that is not an
    ASCII letter with a space, lowercase, split on whitespace, drop the words
    found in the module-level ``allstopwords`` list, and re-join with spaces.

    Parameters
    ----------
    doc_raw : str
        Raw text of the post field.

    Returns
    -------
    str
        The remaining words separated by single spaces; empty if nothing remains.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    doc_text = BeautifulSoup(doc_raw, 'html.parser').get_text()

    # Remove non-letters (digits, punctuation, non-ASCII characters)
    doc_letters = re.sub('[^a-zA-Z]', ' ', doc_text)

    # Convert to lower case, split into individual words
    doc_words = doc_letters.lower().split()

    # Set membership is constant-time; allstopwords itself is a list
    stops = set(allstopwords)

    # Remove stopwords
    doc_meaningful_words = [w for w in doc_words if w not in stops]

    # Join the surviving words back into one string
    return ' '.join(doc_meaningful_words)

In [7]:
# Overwrite the raw title text with its cleaned form
df_reddit_cleaned['title'] = df_reddit_cleaned['title'].map(doc_processed)

# Same treatment for the post body
df_reddit_cleaned['selftext'] = df_reddit_cleaned['selftext'].map(doc_processed)

**Convert Subreddit Column to Binary**

In [8]:
# Encode the target column: Cooking -> 1, Baking -> 0.
subreddit_labels = {'Cooking': 1, 'Baking': 0}

# Guard against re-running this cell: .map() on the already encoded column
# would find no matching keys and turn every label into NaN. Only map while
# subreddit names are still present.
if df_reddit_cleaned['subreddit'].isin(list(subreddit_labels)).any():
    df_reddit_cleaned['subreddit'] = df_reddit_cleaned['subreddit'].map(subreddit_labels)
df_reddit_cleaned['subreddit']

0         1
1         1
2         1
3         1
4         1
         ..
110373    0
110374    0
110375    0
110376    0
110377    0
Name: subreddit, Length: 110378, dtype: int64

**Write Processed Data to CSV**

In [9]:
# Write the processed frame to disk without the index column. Every current
# column is written, including the title_tokens / selftext_tokens list columns
# added during the top-N word review.
# NOTE(review): texts that doc_processed reduced to an empty string are presumably
# read back as NaN by pd.read_csv -- confirm downstream notebooks handle them.
df_reddit_cleaned.to_csv('data/reddit_processed.csv',index=False)

**Notes**

- Tried lemmatization
- Tried Porter stemming
- Tried stemming

None of these is applied in the processing function above.