# Imports

In [426]:
import pandas as pd
import pickle
import nltk
import numpy as np
import string

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

In [427]:
# Open all the pickle files and return the combined df
def unpickle_df(pickle_names):
    """Load every pickled DataFrame in ``pickle_names`` and combine them.

    Parameters
    ----------
    pickle_names : iterable of str
        Paths of the pickle files to read. The combined data must contain a
        ``created_utc`` column, because the result is sorted by it.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, sorted by ``created_utc`` and re-indexed
        0..n-1. An empty DataFrame is returned when no names are given.
    """
    frames = []
    for pickle_name in pickle_names:
        # SECURITY: pickle.load can execute arbitrary code while loading, so
        # only open files that come from a trusted source.
        with open(pickle_name, 'rb') as picklefile:
            frames.append(pickle.load(picklefile))

    # Nothing to combine (and no 'created_utc' column to sort by)
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame file by file:
    # DataFrame.append was removed in pandas 2.0 and re-copied all
    # accumulated rows on every call.
    df = pd.concat(frames, ignore_index=True)

    # Sort by time, then drop the now-shuffled index
    return df.sort_values(by=['created_utc']).reset_index(drop=True)

In [428]:
# Unpickle the dfs. Includes data from 1/1/2018 to 1/1/2019.
# Every file is named '<start>_to_<end>'; consecutive boundaries below form
# those pairs. The last file covers 2018-08-01 to 2019-01-01 in one piece.
range_boundaries = ['2018-01-01',
                    '2018-02-01',
                    '2018-03-01',
                    '2018-04-01',
                    '2018-05-01',
                    '2018-06-01',
                    '2018-07-01',
                    '2018-08-01',
                    '2019-01-01']

pickle_names = ['{}_to_{}'.format(start, end)
                for start, end in zip(range_boundaries, range_boundaries[1:])]

df = unpickle_df(pickle_names)

In [429]:
df.sample(3)

Unnamed: 0,subreddit,submission_id,created_time,created_utc,title,selftext,num_comments,score,gilded,comments
388,gadgets,7ovdbr,2018-01-08 02:26:10,1515378370,Habey BIS-6862 &amp; Shuttle XPC Slim DS77U Fa...,,0,1,,
2487,gadgets,7we7b9,2018-02-09 15:35:04,1518190504,Ruiz insta al PSOE de Torrox a tomar nota del ...,,0,1,,[]
11043,gadgets,8yask0,2018-07-12 15:33:55,1531409635,Torre del Mar ya se prepara para su procesión ...,,0,1,0.0,


In [430]:
# Combines title and self_text (body) into a single document
def post_to_doc(df):
    """Build one text document per row from its title and body.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title`` and ``selftext``.

    Returns
    -------
    pandas.Series
        ``title + ' ' + selftext`` for every row, aligned with ``df``'s index.
    """
    # A missing title or selftext is NaN, and str + NaN evaluates to NaN,
    # which would throw away the part that is present. Treat missing parts
    # as empty strings instead.
    return df['title'].fillna('') + ' ' + df['selftext'].fillna('')

In [431]:
# Derive the document frame from df without touching df itself:
# add the combined 'post' column (title + selftext), then drop the
# subreddit, title, and selftext columns.
doc_df = (
    df
    .assign(post=post_to_doc(df))
    .drop(columns=['title', 'selftext', 'subreddit'])
)

In [432]:
doc_df.head(1)

Unnamed: 0,submission_id,created_time,created_utc,num_comments,score,gilded,comments,post
0,7ncgte,2018-01-01 01:41:09,1514770869,1,1,,"[Hello, /u/adarkthirty! Thanks for contributin...","Versatile, self-deploying, relocatable buildings"


In [433]:
# Create new row for every comment
def make_comment_post(df):
    """Return ``df`` plus one additional row for every comment of every post.

    Each comment becomes its own "post" row that inherits ``submission_id``,
    ``created_time`` and ``created_utc`` from its parent row, stores the
    comment in ``post`` and has NaN in ``num_comments``, ``score``,
    ``gilded`` and ``comments``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``submission_id``, ``created_time``, ``created_utc``,
        ``num_comments``, ``comments`` and ``post``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Original rows and comment rows, sorted by ``created_utc``. The index
        is not reset after sorting.
    """
    comment_rows = []

    for _, row in df.iterrows():
        comments = row.comments

        # Only rows that report comments can contribute new rows
        if not (row.num_comments > 0):
            continue

        # A missing comment list may be stored as None or as NaN (a float);
        # neither can be iterated.
        if comments is None or isinstance(comments, float):
            continue

        for comment in comments:
            comment_rows.append({'submission_id': row.submission_id,
                                 'created_time': row.created_time,
                                 'created_utc': row.created_utc,
                                 'num_comments': np.nan,
                                 'score': np.nan,
                                 'gilded': np.nan,
                                 'comments': np.nan,
                                 'post': comment})

    if comment_rows:
        # Concatenate once: DataFrame.append was removed in pandas 2.0 and
        # re-copied the whole frame for every single comment.
        output_df = pd.concat([df, pd.DataFrame(comment_rows)],
                              ignore_index=True)
    else:
        output_df = df.copy()

    # Comments share their parent's created_utc. A stable sort keeps the
    # parent row ahead of its comments and keeps comments in their original
    # order, which the default (unstable) sort does not guarantee.
    return output_df.sort_values(by=['created_utc'], kind='mergesort')

In [434]:
# Make all comments into their own rows.
# NOTE: not idempotent -- the post rows keep their `comments` list, so
# re-running this cell on the already-expanded doc_df appends the same
# comment rows a second time.
doc_df = make_comment_post(doc_df)

In [435]:
# Renumber the rows 0..n-1 after the time sort, discarding the old index
doc_df = doc_df.reset_index(drop=True)

In [436]:
doc_df.head(3)

Unnamed: 0,submission_id,created_time,created_utc,num_comments,score,gilded,comments,post
0,7ncgte,2018-01-01 01:41:09,1514770869,1.0,1.0,,"[Hello, /u/adarkthirty! Thanks for contributin...","Versatile, self-deploying, relocatable buildings"
1,7ncgte,2018-01-01 01:41:09,1514770869,,,,,"Hello, /u/adarkthirty! Thanks for contributing..."
2,7ncnmp,2018-01-01 02:20:00,1514773200,,,,,"Hello, /u/TommyDuquette! Thanks for contribut..."


In [437]:
doc_df.post.iloc[1]

"Hello, /u/adarkthirty! Thanks for contributing. However, your submission has been removed since the website you submitted is not on our [white-listed domains list](https://www.reddit.com/r/gadgets/wiki/domainlist) and therefore held for review. \n\n## How do I get my submissions approved? \n\nTo get your submission approved simply follow these steps: \n\n1. Make sure your submission follows the rules as found in [the sidebar](https://www.reddit.com/r/gadgets/wiki/config/sidebar).\n2. After making sure it follows the rules, **[message the moderators.](https://www.reddit.com/message/compose?to=/r/gadgets&amp;subject=Request to review this submission by /u/adarkthirty&amp;message=I would like to have my submission reviewed: [Versatile, self-deploying, relocatable buildings]( https://www.reddit.com/r/gadgets/comments/7ncgte/versatile_selfdeploying_relocatable_buildings/\\))**\n\nThat's it! We will review your submission and if it is in line with our rules and guidelines approve it for you

In [438]:
if 'Thanks for contributing' in doc_df.post.iloc[1]:
    print(True)

True


In [454]:
# Delete any rows which only contain moderator or removed or deleted comments
def delete_mod_comments(df):
    """Drop rows whose text is moderator/bot boilerplate or a removed/deleted stub.

    A row is dropped only when both conditions hold:

    * ``num_comments >= 1`` is not true, i.e. the count is below 1 or
      missing (NaN), and
    * ``post`` starts with ``'Hello, /u/'``, ``'[removed]'`` or
      ``'[deleted]'``, or contains one of the boilerplate phrases below.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``num_comments`` and ``post``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the matching rows; the index is preserved.
    """
    boilerplate_prefixes = ('Hello, /u/', '[removed]', '[deleted]')
    boilerplate_phrases = ('you have any questions about this',
                           'superthread',
                           'automatically removed',
                           'am a bot')

    def is_boilerplate(post):
        # A non-string post (e.g. NaN) has no text to match; keep the row
        # instead of failing on a missing .startswith attribute.
        if not isinstance(post, str):
            return False
        return (post.startswith(boilerplate_prefixes)
                or any(phrase in post for phrase in boilerplate_phrases))

    # Negating `>= 1` (rather than testing `< 1`) also selects rows whose
    # num_comments is NaN.
    lacks_comments = ~(df['num_comments'] >= 1)
    boilerplate = df['post'].map(is_boilerplate).astype(bool)

    # Filter with a per-row mask rather than dropping index labels, so a
    # duplicated label cannot take unrelated rows with it.
    return df.loc[~(lacks_comments & boilerplate)].copy()

In [455]:
# Remove moderator, deleted, and removed comments
doc_df = delete_mod_comments(doc_df)

In [456]:
# Frame that feeds the text cleaning: keep the identifying columns and the
# post text, leave out the columns that are irrelevant from here on.
irrelevant_columns = ['num_comments', 'score', 'gilded', 'comments']
precleaned_df = doc_df.drop(columns=irrelevant_columns)

In [457]:
precleaned_df.head()

Unnamed: 0,submission_id,created_time,created_utc,post
0,7ncgte,2018-01-01 01:41:09,1514770869,"Versatile, self-deploying, relocatable buildings"
3,7ncnmp,2018-01-01 02:20:00,1514773200,INSIDER. High-Tech Boxing Sensor Tracks Punche...
4,7ncy6w,2018-01-01 03:23:21,1514777001,CNN Viewers Couldn’t Wait For Drunk Don Lemon ...
7,7ncyjm,2018-01-01 03:25:45,1514777145,$50 VR Headset! EVO VR Headset
9,7ndboh,2018-01-01 04:51:10,1514782270,Got some Apple stuff for Christmas? These are ...


In [462]:
precleaned_df.shape

(41211, 4)

# Text Pre-Processing

In [458]:
def clean_text(text):
    """Normalise one post/comment into lowercase text without links or handles.

    Steps, in order:

    1. convert the input with ``str()`` (so NaN becomes ``'nan'``) and
       lowercase it,
    2. replace newlines with spaces,
    3. remove ``/u/<name>`` user references and ``/r/<name>`` subreddit
       references,
    4. remove URLs (``http(s)://...`` and ``www....``),
    5. remove every character outside the range ``\\x00``-``\\x7A``.

    NOTE(review): the range in step 5 ends at ``'z'`` (0x7A), so besides
    non-ASCII characters it also strips ``{``, ``|``, ``}``, ``~`` and DEL.
    Confirm whether ``\\x7F`` was intended.

    No tokenisation, stop-word removal, stemming or lemmatisation is done
    here.

    Parameters
    ----------
    text : object
        The raw text; any non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep regex escapes such as `\.`, `\/` and `\s` from being
    # read as (invalid) Python string escapes.
    url_pattern = (
        r'https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]+\.[^\s]{2,}'
    )

    # Lowercase and flatten newlines
    text = str(text).lower().replace('\n', ' ')

    # Handles are removed before URLs, so a '/r/<name>' segment inside a
    # link is cut out first and the shortened link is removed afterwards.
    text = re.sub(r'/u/[A-Za-z0-9_-]+', '', text)  # Remove usernames
    text = re.sub(r'/r/[A-Za-z0-9_-]+', '', text)  # Remove subreddit names

    text = re.sub(url_pattern, '', text)  # Remove URLs

    text = re.sub(r'([^\x00-\x7A])+', '', text)  # Remove characters above 'z'

    return text

In [459]:
# Create a df with cleaned text: same rows as precleaned_df, with the raw
# 'post' column replaced by its cleaned counterpart 'text'.
cleaned_df = (
    precleaned_df
    .assign(text=precleaned_df['post'].apply(clean_text))
    .drop(columns='post')
)

In [460]:
cleaned_df

Unnamed: 0,submission_id,created_time,created_utc,text
0,7ncgte,2018-01-01 01:41:09,1514770869,"versatile, self-deploying, relocatable buildings"
3,7ncnmp,2018-01-01 02:20:00,1514773200,insider. high-tech boxing sensor tracks punche...
4,7ncy6w,2018-01-01 03:23:21,1514777001,cnn viewers couldnt wait for drunk don lemon t...
7,7ncyjm,2018-01-01 03:25:45,1514777145,$50 vr headset! evo vr headset
9,7ndboh,2018-01-01 04:51:10,1514782270,got some apple stuff for christmas? these are ...
10,7ndbzp,2018-01-01 04:53:14,1514782394,kim jong-un taunts us with nuclear button in n...
11,7ndtjc,2018-01-01 06:49:36,1514789376,5 cool gadgets that work on green energy
13,7ndw82,2018-01-01 07:09:19,1514790559,sonos play 1. it's designed to be used in the...
15,7ndw82,2018-01-01 07:09:19,1514790559,is it a bad time to get an iphone x since ther...
16,7ndw82,2018-01-01 07:09:19,1514790559,"no, well, not yet anyway. i want to use it for..."


# Export Cleaned Data

In [461]:
# Pickle
import pickle

# Persist the cleaned dataframe so another project file can load it
with open('cleaned_text_11-14', 'wb') as cleaned_out:
    pickle.dump(cleaned_df, cleaned_out)

In [463]:
# Persist the pre-cleaning dataframe as well, for use in another project file
with open('precleaned_text_11-14', 'wb') as precleaned_out:
    pickle.dump(precleaned_df, precleaned_out)