# PART 2: Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import re

#### Retrieving the data harvested in Part 1 from CSV files into DataFrames

In [3]:
# Load the four harvested tables: comments and posts for AskWomen (AW) and AskMen (AM).
# The order of the paths matches the order of the names being unpacked below.
_source_csvs = (
    '../data/aw_com.csv',
    '../data/aw_pos.csv',
    '../data/am_com.csv',
    '../data/am_pos.csv',
)
AW_comments_df, AW_posts_df, AM_comments_df, AM_posts_df = (
    pd.read_csv(csv_path) for csv_path in _source_csvs
)

### Handling "noise" in data identified through visual inspection of raw data, tokens, and ngrams

In [4]:
# Regular expressions whose matches are deleted from the text, applied in this order.
# None of them uses ^ or $, so no re.MULTILINE flag is needed.
# NOTE(review): r'\\+' runs first and strips every backslash, so r'\\n+' and r'\\r+'
# (a literal backslash followed by n / r) can never match afterwards. They are kept
# to preserve the original pattern list. Real newline characters are not matched by
# any pattern here -- confirm whether that is intended.
_GARBAGE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\\+', r'\\n+', r'\\r+', r'http\S+', r'"+', r"'+", r"__+", r"--+",
        r"==+", r'\d+', r'/r\S+', r'\u200d', r'“+', r'”+',
    )
)

# Literal substrings deleted after the regex pass, applied in this order.
_GARBAGE_LITERALS = ('[removed]', '(', ')', 'removed', '_', '*')


def _scrub_text(value):
    """Return ``str(value)`` with every garbage pattern and literal removed."""
    text = str(value)
    for pattern in _GARBAGE_PATTERNS:
        text = pattern.sub("", text)
    for literal in _GARBAGE_LITERALS:
        text = text.replace(literal, "")
    return text


def clean_garbage():
    """Strip noise from the text columns of the four module-level DataFrames.

    Rewrites ``title_selftext`` in ``AM_posts_df`` / ``AW_posts_df`` and ``body`` in
    ``AM_comments_df`` / ``AW_comments_df``: first every match of the regex patterns
    is deleted, then every occurrence of the literal substrings.

    Missing values are left missing instead of being turned into the text ``'nan'``.
    Other non-string values are converted with ``str()`` before cleaning.

    Returns:
        None. The four DataFrames are modified through column assignment.
    """
    targets = (
        (AM_posts_df, 'title_selftext'),
        (AW_posts_df, 'title_selftext'),
        (AM_comments_df, 'body'),
        (AW_comments_df, 'body'),
    )
    for frame, column in targets:
        # One pass per column; na_action='ignore' keeps NaN/None out of _scrub_text.
        frame[column] = frame[column].map(_scrub_text, na_action='ignore')

In [5]:
# Run the scrub: rewrites the `title_selftext` / `body` columns of the four global DataFrames.
clean_garbage()


<br>Bigram and trigram analysis (see Part 3) revealed that the data includes **messages posted by Reddit moderators**.

In [6]:
def _moderator_mentions(text_column):
    """Count the entries of ``text_column`` that contain the substring 'moderator'."""
    return text_column.str.contains('moderator').sum()


# Report the counts for posts first, then comments, AskWomen before AskMen.
print(f"{_moderator_mentions(AW_posts_df['title_selftext'])} posts on AskWomen mention 'moderator'.")
print(f"{_moderator_mentions(AM_posts_df['title_selftext'])} posts on AskMen mention 'moderator'.")
print(f"{_moderator_mentions(AW_comments_df['body'])} comments on AskWomen mention 'moderator'.")
print(f"{_moderator_mentions(AM_comments_df['body'])} comments on AskMen mention 'moderator'.")

5 posts on AskWomen mention 'moderator'.
0 posts on AskMen mention 'moderator'.
311 comments on AskWomen mention 'moderator'.
56 comments on AskMen mention 'moderator'.


In [7]:
# Allow up to 50 columns in DataFrame previews.
pd.set_option('display.max_columns', 50)

# Preview the first AskWomen comments whose body is present and contains 'moderator'.
body_text = AW_comments_df['body']
mentions_moderator = body_text.notna() & body_text.str.contains('moderator')
AW_comments_df[mentions_moderator].head()

Unnamed: 0.1,Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,body,collapsed_because_crowd_control,comment_type,created_utc,distinguished,gildings,id,is_submitter,link_id,locked,no_follow,parent_id,permalink,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,author_cakeday
17,17,[],,kaeorin,,female,[],,♀,dark,text,t2_7ys0h,False,False,[],Your submission has been :\r\n\r\nThis sort of...,,,1609186922,moderator,{},ghbk621,False,t3_kly4qv,False,True,t3_kly4qv,/r/AskWomen/comments/kly4qv/what_to_do_when_i_...,1609188619,1,True,False,AskWomen,t5_2rxrw,,0,[],
24,24,[],,nevertruly,,female,[],8106c61a-c8aa-11e1-a771-12313b0ce1e2,♀,dark,text,t2_a6ayf,False,False,[],Removed as answers to common questions can be ...,,,1609186817,moderator,{},ghbjyru,False,t3_kly1p0,False,True,t3_kly1p0,/r/AskWomen/comments/kly1p0/whats_a_creepy_enc...,1609188506,1,True,False,AskWomen,t5_2rxrw,,0,[],
45,45,[],,msstark,,female,[],8106c61a-c8aa-11e1-a771-12313b0ce1e2,♀,dark,text,t2_h1slt,False,False,[],Your submission has been :\n\nQuestions asking...,,,1609186260,moderator,{},ghbivvg,False,t3_klxxtw,False,True,t3_klxxtw,/r/AskWomen/comments/klxxtw/would_a_girl_get_b...,1609187899,1,True,True,AskWomen,t5_2rxrw,,0,[],
46,46,[],,MostlyALurkerBefore,,,[],,,,text,t2_8tivr,False,True,[],This comment or post has been for derailing. ...,,,1609186250,moderator,{},ghbiv5v,False,t3_klkrzx,False,True,t1_ghbim1o,/r/AskWomen/comments/klkrzx/ladies_have_you_ev...,1609187889,1,True,False,AskWomen,t5_2rxrw,,0,[],
56,56,[],,kaeorin,,female,[],,♀,dark,text,t2_7ys0h,False,False,[],Your comment has been :\r\n\r\nDerailing the t...,,,1609186014,moderator,{},ghbidj1,False,t3_klw9zg,False,True,t1_ghba4su,/r/AskWomen/comments/klw9zg/women_who_have_had...,1609187623,1,True,False,AskWomen,t5_2rxrw,,0,[],


Comments posted by moderators can be identified using the **`distinguished`** column. The next steps remove those comments from the data used in the analysis and models.

In [8]:
# Keep only the comments whose `distinguished` value is not 'moderator'
# (rows with a missing value compare as "not equal" and are kept).
# An extra filter on author == 'AutoModerator' was left disabled here, marked as possibly redundant.
AW_comments_df = AW_comments_df.loc[AW_comments_df['distinguished'].ne('moderator')]
AM_comments_df = AM_comments_df.loc[AM_comments_df['distinguished'].ne('moderator')]

In [9]:
# Re-count how many comment bodies still contain 'moderator' in each table.
aw_remaining = AW_comments_df['body'].str.contains('moderator').sum()
am_remaining = AM_comments_df['body'].str.contains('moderator').sum()
print(f"{aw_remaining} comments on AskWomen mention 'moderator'.")
print(f"{am_remaining} comments on AskMen mention 'moderator'.")

1 comments on AskWomen mention 'moderator'.
0 comments on AskMen mention 'moderator'.


#### Exporting cleaned data to csv files for retrieval in subsequent parts

In [10]:
# Persist the cleaned tables for the later parts of the project.
# index=False: the row index carries no data, and writing it adds one more
# 'Unnamed: 0' column every time a file is read back and saved again.
# NOTE(review): these are the same paths the notebook loaded its input from, so the
# harvested files are overwritten with the cleaned data -- confirm this is intended.
AW_comments_df.to_csv('../data/aw_com.csv', index=False)
AW_posts_df.to_csv('../data/aw_pos.csv', index=False)
AM_comments_df.to_csv('../data/am_com.csv', index=False)
AM_posts_df.to_csv('../data/am_pos.csv', index=False)