    In this notebook, we remove unwanted characters, make all letters lowercase, apply PorterStemmer to each
    post title. We create a target column where 1 indicates the post is from AskWomen and 0 from AskMen.

In [1]:
import pandas as pd
import regex as re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('../data/data.csv')
df.columns

Index(['Unnamed: 0', 'index', 'source', 'text', 'title'], dtype='object')

In [3]:
df = df.drop(columns=['Unnamed: 0', 'index'])

In [4]:
df.isna().sum()

source      0
text      790
title       0
dtype: int64

In [5]:
df['source'].value_counts(normalize=True)

AskWomen    0.539007
AskMen      0.460993
Name: source, dtype: float64

In [6]:
df.shape

(1269, 3)

In [7]:
df['char_count'] = df['title'].apply(len)
df['word_count'] = df['title'].apply(lambda x: len(x.split()))
# Code adopted from Roy Kim's lesson on NLP

In [8]:
df.groupby(by=['source']).mean()

Unnamed: 0_level_0,char_count,word_count
source,Unnamed: 1_level_1,Unnamed: 2_level_1
AskMen,73.194872,13.882051
AskWomen,74.195906,13.624269


In [9]:
df['title'].tail()

1264     How have you dealt with the fear of abandonment?
1265    How do you know the difference between a date ...
1266              How did you discover your first orgasm?
1267                     What age did you meet your love?
1268           Men who settled down with ex hookers, how?
Name: title, dtype: object

In [10]:
df['title'][1265]

'How do you know the difference between a date and a hangout?'

In [11]:
df['title'] = [str(x).replace('&amp;', '&').replace('\n', '').replace("\'", "'") for x in df['title']]

In [21]:
df['title'] = [str(x).replace("SO", "sigoth").replace("significant other", "sigoth") for x in df['title']]

In [22]:
p_stemmer = PorterStemmer()
clean_titles = []
for title in df['title']:
    lower_letters = re.sub("[^a-zA-Z]", " ", title).replace("  ", " ").replace('   ', ' ')
    no_stops = [w.lower() for w in lower_letters.split() if w not in stopwords.words('english')]
    stemmed = [p_stemmer.stem(word) for word in no_stops]
    clean_titles.append(" ".join(stemmed))

In [23]:
len(clean_titles)

1269

In [24]:
df['clean_titles'] = pd.Series(clean_titles)

In [25]:
df['clean_titles'].tail()

1264          how dealt fear abandon
1265    how know differ date hangout
1266         how discov first orgasm
1267              what age meet love
1268             men settl ex hooker
Name: clean_titles, dtype: object

In [26]:
df = pd.get_dummies(df, columns=['source'], drop_first=True)

In [27]:
df['target'] = df['source_AskWomen']

In [28]:
df = df.drop(columns='source_AskWomen')

In [29]:
df.to_csv('../data/clean_data.csv', index=False)