### Step 1: Load the dataset into a pandas dataframe.

In [45]:
import pandas as pd
import re as re

In [47]:
df = pd.read_csv('stackexchange_812k.csv.gz', compression='gzip')

In [48]:
df.sample(10)

Unnamed: 0,post_id,parent_id,comment_id,text,category
525789,147488,,705812.0,"So far, a collegue simulated it, but we've fou...",comment
340649,48546,,94232.0,When you say that we only have estimates of X ...,comment
772334,247751,,471540.0,Check http://stats.stackexchange.com/questions...,comment
594577,318471,,604745.0,@AdamO while most people use the extreme case ...,comment
361947,63802,,156736.0,You can test those ICCs by simply testing the ...,comment
196133,83552,83302.0,,"<p>In general, you can run a feature selection...",post
386209,79399,,155713.0,Did you look at the answer by Jeromy Anglim in...,comment
704372,348988,,658042.0,There is also this paper https://arxiv.org/abs...,comment
207815,114337,,,<p>I need to place a Laplace prior on a random...,post
327919,38493,,289230.0,"this is old, but `blme::bglmer` should be ment...",comment


### Step 2: Use regular expressions to remove elements that are not words such as HTML tags, LaTeX expressions, URLs, digits, line returns, and so on.

In [49]:
HTML_RE = re.compile(r'<[^>]+>') #removes html tags
URL_RE = re.compile(r'https\S') #removes urls
MISC_RE = re.compile(r'\d|\n') #removes digits and line returns 

df['text'] = df['text'].apply(lambda x: HTML_RE.sub("", x))
df['text'] = df['text'].apply(lambda x: URL_RE.sub("", x))
df['text'] = df['text'].apply(lambda x: MISC_RE.sub("", x))

In [50]:
df.head(10)

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis op...,title
3,4,,,Assessing the significance of differences in d...,title
4,6,,,The Two Cultures: statistics vs. machine learn...,title
5,7,,,Locating freely available data samples,title
6,8,,,So how many staticians *does* it take to screw...,title
7,10,,,Under what conditions should Likert scales be ...,title
8,11,,,Multivariate Interpolation Approaches,title
9,21,,,Forecasting demographic census,title


In [51]:
df.shape

(812132, 5)

### Step 3: Remove missing values for texts

In [52]:
# Remove missing values for texts
empty_texts = df[df['text'] == "" ].index
# Delete these row indexes from dataFrame
df.drop(empty_texts, inplace=True)

In [53]:
df.shape

(812129, 5)

### Step 4: Remove tects that are extremely large or too short to bring any information to the model


In [54]:
short_texts = df[df['text'].apply(lambda x: len(x) < 5)].index
df.drop(short_texts, inplace=True)

In [55]:
df.shape

(812127, 5)

In [56]:
long_texts = df[df['text'].apply(lambda x: len(x) > 50)].index
df.drop(long_texts, inplace=True)

In [57]:
df.shape

(75730, 5)

### Step 5: Tokenize texts

In [58]:
import spacy

In [59]:
nlp = spacy.load('en')
df['text'] = [nlp(text) for text in df.text]
df.sample(3)

Unnamed: 0,post_id,parent_id,comment_id,text,category
284841,14572,,25968.0,"(@Jeromy, Anglim, :, Individuals, ..., Sample,...",comment
45340,157074,,,"(Inquiry, on, comparing, quadratic, regression...",title
68495,324361,,,"(Different, low, resolution, images, using, CNN)",title


### Step 6: Export to CSV

In [60]:
df.to_csv('cleaned_dataset.csv')