In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re

### Loading Training and Validation Data

In [2]:
# Read the raw training and validation splits (in that order) into DataFrames.
training_data, validation_data = map(
    pd.read_csv,
    ("./data/raw/train.csv", "./data/raw/valid.csv"),
)

### Data preprocessing

#### Dropping Id, Tags and CreationDate, which are not required for processing
#### Converting the Result label to numeric

In [3]:
# Columns removed from both splits, and the string-label -> integer-code mapping
# applied to the 'Y' column.
unused_columns = ['Id', 'Tags', 'CreationDate']
label_codes = {'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2}

training_data = (
    training_data
    .drop(columns=unused_columns)
    .assign(Y=lambda frame: frame['Y'].map(label_codes))
)

validation_data = (
    validation_data
    .drop(columns=unused_columns)
    .assign(Y=lambda frame: frame['Y'].map(label_codes))
)

In [4]:
# Merge title and body into a single 'text' column (separated by one space),
# then discard the two source columns.
training_data = (
    training_data
    .assign(text=lambda frame: frame['Title'] + ' ' + frame['Body'])
    .drop(columns=['Title', 'Body'])
)

validation_data = (
    validation_data
    .assign(text=lambda frame: frame['Title'] + ' ' + frame['Body'])
    .drop(columns=['Title', 'Body'])
)

In [5]:
def clean_text(text):
    """Turn one raw (possibly HTML) string into a cleaned, lowercase token string.

    Steps, in order: extract the text from the HTML markup, lowercase it,
    replace every character matched by REPLACE_BY_SPACE_RE with a space,
    delete every character matched by BAD_SYMBOLS_RE, and finally drop each
    whitespace-separated token that appears in STOPWORDS.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    plain = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [6]:
# Characters treated as token separators: each match is replaced by a space.
# Written as a raw string because '\[', '\]' and '\|' are not valid Python
# string escapes and raise an invalid-escape warning in a plain literal; the
# compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character other than digits, lowercase letters, space, '#', '+' and '_'
# is deleted.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# Stored as a frozenset so the per-token membership test in clean_text is a
# hash lookup rather than a linear scan over the stopword list.
STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Clean the combined text column of both splits.
training_data['text'] = training_data['text'].apply(clean_text)
validation_data['text'] = validation_data['text'].apply(clean_text)

In [9]:
# Persist both cleaned splits as CSV files without the index column.
for frame, destination in (
    (training_data, "./data/cleaned/training_data_cleaned.csv"),
    (validation_data, "./data/cleaned/validation_data_cleaned.csv"),
):
    frame.to_csv(destination, index=False)