In [None]:
import re
import string
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Matches http/https/ftp URLs: scheme, a dotted host name, then an optional
# path/query tail that must not end in '.', ',' or ':'.
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
# Matches e-mail addresses. The pattern only lists lowercase letters, so it is
# meant to be applied to text that has already been lowercased.
# It has to be a raw string: in a normal string literal Python decodes
# \x5b / \x5d into literal '[' and ']' and turns '\\[' into '\[' before the
# regex engine sees them, which closes the quoted-local-part character class
# early and drops the backslash-escape (quoted-pair) alternative. With a raw
# string the \xhh escapes are interpreted by `re` itself, as intended.
re_email = re.compile(r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

## Read dataset

For demonstration purposes, we read the dataset that was produced by the same pre-processing shown here, leaving out its text_cleaned column so that it can be regenerated by the end of this notebook.

In [None]:
# Load only the class label and the raw text; the file is ';'-separated.
df = pd.read_csv(
    '/kaggle/input/20-newsgroup-preprocessed/20newsgroup_preprocessed.csv',
    sep=';',
    usecols=['target', 'text'],
)

In [None]:
# Preview the first rows to confirm that only 'target' and 'text' were loaded.
df.head()

### Original document

In [None]:
# Print one raw document (row label 3398) as the reference example before any cleaning.
print(df['text'][3398])

### Clean Header

We remove the header of each document so that the model does not learn, for example, that a common author name is a strong signal for a document class. This could happen if the training data contains many posts on one topic written by people who share the same name.

In [None]:
%%time
def clean_header(text):
    """Remove newsgroup header lines from a document.

    Every occurrence of a ``From:``, ``Subject:``, ``Archive-name:``
    (optionally prefixed, e.g. ``Alt-atheism-archive-name:``),
    ``Last-modified:`` or ``Version:`` field is deleted up to and including
    the end of its line. The patterns are not anchored to the start of a
    line, so a matching field anywhere in the text is removed, and a field on
    a final line without a trailing newline is left untouched.

    Args:
        text: The raw document as a string.

    Returns:
        The document with the matching header fields removed.
    """
    text = re.sub(r'(From:\s+[^\n]+\n)', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    # The optional group also swallows any run of letters, digits, hyphens and
    # whitespace directly in front of the field name.
    # [Aa] instead of [A|a]: inside a character class '|' is a literal pipe,
    # not an alternation, so the old class also accepted '|rchive-name:'.
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    return text


# Run the header stripper over every raw document and keep the result in a new column.
df['text_cleaned'] = df['text'].map(clean_header)

In [None]:
# Same sample document (row label 3398), now taken from the text_cleaned column.
print(df['text_cleaned'][3398])

### Clean text

Read more about this in:


- https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html
- https://nlp.stanford.edu/IR-book/html/htmledition/normalization-equivalence-classing-of-terms-1.html

In [None]:
%%time
def clean_text(text):
    """Normalize a document for downstream NLP processing.

    Steps, in order: lowercase the text, delete URLs and e-mail addresses
    (using the module-level ``re_url`` and ``re_email`` patterns), delete
    ASCII punctuation, replace digit runs with a space, collapse every
    whitespace run into a single space, and trim the result.

    Args:
        text: The document as a string.

    Returns:
        The normalized text without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(re_url, '', text)
    text = re.sub(re_email, '', text)
    # Punctuation is deleted (not replaced by a space), so "don't" becomes "dont".
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'(\d+)', ' ', text)
    text = re.sub(r'(\s+)', ' ', text)

    # Trim last: the substitutions above can leave a space at either end
    # (e.g. when the text starts or ends with digits), which an early strip()
    # would not catch.
    return text.strip()

# Normalize every header-free document in place (the column is overwritten).
df['text_cleaned'] = df['text_cleaned'].map(clean_text)

In [None]:
# Sample document (row label 3398) from text_cleaned, to inspect the effect of clean_text.
print(df['text_cleaned'][3398])

### Remove stop words

Read more about this in:

- https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html

In [None]:
%%time

stop_words = stopwords.words('english')
# Membership is tested once per token of every document. A frozenset gives
# constant-time lookups, whereas the list above would be scanned linearly
# each time. `stop_words` itself is kept as the original list.
stop_words_lookup = frozenset(stop_words)

# Tokenize on whitespace, drop the stop words, and join the remaining tokens
# back into a single space-separated string.
df['text_cleaned'] = df['text_cleaned'].str.split() \
    .apply(lambda x: ' '.join([word for word in x if word not in stop_words_lookup]))

In [None]:
# Sample document (row label 3398) from text_cleaned, to inspect the stop-word removal.
print(df['text_cleaned'][3398])

### Next steps

In this notebook, we show a simple approach for cleaning text before machine-learning NLP tasks. This is only a first step towards normalizing your text documents; we deliberately stopped here so that you remain free to experiment with further cleaning strategies of your own.

If you need some inspiration, read more about stemming, lemmatization, tf-idf, and word embeddings.

In [None]:
# Final preview of the first rows, including the text_cleaned column.
df.head()