In [None]:
from google.colab import drive

# Mount Google Drive at a fixed location, forcing a remount on every run.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Switch the working directory to the project folder on the mounted Drive so
# that the relative paths used by the following cells resolve against it.
PROJECT_DIR = '/content/drive/My Drive/Project-CS410'
os.chdir(PROJECT_DIR)

# Get Corpus

In [None]:
from bs4 import BeautifulSoup as bs
import numpy as np
import re

In [None]:
# Regular expression for checking whether "gore" and/or "bush" occur in a text
# as standalone words (the text is lower-cased before matching).
RE = re.compile(r'([^a-z]|^)gore([^a-z]|$)|([^a-z]|^)bush([^a-z]|$)')
# When True, only paragraphs that match RE are kept for each article.
keyword_filter = False

# May 2000 to October 2000
dirs = ['2000/05','2000/06','2000/07','2000/08','2000/09','2000/10']

# July 2000 to December 2001
# dirs = ['2000/07','2000/08','2000/09','2000/10','2000/11','2000/12','2001/01','2001/02','2001/03','2001/04','2001/05','2001/06','2001/07','2001/08','2001/09','2001/10','2001/11','2001/12']

storage_name = 'corpus_may2000-oct2000.txt'
# storage_name = 'corpus_july2000-dec2001.txt'


def _extract_document(xml_path):
    """Return the lower-cased paragraph text of one article as a single line.

    The file at ``xml_path`` is parsed with BeautifulSoup, the text of every
    ``<p>`` element is collected (restricted to paragraphs matching ``RE``
    when ``keyword_filter`` is on) and joined with single spaces.

    Returns an empty string when no paragraph text is left.
    """
    with open(xml_path, "r", encoding='utf-8') as file:
        content = file.read()
    bs_content = bs(content, "html")

    paragraphs = [par.getText().lower() for par in bs_content.find_all('p')]
    if keyword_filter:
        # Keep only the paragraphs that contain one of the keywords.
        paragraphs = [text for text in paragraphs if RE.search(text)]

    # Collapse every run of whitespace (spaces, tabs, newlines) into one space.
    # This keeps each document on one tab-free line, as required by the
    # "date<TAB>text" output format, and - unlike deleting "\n" outright -
    # never glues together the words on either side of a line break.
    return " ".join(" ".join(paragraphs).split())


# The context manager guarantees the corpus file is flushed and closed even
# if one of the input files fails to open or decode.
with open(storage_name, 'w', encoding='utf-8') as fout:
    for dir_ in dirs:
        year, month = dir_.split('/')
        month_dir = os.path.join(year, month)
        # os.listdir returns entries in arbitrary order; sort so the corpus is
        # written chronologically and identically on every run.
        for day in sorted(os.listdir(month_dir)):
            day_dir = os.path.join(month_dir, day)
            date = year + '-' + month + '-' + day
            for xml_file in sorted(os.listdir(day_dir)):
                doc = _extract_document(os.path.join(day_dir, xml_file))
                if not doc:
                    # Skip articles without any (matching) paragraph text.
                    continue
                fout.write(date + "\t" + doc + "\n")
            # Progress indicator: one line per processed day.
            print(date)

# Preprocess Corpus

In [None]:
import nltk
# Fetch the NLTK 'brown' and 'names' corpora into the local nltk_data directory.
# NOTE(review): presumably these are required by the `normalise` package used
# in the preprocessing cells below - confirm.
nltk.download('brown')
nltk.download('names')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [None]:
# Install the `normalise` package into the running kernel's environment.
# The version is not pinned; the recorded output below shows 0.1.8.
!pip install normalise
import numpy as np
import pandas as pd
import multiprocessing as mp
import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise

# Load the small English spaCy pipeline once; TextPreprocessor below uses this
# module-level `nlp` object to tokenise every text.
nlp = en_core_web_sm.load()

Collecting normalise
[?25l  Downloading https://files.pythonhosted.org/packages/28/2d/f06cf3d3714502dec10e19238a5da201b71ce198165beda9c1adaf5063da/normalise-0.1.8-py3-none-any.whl (15.7MB)
[K     |████████████████████████████████| 15.7MB 189kB/s 
Collecting roman
  Downloading https://files.pythonhosted.org/packages/c3/9e/47df0bf47ccd7e9bbbf0a539ac86e45ded37c34dba544a0a2e5d01ce5f88/roman-3.3-py2.py3-none-any.whl
Installing collected packages: roman, normalise
Successfully installed normalise-0.1.8 roman-3.3




In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans a pandas Series of texts.

    Every text is normalised with the ``normalise`` package, tokenised with
    the module-level spaCy pipeline ``nlp``, stripped of punctuation tokens
    and stop words, and finally reduced to a space-joined string of lemmas.
    """

    # Characters that make up a punctuation token (see _remove_punct).
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        variety - format of date (AmE - american type, BrE - british format) 
        user_abbrevs - dict of user abbreviations mappings (from normalise package)
        n_jobs - parallel jobs to run: -1 or lower uses every CPU core, 0 runs
                 in the current process, a positive value is capped at the
                 number of CPU cores
        """
        # Parameters are stored untouched, as scikit-learn estimators expect.
        # The shared default dict is never mutated: _normalize hands a copy
        # to the normalise package.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns self."""
        return self

    def transform(self, X, *_):
        """Preprocess every text in ``X`` and return the cleaned Series.

        X - pandas Series of raw text strings (it is copied, not modified).
        Extra positional arguments are accepted and ignored.

        The work is split into partitions that are processed in a
        multiprocessing pool, unless a single partition is enough, in which
        case everything runs in the current process.
        """
        X_copy = X.copy()

        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        # More partitions than rows would only produce empty chunks.
        partitions = max(1, min(partitions, len(X_copy)))
        if partitions == 1:
            # A single partition gains nothing from a worker pool; skip the
            # process start-up and pickling overhead.
            return X_copy.apply(self._preprocess_text)

        data_split = np.array_split(X_copy, partitions)
        # One worker per partition. The context manager releases the worker
        # processes even when a worker raises.
        with mp.Pool(partitions) as pool:
            data = pd.concat(pool.map(self._preprocess_part, data_split))

        return data

    def _preprocess_part(self, part):
        """Preprocess one partition (a Series chunk) inside a worker."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full cleaning pipeline on one text and return the lemmas."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Normalise ``text`` with the normalise package (best effort)."""
        # some issues in normalise package: fall back to the raw text when it
        # fails. Only ordinary errors are caught, so KeyboardInterrupt and
        # SystemExit still stop a long-running job.
        try:
            abbrevs = dict(self.user_abbrevs or {})
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=abbrevs, verbose=False))
        except Exception:
            return text

    def _remove_punct(self, doc):
        """Drop tokens that consist solely of punctuation characters.

        A character-wise test is used so that multi-character punctuation
        tokens such as "..." or "--" are removed too; a substring test
        against string.punctuation would let them through.
        """
        punct = self._PUNCT_CHARS
        return [t for t in doc if not all(ch in punct for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop tokens that spaCy flags as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])

In [None]:
# Load the raw corpus: a header-less, tab-separated file whose second column
# (label 1) holds the document text.
df = pd.read_csv('datasets/corpus_july2000-dec2001.txt', header=None, sep='\t')

# Clean the text column with the n_jobs=-1 setting (one partition per CPU core).
preprocessor = TextPreprocessor(n_jobs=-1)
df[1] = preprocessor.transform(df[1])

# Write the cleaned corpus back out in the same header-less, tab-separated layout.
df.to_csv('datasets/corpus_july2000-dec2001_cleaned.txt', sep="\t", index=False, header=False)