In [71]:
from pathlib import Path
import emoji
import os
from bs4 import BeautifulSoup
import re

### Read the text files

In [2]:
# Read both corpora as UTF-8; errors="ignore" drops any bytes that fail to decode.
movie_path = Path("data/corrupt_movie_corpus.txt")
twitter_path = Path("data/corrupt_twitter_corpus.txt")

movie_txt = movie_path.read_text(encoding="utf-8", errors="ignore")
twitter_txt = twitter_path.read_text(encoding="utf-8", errors="ignore")

### Examine number of lines in each file

In [3]:
# Newline count per corpus, in (movie, twitter) order
tuple(corpus.count('\n') for corpus in (movie_txt, twitter_txt))

(304713, 16556)

In [4]:
# Preview the first 20 lines of the raw movie corpus
for line in movie_txt.splitlines()[:20]:
    print(line)

Colonel Durnford... <i> William Vereker. I hear you 've </b> been seeking @ Officers?
<i> Good ones, yes, Mr Vereker. Gentlemen @ who can ride $ and shoot
Your % $ orders, </span> Mr Vereker?
I'm to take the Sikali with the main column to $ </span> the river <b>
Lord Chelmsford @ seems to @ want me to stay back with my @ Basutos.
I think Chelmsford </b> @ wants a good man on the border Why he fears a flanking attack % and requires a steady Commander in reserve.
^^ Well I assure you, Sir, I have no ^^ </span> desire to create difficulties. 45
And I assure you, you do not <b> In fact I'd be obliged for your </b> best advice. <i> What have your scouts seen?
So far only their scouts. But we have had <b> @ reports of a small Impi farther % north, over there.
^^ Lighting COGHILL' 5 cigar: Our good Colonel Dumford scored % quite a coup with ^^ the Sikali Horse.
Um. <b> There are rumours that my Lord Chelmsford intends to @ make Durnford % Second in Command.
Well that's typical of Her Majesty'

In [31]:
# Preview the first 20 lines of the raw twitter corpus
for line in twitter_txt.splitlines()[:20]:
    print(line)

what's up dadyo % when </span> did you </span> get back on Twitter? Haha
like ^^ 2 weeks ago and % it's going as terribly as I remember, <b> but Deg is still hilarious so it's ok
^^ literally never % <i> about that account, love it.
$ </b> <i>
$ @ $
$ <b> %
Answer me this </span> fellow Apple peoples: how % many times in </b> the past year have you used the escape key?
about 50 times today. <i> ^^ Terminal vim user. <i>
Seems </b> @ the major complaints so far are from VIM users like @ yourself. Iâ€™m wondering how force quit is gonna work.
cmd+opt+esc is good <i> but still </span> available via </span> menubar
$ </b> ^^
<i> </span> %
</b> % <b>
There $ was a greasy @ kid at highline who was basically miles dipped in <i> a bucket of ranch
@ @ I'm % disgusted
he flashed us then we <b> @ scored so he sadly put his shirt back on </b> ðŸ˜‚#fuckhighline
what @ a piece of @ <i> shit
% ^^ $
^^ <i> $
</span> </b> $


### Text Preprocessing

From the two text files above, I can see the following issues:
* HTML tags such as i, b, span
* Special characters such as @, $, %, ^
* Random punctuation
* Emojis

Here are some techniques I'd like to apply to clean the text:
* Strip HTML tags
* Remove any URLs and hashtags
* Remove emojis and special symbols
* Convert to lower case
* Remove unnecessary spaces

In [None]:
# Decorator that lets a text-cleaning step fail softly instead of aborting the run
def handle_exception(func):
    """Wrap ``func`` so that any exception falls back to the unmodified input.

    The wrapped function is called with the same arguments. If it raises an
    ``Exception``, a one-line message naming the function is printed and the
    first argument (``text``) is returned unchanged.

    Args:
        func: A callable whose first parameter is the text to process.

    Returns:
        A wrapper with the same name and docstring as ``func``.
    """
    import functools  # local import keeps this cell runnable on its own

    # functools.wraps keeps func.__name__ / __doc__ on the wrapper, so help()
    # and tracebacks show the real function rather than "wrapper".
    @functools.wraps(func)
    def wrapper(text, *args, **kwargs):
        try:
            return func(text, *args, **kwargs)
        except Exception as e:
            # Best-effort by design: report the failure and pass the input through.
            print(f"Exception in {func.__name__}: {e}")
            return text
    return wrapper

@handle_exception
def strip_html(text: str) -> str:
    """Delete tag-like spans from ``text``.

    A span is a ``<``, one or more characters other than ``>``, then a ``>``.
    Spans are removed with no replacement, so whitespace around them stays.
    """
    return re.sub(r'<[^>]+>', "", text)

@handle_exception
def remove_emojis(text: str) -> str:
    """Return ``text`` after ``emoji.replace_emoji`` substitutes "" for each emoji."""
    return emoji.replace_emoji(text, replace="")

@handle_exception
def remove_urls(text: str) -> str:
    """Delete URLs from ``text``.

    A URL here is ``http://`` or ``https://`` followed by non-whitespace, or
    ``www.`` followed by non-whitespace. Matches are removed with no replacement.
    """
    url_pattern = r"(https?://\S+|www\.\S+)"
    return re.sub(url_pattern, "", text)

@handle_exception
def remove_special_chars(text: str) -> str:
    """Reduce ``text`` to lower-case ASCII letters separated by single spaces.

    Every character that is not ``A-Z``, ``a-z`` or whitespace is deleted. That
    covers symbols such as ``% $ @ ^``, all punctuation (apostrophes included),
    digits, and any non-ASCII character. Deleted characters are not replaced by
    a space, so ``"it's"`` becomes ``"its"``.

    Each run of whitespace is then collapsed to one space, and the result is
    stripped of leading/trailing whitespace and lower-cased.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; may be empty.
    """
    # One deletion pass is enough: symbols and repeated punctuation all fall
    # outside A-Za-z/whitespace, so they need no dedicated patterns.
    letters_and_spaces = re.sub(r"[^A-Za-z\s]", "", text)
    single_spaced = re.sub(r"\s+", " ", letters_and_spaces)
    return single_spaced.strip().lower()

@handle_exception
def clean_text(text: str) -> str:
    """Clean a corpus one line at a time and return the lines re-joined by newlines.

    Each line from ``text.splitlines()`` goes through ``strip_html``,
    ``remove_emojis``, ``remove_urls`` and ``remove_special_chars``, in that
    order. Lines that end up empty are kept, so the output has one line per
    input line.
    """
    pipeline = (strip_html, remove_emojis, remove_urls, remove_special_chars)

    def _clean_line(raw_line):
        # Feed the result of each step into the next one.
        result = raw_line
        for step in pipeline:
            result = step(result)
        return result

    return "\n".join([_clean_line(raw_line) for raw_line in text.splitlines()])

In [67]:
# Run the cleaning pipeline over each corpus (movie first, then twitter)
cleaned_movie_text, cleaned_twitter_text = [
    clean_text(corpus) for corpus in (movie_txt, twitter_txt)
]

In [68]:
# Preview the first 5 lines of the cleaned twitter corpus
for line in cleaned_twitter_text.splitlines()[:5]:
    print(line)

whats up dadyo when did you get back on twitter haha
like weeks ago and its going as terribly as i remember but deg is still hilarious so its ok
literally never about that account love it




In [69]:
# Preview the first 5 lines of the cleaned movie corpus
for line in cleaned_movie_text.splitlines()[:5]:
    print(line)

colonel durnford william vereker i hear you ve been seeking officers
good ones yes mr vereker gentlemen who can ride and shoot
your orders mr vereker
im to take the sikali with the main column to the river
lord chelmsford seems to want me to stay back with my basutos


### Write the cleaned data to text file

In [73]:
# Create the output directory if it does not exist yet, then write each corpus.
os.makedirs("output", exist_ok=True)

# The encoding is pinned to UTF-8 to match how the inputs were read. Without
# it, open() uses the platform's locale encoding, which can fail or mis-encode
# any non-ASCII text that reaches this point.
with open('output/cleaned_movie_corpus.txt', 'w', encoding="utf-8") as file:
    file.write(cleaned_movie_text)

with open('output/cleaned_twitter_corpus.txt', 'w', encoding="utf-8") as file:
    file.write(cleaned_twitter_text)