### Data Preprocessing

- Remove URLs

In [71]:
import re
import string
from tqdm import tqdm
from bs4 import BeautifulSoup

In [5]:
def remove_urls(text):
    """Return ``text`` with URL-like tokens deleted.

    A token is removed when it starts with ``http://`` or ``https://``,
    with ``www.``, or with one of the shortener prefixes ``bit.ly/``,
    ``t.co/`` or ``tinyurl.``; the match runs up to the next whitespace.
    Surrounding whitespace is left untouched.
    """
    return re.sub(
        r'https?://\S+|www\.\S+|bit\.ly/\S+|t\.co/\S+|tinyurl\.\S+',
        '',
        text,
    )

In [8]:
# Demo: the http:// link is removed; the spaces on either side of it remain.
url_text = "An example of a website http://yahoo.com is a decent url"
url_output = remove_urls(url_text)
print(url_output)

An example of a website  is a decent url


- Remove the numbers

In [11]:
def remove_numbers(text):
    """Return ``text`` with every run of ASCII digits (0-9) deleted.

    Digits embedded in a word are removed too, so ``"9pm"`` becomes ``"pm"``.
    """
    return re.sub(r'[0-9]+', '', text)

In [13]:
# Demo: every digit run is dropped, including the "9" inside "9pm".
num_text = "My phone number is 82391231231. Please call me after 9pm"
num_output = remove_numbers(num_text)
print(num_output)

My phone number is . Please call me after pm


- Remove mentions with @

In [20]:
def remove_mentions(text):
    """Return ``text`` with ``@handle`` mentions deleted.

    A handle is ``@`` followed by one or more ASCII letters, digits or
    underscores. Punctuation and whitespace around the handle are kept.
    """
    return re.sub(r'@[a-zA-Z0-9_]+', '', text)

In [21]:
# Demo: both @handles (underscores and digits included) are stripped;
# the comma and the surrounding spaces are left in place.
men_text = "Hey @Oscar_Piastri, you are a champ. @Lando_Norris_007 you can do better"
men_output = remove_mentions(men_text)
print(men_output)

Hey , you are a champ.  you can do better


- Remove html tags

In [26]:
def clean_html_tags(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed (and emits a warning), so results could
    differ between environments. ``html.parser`` ships with Python.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [28]:
# Demo: the <section> wrapper is stripped, leaving only the inner text.
html_text = "<section>Hello There</section>"
html_output = clean_html_tags(html_text)
print(html_output)

Hello There


- Handle emoticons (rewrite `:name:`-style tokens as plain words)

In [43]:
def handle_emoticons(text):
    """Rewrite ``:name:`` tokens as plain words and squeeze whitespace.

    A token is a pair of colons around one or more word characters or
    hyphens. It is replaced by its inner name, with underscores deleted and
    a space added on each side. Afterwards every run of whitespace in the
    whole string is collapsed to a single space.
    """
    def _unwrap(found):
        # Drop the colons, remove underscores, pad so neighbours stay separate.
        name = found.group(1).replace('_', '')
        return ' ' + name + ' '

    expanded = re.sub(r':([\w-]+):', _unwrap, text)
    return re.sub(r'\s+', ' ', expanded)

In [47]:
# Demo: this sample contains no `:name:` token, so the text comes back
# unchanged. Emoji characters and ASCII smileys such as ":-)" are not
# matched by the pattern in handle_emoticons.
emo_text = "😃💁 Hello There :-) :D"
emo_output = handle_emoticons(emo_text)
print(emo_output)

😃💁 Hello There :-) :D


- Handle short forms or acronyms

In [48]:
# Lowercase social-media shorthand -> expansion. Built once when the cell runs
# instead of on every call. 'lamo' is kept as a misspelling of 'lmao'.
_ACRONYM_MAP = {
    'afaik': 'as far as i know', 'ama': 'ask me anything',
    'api': 'application programming interface',
    'asmr': 'autonomous sensory meridian response', 'bc': 'because',
    'b2b': 'business-to-business', 'b2c': 'business-to-consumer',
    'b4': 'before', 'bae': 'before anyone else',
    'bff': 'best friends forever', 'brb': 'be right back',
    'btaim': 'be that as it may', 'bts': 'behind the scenes',
    'bwl': 'bursting with laughter', 'btw': 'by the way',
    'cc': 'carbon copy', 'cpc': 'cost per click',
    'csl': 'cannot stop laughing', 'dftba': 'do not forget to be awesome',
    'f2f': 'face to face', 'fb': 'facebook', 'fomo': 'fear of missing out',
    'ftfy': 'fixed that for you', 'ftw': 'for the win', 'gg': 'good game',
    'gr8': 'great', 'grwm': 'get ready with me',
    'goat': 'greatest of all time', 'gtg': 'got to go', 'gtr': 'got to run',
    'hbd': 'happy birthday', 'hmb': 'hit me back', 'hifw': 'how i feel when',
    'hmu': 'hit me up', 'hth': 'happy to help', 'idc': 'i do not care',
    'idk': 'i do not know', 'ifyp': 'i feel your pain', 'ig': 'instagram',
    'ikr': 'i know right', 'ily': 'i love you', 'im': 'instant message',
    'imho': 'in my humble opinion', 'imo': 'in my opinion',
    'imu': 'i miss you', 'irl': 'in real life', 'jk': 'just kidding',
    'l8': 'late', 'lamo': 'laughing my ass off',
    'lmao': 'laughing my ass off', 'lmk': 'let me know',
    'lms': 'like my status', 'lol': 'laughing out loud',
    'lolz': 'laughing out loud', 'mcm': 'man crush monday',
    'mfw': 'my face when', 'mm': 'music monday', 'msg': 'message',
    'mtfbwy': 'may the force be with you', 'nbd': 'no big deal',
    'nm': 'not much', 'nsfl': 'not safe for life',
    'nsfw': 'not safe for work', 'nvm': 'never mind',
    'oan': 'on another note', 'omg': 'oh my god', 'omw': 'on my way',
    'ootd': 'outfit of the day', 'op': 'original poster',
    'otp': 'one true pairing', 'potd': 'photo of the day',
    'ppc': 'pay per click', 'pm': 'private message', 'ppl': 'people',
    'pr': 'pagerank', 'psa': 'public service announcement',
    'qotd': 'quote of the day', 'qq': 'crying',
    'rofl': 'rolling on the floor laughing',
    'roflmao': 'rolling on the floor laughing my ass off',
    'roi': 'return on investment', 'rt': 'retweet', 'sc': 'snapchat',
    'sem': 'search engine marketing', 'sm': 'social media',
    'smh': 'shaking my head', 'sus': 'suspicious', 'tbh': 'to be honest',
    'tbt': 'throwback thursday', 'tfw': 'that feeling when',
    'tgif': 'thank goodness it is friday', 'thx': 'thanks',
    'til': 'today i learned', 'tldr': 'too long did not read',
    'tmi': 'too much information', 'tntl': 'trying not to laugh',
    'ttyl': 'talk to you later', 'ttyn': 'talk to you never',
    'tw': 'twitter', 'txt': 'text', 'wbu': 'what about you ?',
    'w/': 'with', 'wcw': 'woman crush wednesday', 'wfh': 'work from home',
    'wdymbt': 'what do you mean by that?', 'wom': 'word of mouth',
    'wotd': 'word of the day', 'wywh': 'wish you were here',
    'ygtr': 'you got that right', 'yolo': 'you only live once',
    'ysk': 'you should know', 'yt': 'youtube',
}

# (?<!\w) / (?!\w) are used instead of \b because \b needs a word character
# on one side: with \b the key 'w/' could never match "w/ friends" (there is
# no boundary between '/' and a space). Longest keys go first so a shorter
# key can never pre-empt a longer one that starts the same way.
_ACRONYM_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(key) for key in sorted(_ACRONYM_MAP, key=len, reverse=True))
    + r')(?!\w)'
)


def handle_acronyms(text):
    """Expand known chat/social-media acronyms found in ``text``.

    Matching is case-sensitive against the lowercase keys of the acronym
    table, and only whole tokens match (a key surrounded by non-word
    characters or the string edges). Everything else is returned unchanged.

    Args:
        text: input string.

    Returns:
        A new string with each matched acronym replaced by its expansion.
    """
    return _ACRONYM_PATTERN.sub(lambda found: _ACRONYM_MAP[found.group(0)], text)

In [53]:
# Demo: the lowercase acronyms bff, omg, mm, imo, lmk and omw are expanded;
# other words (including the misspelt "shoud") pass through untouched.
ac_text = "hey bff! omg its mm. imo we shoud hit it. lmk and i'll be omw"
ac_output = handle_acronyms(ac_text)
print(ac_output)

hey best friends forever! oh my god its music monday. in my opinion we shoud hit it. let me know and i'll be on my way


- Handle Contractions

In [54]:
# Contraction -> expansion. Built once when the cell runs instead of on every
# call. Keys are case-sensitive; only the "I..." forms have both cases.
_CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not",
    "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "mayn't": "may not", "might've": "might have",
    "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have",
    "there's": "there is", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are",
    "they've": "they have", "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'll": "what will",
    "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did",
    "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have",
    "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are",
    "you've": "you have",
}

# Two deliberate choices in the pattern:
#  * Keys are tried longest-first. An apostrophe is a non-word character, so
#    in dict order "can't" would match the front of "can't've" and leave
#    "cannot've"; the compound forms would never be reached.
#  * (?<!\w) / (?!\w) replace \b so that "'cause", which starts with an
#    apostrophe, can match at the start of a token (\b would demand a word
#    character immediately before the apostrophe).
_CONTRACTION_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(key) for key in sorted(_CONTRACTION_MAP, key=len, reverse=True))
    + r')(?!\w)'
)


def handle_contractions(text):
    """Expand English contractions found in ``text``.

    Matching is case-sensitive against the keys of the contraction table and
    uses the straight apostrophe (') only. Compound forms such as
    "can't've" are expanded as a unit.

    Args:
        text: input string.

    Returns:
        A new string with each matched contraction replaced by its expansion.
    """
    return _CONTRACTION_PATTERN.sub(
        lambda found: _CONTRACTION_MAP[found.group(0)], text
    )

In [57]:
# Demo: it's, let's, y'all and when's are each expanded in place.
ct_text = "hey girl! it's music monday. let's gooo. what about y'all? any idea when's it"
ct_output = handle_contractions(ct_text)
print(ct_output)

hey girl! it is music monday. let us gooo. what about you all? any idea when is it


- Remove punctuation

In [61]:
def remove_punctuations(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Only the ASCII punctuation set is affected; letters, digits, whitespace
    and non-ASCII characters are kept as they are.
    """
    banned = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in banned)

In [65]:
# Demo: the full stop and the question mark are dropped.
rp_text = "Hello. How are you?"
rp_output = remove_punctuations(rp_text)
print(rp_output)

Hello How are you


- Remove hashtag symbols (`#`), keeping the tag text

In [67]:
def handle_hashtags(text):
    """Return ``text`` with every ``#`` character deleted.

    Only the symbol is removed; the word that followed it stays in the text.
    """
    return text.replace('#', '')

In [68]:
# Demo: only the '#' characters are removed; the tag words stay in the text.
hash_text = "Traveling World! #travel #japan #ootd"
hash_output = handle_hashtags(hash_text)
print(hash_output)

Traveling World! travel japan ootd


- Aggregate function that applies all of the cleaning steps above

In [69]:
# Holds the result of the most recent clean_tweet() call. The name is kept at
# module level because code outside this cell may read it; prefer using the
# function's return value.
cleaned_tweets = []

# Leading "rt " retweet marker; compiled once rather than once per tweet.
_RT_PREFIX = re.compile(r'^\s*rt\s+', re.IGNORECASE)


def _clean_one(text):
    """Run the full cleaning pipeline on a single tweet string."""
    # Undo escaped apostrophes and strip a b'...' / b"..." wrapper, as left
    # behind when a bytes object was converted with str().
    text = text.replace("\\'", "'")
    if text.startswith(("b'", 'b"')):
        text = text[2:-1]
    text = text.replace('\n', ' ').replace('\t', ' ').lower()
    text = _RT_PREFIX.sub('', text)

    # Order matters:
    #  * URLs and mentions go first so their contents are never mistaken for
    #    acronyms or contractions.
    #  * Digits are removed only AFTER acronym expansion; stripping them
    #    earlier turns "b4"/"gr8"/"b2b" into "b"/"gr"/"bb" (never expanded)
    #    and "9pm" into "pm" (wrongly expanded to "private message").
    #  * handle_emoticons also collapses whitespace, so it runs after the
    #    steps that leave gaps behind.
    pipeline = (
        remove_urls,
        remove_mentions,
        clean_html_tags,
        handle_hashtags,
        handle_acronyms,
        handle_contractions,
        remove_numbers,
        handle_emoticons,
        remove_punctuations,
    )
    for step in pipeline:
        text = step(text)
    return text.strip()


def clean_tweet(texts):
    """Clean every tweet in ``texts`` and return the cleaned strings.

    Each call builds a fresh list, so calling the function more than once
    (e.g. for separate train and test sets) no longer returns rows left over
    from an earlier call.

    Args:
        texts: iterable of tweet strings.

    Returns:
        A new list with one cleaned string per input tweet, in input order.
        The module-level ``cleaned_tweets`` is rebound to this same list.
    """
    global cleaned_tweets
    cleaned_tweets = [_clean_one(text) for text in tqdm(texts)]
    return cleaned_tweets

In [73]:
# End-to-end demo: one raw tweet containing an HTML wrapper, a mention,
# acronyms, a contraction, a URL, a phone number and hashtags.
tweets = [
    "<section>hey bff @harry_11 ! omg its mm. imo we shoud hit it. what about y'all? book tickets at http://lalaland.com. call me at 8181818292 #travel #japan #ootd</section>"
]
cleaned_data = clean_tweet(tweets)
print(cleaned_data[0])

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 644.29it/s]

hey best friends forever  oh my god its music monday in my opinion we shoud hit it what about you all book tickets at call me at travel japan outfit of the day



