In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Mental-Health-Twitter.csv')
df2 = df[['post_text', 'label']]

## Part 1 - Data Cleaning

In [3]:
# clearing html items like &apos; ,&amp; ,&lt; etc
import html

df2.loc[:, 'post_text'] = df2['post_text'].apply(lambda x: html.unescape(x))

In [5]:
# decoding to utf-8
df2.loc[:, 'post_text'] = df2['post_text'].apply(lambda x: x.encode('utf-8').decode('utf-8'))

In [6]:
# removing hyperlinks, hashtags or styles like retweet text
import preprocessor as p

def clean_tweet(text):
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
    text = p.clean(text)
    return text

df2.loc[:, 'post_text'] = df2['post_text'].apply(clean_tweet)

In [7]:
# replace  apostrophes with the standard lexicons
import contractions
df2.loc[:, 'post_text'] = df2['post_text'].apply(lambda x: contractions.fix(x))

In [8]:
# split attached words for eg - ForTheWin becomes For The Win so it makes sense
import re
from wordsegment import load, segment
load()  # load word frequency data once

def smart_word_split(text):
    # Remove hashtags but keep content
    text = re.sub(r'#([A-Za-z0-9_]+)', r'\1', text)

    # Split CamelCase or PascalCase words
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # Split letters and numbers apart
    text = re.sub(r'(?<=[A-Za-z])(?=[0-9])', ' ', text)
    text = re.sub(r'(?<=[0-9])(?=[A-Za-z])', ' ', text)

    # For very long lowercase words (likely attached slang or glued text)
    words = []
    for token in text.split():
        if token.islower() and len(token) > 12:  # heuristic threshold
            segmented = segment(token)
            if len(segmented) > 1:
                token = ' '.join(segmented)
        words.append(token)

    return ' '.join(words)

df2.loc[:, 'post_text'] = df2['post_text'].apply(smart_word_split)

In [9]:
#  Convert text to lower case to avoid case sensitivity related issues
df2.loc[:, 'post_text'] = df2['post_text'].str.lower()

In [10]:
# Replacing slangs with meanings using a custom slang dictionary containing about 227 slang words/phrases
with open("slang.txt", "r") as file:
    slang = file.read()

# Separating each line present in the file
slang_lines = slang.split('\n')

slang_word = []
meaning = []

# Store the slang words and meanings in different lists
for line in slang_lines:
    if line.strip():  # Skip empty lines
        temp = line.split(":", 1)  # Split only on first ':' to handle meanings with ':'
        if len(temp) == 2:
            slang_word.append(temp[0].strip())
            meaning.append(temp[1].strip())

# Function to replace slang in a single text
def replace_slang(text):
    if not isinstance(text, str):
        return text
    tweet_tokens = text.split()
    for i, word in enumerate(tweet_tokens):
        # Remove common punctuation attached to words
        clean_word = word.strip('.,!?;:"()[]{}').lower()
        if clean_word in slang_word:
            idx = slang_word.index(clean_word)
            # Preserve original capitalization and punctuation
            replacement = meaning[idx]
            if word[0].isupper():
                replacement = replacement.capitalize()
            # Reattach punctuation if it was at the end
            if word[-1] in '.,!?;:"':
                replacement += word[-1]
            tweet_tokens[i] = replacement
    return " ".join(tweet_tokens)


In [11]:
df2.loc[:, 'post_text'] = df2['post_text'].apply(replace_slang)

In [12]:
# Standardizing and Spell Check
import itertools
from autocorrect import Speller

spell = Speller(lang='en')

# Function to reduce repeated characters (no more than 2 in a row)
def reduce_repeated_chars(text):
    if not isinstance(text, str):
        return text
    # Group consecutive identical characters and keep only up to 2
    result = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    return result

# Function to apply both: reduce repeats + spell check
def standardize_and_spellcheck(text):
    if not isinstance(text, str) or not text.strip():
        return text
    # Step 1: Reduce repeated characters
    text = reduce_repeated_chars(text)
    # Step 2: Spell check (word by word to preserve structure)
    words = text.split()
    corrected_words = [spell(word) for word in words]
    return ' '.join(corrected_words)

df2.loc[:, 'post_text'] = df2['post_text'].apply(standardize_and_spellcheck)


In [14]:
# stopward removal using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)

stopwords_eng = set(stopwords.words('english')) 

def remove_stopwords(text):
    if not isinstance(text, str) or not text.strip():
        return text
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords_eng]
    return ' '.join(filtered_words)

df2.loc[:, 'post_text'] = df2['post_text'].apply(remove_stopwords)

In [15]:
# Remove Punctuations using regex 
df2.loc[:, 'post_text'] = df['post_text'].str.replace(r'[^\w\s]', '', regex=True)

## Ensuring no duplicates were made by data cleaning process

In [16]:
unique_texts = df2['post_text'].nunique()
print(f"Unique post_text entries: {unique_texts}")

Unique post_text entries: 19483


In [17]:
df2 = df2.drop_duplicates(subset='post_text').reset_index(drop=True)
df2.shape

(19483, 2)

In [40]:
df2.to_csv('cleaned.csv', index=False);