<a href="https://colab.research.google.com/github/praj2408/Omdena-LLM-Mental-Health-Project/blob/main/team1_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [None]:
# Install the packages behind the `emoji`, `emot` and `langdetect` imports used below.
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
!pip install emoji
!pip install emot
!pip install langdetect

In [None]:
# Importing libraries
import concurrent.futures
import json
import re
import threading

import emoji
import inflect
import nltk
import pandas as pd
import spacy
from emot.emo_unicode import EMOTICONS_EMO
from langdetect import detect_langs
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from textblob import TextBlob

In [None]:
# Download NLTK data
# Fetch the NLTK resources into the local nltk_data directory.
# NOTE(review): in the code visible in this notebook only the 'stopwords' corpus is
# read directly (by remove_stopwords); the other resources may be required by
# libraries used here or by other notebooks — confirm before pruning the list.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('vader_lexicon', quiet=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

### Useful functions for text preprocessing

In [None]:
def return_english_text(text):
    """
    Returns the input text only if langdetect identifies it as English with a probability higher than 80%.

    Args:
        text (str): The input text to detect its language.

    Returns:
        str: The original text if it is identified as English with a probability higher than 80%,
        otherwise an empty string. An empty string is also returned when detection raises an error.
    """
    try:
        for candidate in detect_langs(text):
            # Check if the detected language is English with a probability higher than 80%
            if candidate.lang == 'en' and candidate.prob > 0.8:
                return text
    except Exception:
        # Best effort: any detection failure means "not confidently English".
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running batch can still be interrupted.
        pass

    return ''


def correct_sentence_spelling(text):
    """
    Corrects the spelling of a sentence using TextBlob.

    Args:
        text (str): The sentence to be corrected.

    Returns:
        str: The corrected sentence.
    """
    corrected = TextBlob(text).correct()
    # correct() yields a TextBlob; convert it so callers get the plain string
    # promised above and can safely use str methods and regexes on the result.
    return str(corrected)

def replace_emoji(text):
    """
    Swaps every emoji found in the text for its textual description.

    Args:
        text (str): Text that may contain emojis.

    Returns:
        str: The same text with each emoji replaced by the description produced by emoji.demojize.
    """
    return emoji.demojize(text)

def convert_emoticons(text):
    """
    Converts emoticons in the given text to their textual representations.

    All emoticons in EMOTICONS_EMO are matched in a single pass, longest first, so
    an emoticon that contains a shorter one (e.g. ':))' vs ':)') is replaced as a
    whole instead of being partially consumed by the shorter entry.

    Args:
        text (str): The text containing emoticons to be converted.

    Returns:
        str: The text with emoticons replaced by their descriptions, where commas
        are dropped and words are joined with underscores.
    """
    if not EMOTICONS_EMO:
        # An empty alternation would match the empty string everywhere.
        return text

    # Regex alternation takes the first alternative that matches, so order the
    # emoticons by decreasing length to get longest-match behaviour.
    ordered = sorted(EMOTICONS_EMO, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in ordered))

    def _describe(match):
        # A function replacement is inserted literally, so backslashes in a
        # description are never interpreted as regex template escapes.
        description = EMOTICONS_EMO[match.group(0)]
        return "_".join(description.replace(",", "").split())

    return pattern.sub(_describe, text)

def convert_number(text):
    """
    Spells out every whitespace-separated token that consists only of digits.

    Args:
        text (str): The input text.

    Returns:
        str: The tokens of the text joined by single spaces, with all-digit tokens
        replaced by their word form from inflect.
    """
    engine = inflect.engine()
    converted = [
        engine.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

def remove_url(text):
    """
    Blanks out URLs in the given text.

    Args:
        text (str): The input text.

    Returns:
        str: The text where every run of non-whitespace characters starting at 'http'
        has been replaced by a single space.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub(' ', text)

def remove_html_tag(text):
    """
    Blanks out HTML tags in the given text.

    Args:
        text (str): The input text.

    Returns:
        str: The text where each '<...>' span has been replaced by a single space.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub(' ', text)

def remove_punctuation(text):
    """
    Blanks out punctuation marks in the given text.

    Args:
        text (str): The input text.

    Returns:
        str: The text where every character that is neither a word character
        nor whitespace has been replaced by a single space.
    """
    non_word_pattern = re.compile(r'[^\w\s]')
    return non_word_pattern.sub(' ', text)

def remove_special_characters(text):
    """
    Removes stand-alone single ASCII letters that are surrounded by whitespace.

    Despite its name, this function does not touch symbols: it drops isolated
    one-letter tokens (such as the leftover 's' after punctuation removal).
    A run of consecutive single letters is removed as a whole.

    Args:
        text (str): The input text.

    Returns:
        str: The text with each run of whitespace-delimited single letters,
        together with its surrounding whitespace, replaced by a single space.
        Single letters at the very start or end of the text are left in place.
    """
    # Matching the whole run at once is required: with a one-letter pattern the
    # trailing whitespace of a match is consumed, so the next single letter has no
    # leading whitespace left to match and would survive ("x a b y" -> "x b y").
    return re.sub(r"(?:\s+[a-zA-Z])+\s+", " ", text)

def remove_stopwords(text):
    """
    Drops English stopwords from the given text.

    Args:
        text (str): The input text.

    Returns:
        str: The whitespace-separated tokens that are not in NLTK's English
        stopword list, joined by single spaces. The comparison is case-sensitive.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept_tokens = filter(lambda token: token not in english_stopwords, text.split())
    return " ".join(kept_tokens)

def remove_extra_whitespace(text):
    """
    Collapses whitespace runs in the given text.

    Args:
        text (str): The input text.

    Returns:
        str: The text where every run of whitespace characters has been replaced
        by a single space (leading and trailing runs become one space each).
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Per-thread holder for the loaded spaCy pipeline. Loading the model is expensive,
# so it is done once per thread instead of once per document; keeping one pipeline
# per thread avoids sharing a pipeline object between the worker threads.
_SPACY_THREAD_CACHE = threading.local()


def _get_spacy_pipeline():
    """Return the current thread's 'en_core_web_sm' pipeline, loading it on first use."""
    nlp = getattr(_SPACY_THREAD_CACHE, 'nlp', None)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        _SPACY_THREAD_CACHE.nlp = nlp
    return nlp


def text_stemming_lemmatization(text, result='clean_tokens'):
    """
    Tokenizes the text with spaCy and returns the tokens at the requested processing stage.

    Args:
        text (str): The input text without stopwords to be further preprocessed.
        result (str, optional): The stage to return. Possible values:
            - 'tokens': spaCy token texts.
            - 'stemmed_tokens': Porter stems of the tokens.
            - 'lemmatized_tokens': spaCy lemmas of the re-joined tokens.
            - 'clean_tokens': lemmas that are non-blank and longer than one character.
            Default is 'clean_tokens'.

    Returns:
        list: The list of strings for the requested stage. Note that the stems are
        only returned for 'stemmed_tokens'; the lemmatized and clean stages are
        derived from the original tokens, not from the stems.

    Raises:
        ValueError: If `result` is not one of the supported values.
    """
    valid_results = ('tokens', 'stemmed_tokens', 'lemmatized_tokens', 'clean_tokens')
    if result not in valid_results:
        # Fail loudly instead of silently returning None for a mistyped stage name.
        raise ValueError(f"result must be one of {valid_results}, got {result!r}")

    nlp = _get_spacy_pipeline()
    tokens = [token.text for token in nlp(text)]
    if result == 'tokens':
        return tokens

    if result == 'stemmed_tokens':
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    # Lemmatize the space-joined tokens (a second pass over the pipeline).
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(tokens))]
    if result == 'lemmatized_tokens':
        return lemmatized_tokens

    # 'clean_tokens': drop blank lemmas and single-character leftovers.
    return [token for token in lemmatized_tokens if token.strip() != '' and len(token) > 1]

### Pipeline to process collected data

In [None]:
def parallel_processing(iterable, processing_function, *processing_args, workers=3):
    """
    Apply a function to every element of an iterable using a thread pool.

    Args:
        iterable (iterable): The elements to process.
        processing_function (function): Called as processing_function(element, *processing_args).
        processing_args: Extra positional arguments forwarded to every call.
        workers (int, optional): Maximum number of worker threads. Default is 3.

    Returns:
        list: The results, in the same order as the input elements.
    """
    def _apply(element):
        return processing_function(element, *processing_args)

    pool = concurrent.futures.ThreadPoolExecutor(max_workers=workers)
    with pool:
        # Materialise the lazy map inside the with-block, before the pool shuts down.
        results = list(pool.map(_apply, iterable))
    return results

def mask_author(data):
    """
    Replaces every 'author' value in a nested dict/list structure with asterisks.

    The structure is modified in place; values that are neither dicts nor lists
    are left untouched.

    Args:
        data (str, dict, list): The text data or nested structure containing text data.

    Returns:
        str, dict, list: The same object that was passed in, with each value stored
        under an 'author' key replaced by '*****'.
    """
    if isinstance(data, dict):
        for field in data:
            if field != 'author':
                # Descend into the value; nested dicts/lists may hold more authors.
                mask_author(data[field])
                continue
            data[field] = '*****'
        return data

    if isinstance(data, list):
        for element in data:
            mask_author(element)

    return data

def text_preprocessing(text):
    """
    Preprocesses the input text by applying a sequence of text cleaning steps, in this order:
      - convert to string
      - check for english language (non-English text yields an empty string)
      - remove urls
      - apply spellchecking
      - convert emojis and emoticons to text
      - convert numbers to text
      - remove newline characters
      - remove html tags
      - convert to lowercase
      - remove punctuation
      - remove special characters
      - remove stopwords
      - normalize (tokenize & lemmatize)
      - remove extra whitespace

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text, or an empty string if the text is not
        identified as English.
    """
    # Convert to string
    text = str(text)

    # Check for English language
    if return_english_text(text) == '':
        return ''

    # Remove URLs first: spell-checking or emoticon conversion (':/' inside 'http://')
    # could otherwise rewrite parts of a URL so that it is no longer recognised
    # and its fragments leak into the output.
    text = remove_url(text)

    # Apply spellchecking; coerce to str so every later step works on a plain
    # string even if the spell-checker hands back a string-like object.
    text = str(correct_sentence_spelling(text))

    # Convert emojis and emoticons to text
    text = replace_emoji(text)
    text = convert_emoticons(text)

    # Convert numbers to text
    text = convert_number(text)

    # Remove newline characters
    text = text.replace('\n', ' ')

    # Remove HTML tags (after emoticon conversion, so emoticons containing '<' or '>'
    # have already been turned into words and cannot be mistaken for tag delimiters)
    text = remove_html_tag(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = remove_punctuation(text)

    # Remove special characters
    text = remove_special_characters(text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Normalize (tokenize & lemmatize, dropping blank and one-character tokens)
    text = ' '.join(text_stemming_lemmatization(text))

    # Remove extra whitespace
    text = remove_extra_whitespace(text)

    return text

def load_format_mask(json_file):
    """
    Loads a JSON file of posts, masks author names and builds three dataframes.

    Args:
        json_file (str): Path to a JSON file containing a list of post records.
            Each record is expected to provide 'post_id', 'title', 'author',
            'timestamp', 'body' and a 'comments' list; comments may carry a
            'replies' list.

    Returns:
        tuple: (df_posts, df_comments, df_replies) where
            - df_posts has one row per unique 'post_id' plus a 'text' column
              (title and body joined by a space),
            - df_comments has one row per comment, with the parent post's fields
              added under a 'comment_' prefix,
            - df_replies has one row per reply, with the parent 'comment_id' added.
    """
    # Load data from JSON file. The encoding is explicit so that non-ASCII content
    # is decoded the same way on every platform instead of using the locale default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    masked_data = mask_author(data)

    # Create dataframes for posts, comments and replies
    df_posts = pd.json_normalize(masked_data)

    # Drop duplicate posts
    df_posts.drop_duplicates(subset = 'post_id', inplace = True)

    # Add a text column as a concatenation of title and post body columns
    df_posts['text'] = df_posts['title'].astype('str') + ' ' + df_posts['body'].astype('str')

    # Normalize comments.
    # NOTE(review): comments and replies are built from the full record list, not
    # from the de-duplicated posts, so duplicated posts contribute their comments
    # and replies more than once — confirm whether that is intended.
    df_comments = pd.json_normalize(masked_data, record_path=['comments'], meta=['post_id', 'title', 'author', 'timestamp'], meta_prefix='comment_')

    # Normalize replies within each comment
    reply_data = []
    for post in masked_data:
        if 'comments' in post:
            for comment in post['comments']:
                if 'replies' in comment:
                    for reply_item in comment['replies']:
                        # Tag each reply with its parent comment so it can be joined back later.
                        reply_item['comment_id'] = comment['comment_id']
                        reply_data.append(reply_item)

    df_replies = pd.DataFrame(reply_data)

    return df_posts, df_comments, df_replies

In [None]:
# Input JSON file passed to load_format_mask below; the path is relative, so the
# file must be present in the notebook's working directory.
json_file = 'reddit_data_anxiety_2.json'

In [None]:
# Load the JSON file, mask author names and split it into posts / comments / replies dataframes.
df_posts, df_comments, df_replies = load_format_mask(json_file)

In [None]:
# Preview the first rows to sanity-check the load (the author column should show only '*****').
df_posts.head()

Unnamed: 0,post_id,title,author,timestamp,body,score,downs,total_comments,comments,text
0,1b5fkwx,Set your intention,*****,2024-03-03 12:00:43,Happy Sunday /r/Anxiety!\n\nIt's everyone's fa...,2,0,3,"[{'comment_id': 'kt5ggty', 'author': '*****', ...",Set your intention Happy Sunday /r/Anxiety!\n\...
1,1ax3rwf,Monthly Check-In Thread,*****,2024-02-22 11:00:33,Hello everyone! Welcome to the r/Anxiety month...,3,0,13,"[{'comment_id': 'ks5gisj', 'author': '*****', ...",Monthly Check-In Thread Hello everyone! Welcom...
2,1b60jyq,Why do doctors say benzos are so bad but not s...,*****,2024-03-04 03:19:49,"So my question is, why do doctors say benzos a...",36,0,48,"[{'comment_id': 'kt90vw3', 'author': '*****', ...",Why do doctors say benzos are so bad but not s...
3,1b5quwr,8 hours of survival mode. Its hellish,*****,2024-03-03 20:18:44,I just started a job at a call center for pati...,68,0,27,"[{'comment_id': 'kt7q4xx', 'author': '*****', ...",8 hours of survival mode. Its hellish I just s...
4,1b5z6b3,My wife gets anxiety attacks and I don't know ...,*****,2024-03-04 02:12:38,My wife and I have been having marriage proble...,14,0,11,"[{'comment_id': 'kt8odn3', 'author': '*****', ...",My wife gets anxiety attacks and I don't know ...


In [None]:
# Run the full cleaning pipeline on every post's text using 3 worker threads.
# The result is a list aligned with df_posts rows; posts not detected as English
# come back as empty strings (see text_preprocessing).
preprocessed_text = parallel_processing(df_posts['text'], text_preprocessing, workers=3)

In [None]:
# Build a TF-IDF matrix from the cleaned posts, keeping at most the 5000 most
# frequent terms. ngram_range is left at its default, so features are unigrams only.
# Posts that were cleaned to an empty string stay in the matrix as all-zero rows.
tf_idf_vec = TfidfVectorizer(max_features=5000)
# fit_transform learns the vocabulary/IDF and vectorizes in a single pass.
tf_idf_data = tf_idf_vec.fit_transform(preprocessed_text)
print("the type of the TF-IDF matrix ",type(tf_idf_data))
print("the shape of our text TFIDF matrix ",tf_idf_data.get_shape())
print("the number of features (unigrams, capped by max_features) ", tf_idf_data.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse._csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (996, 5000)
the number of unique words including both unigrams and bigrams  5000


In [None]:
# # Get feature names
# feature_names = tf_idf_vec.get_feature_names_out()
# feature_names[:100]

In [None]:
# Attach the cleaned text to the posts dataframe (adds a column to df_posts in place)
# and preview it next to the raw 'text' column.
df_posts['preprocessed_text'] = preprocessed_text
df_posts.head()

Unnamed: 0,post_id,title,author,timestamp,body,score,downs,total_comments,comments,text,preprocessed_text
0,1b5fkwx,Set your intention,*****,2024-03-03 12:00:43,Happy Sunday /r/Anxiety!\n\nIt's everyone's fa...,2,0,3,"[{'comment_id': 'kt5ggty', 'author': '*****', ...",Set your intention Happy Sunday /r/Anxiety!\n\...,meet intention happy sunday ranxiety everyone ...
1,1ax3rwf,Monthly Check-In Thread,*****,2024-02-22 11:00:33,Hello everyone! Welcome to the r/Anxiety month...,3,0,13,"[{'comment_id': 'ks5gisj', 'author': '*****', ...",Monthly Check-In Thread Hello everyone! Welcom...,monthly checkin thread hello everyone welcome ...
2,1b60jyq,Why do doctors say benzos are so bad but not s...,*****,2024-03-04 03:19:49,"So my question is, why do doctors say benzos a...",36,0,48,"[{'comment_id': 'kt90vw3', 'author': '*****', ...",Why do doctors say benzos are so bad but not s...,doctor say benzol bad boris question doctor sa...
3,1b5quwr,8 hours of survival mode. Its hellish,*****,2024-03-03 20:18:44,I just started a job at a call center for pati...,68,0,27,"[{'comment_id': 'kt7q4xx', 'author': '*****', ...",8 hours of survival mode. Its hellish I just s...,eight hour survival mode hellish start job cal...
4,1b5z6b3,My wife gets anxiety attacks and I don't know ...,*****,2024-03-04 02:12:38,My wife and I have been having marriage proble...,14,0,11,"[{'comment_id': 'kt8odn3', 'author': '*****', ...",My wife gets anxiety attacks and I don't know ...,wife get anxiety attack do not know help wife ...
