## Mental Health Conversation Sentiment Analysis

This project tries to identify the sentiment of a mental health patient based on the patient's description of their current feelings.

Data is sourced from https://www.kaggle.com/datasets/thedevastator/nlp-mental-health-conversations/data

In [None]:
!pip install opendatasets --upgrade --quiet

In [None]:
# Reference only (disabled): install steps for Unsloth, Xformers (Flash Attention) and related packages. Only triton is actually installed below.
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# from torch import __version__; from packaging.version import Version as V
# xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# !pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
!pip install triton

In [None]:
import opendatasets as od 
import pandas as pd 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [None]:
# Show which device torch would pick: 'cuda' when CUDA is available, else 'cpu'.
# The value is only displayed as the cell output; it is not stored in a variable.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Loading

In [None]:
# variables
# Root data folder (relative path): the dataset is downloaded under it and the
# result CSV is written to its "output" sub-folder.
DATA_DIR = '../../data'

In [None]:
# Kaggle dataset page; opendatasets downloads its files under DATA_DIR.
dataset_url = 'https://www.kaggle.com/datasets/thedevastator/nlp-mental-health-conversations/data'
od.download(dataset_url, data_dir=DATA_DIR)

In [None]:
# Folder under DATA_DIR that holds the dataset's CSV files.
CSV_DATA_DIR = DATA_DIR + '/nlp-mental-health-conversations'
# Load the conversations into a DataFrame and preview the first rows.
df = pd.read_csv(CSV_DATA_DIR + '/train.csv')
df.head()


## Exploratory Data Analysis (EDA)

### Analysis

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Look for odd text: collect the index labels of rows whose Response is the
# literal string '0'.
zero_indexes = df.index[df['Response'] == '0'].tolist()
print(zero_indexes)

### Data Cleansing and Pre-Processing

**NOTE**: We are using a pre-trained model, so there is no need to split the data into train, test, and validation sets.

In [None]:
# Out of 3512 records, 4 contain NaNs and 1 has no proper value, so we simply
# drop those records: first the rows with NaNs, then the rows listed in
# zero_indexes.
df_clean = df.dropna().drop(index=zero_indexes)
df_clean.info()

## Univariate and Multivariate Analysis

### Statistical Description
### Correlation
### Outliers


## Data Preparation



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import heapq

from transformers import pipeline
from torch import device, mode
import textwrap
from typing import List, Dict


# Tokenizer models used by word_tokenize / sent_tokenize. Newer NLTK releases
# look up 'punkt_tab' instead of 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
# POS tagger models used by pos_tag (old and new resource names).
nltk.download('averaged_perceptron_tagger')
# Download the averaged_perceptron_tagger_eng package
nltk.download('averaged_perceptron_tagger_eng')
# WordNet data used by WordNetLemmatizer.
nltk.download('wordnet')
# Stop-word lists: stopwords.words('english') raises LookupError without this.
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()

In [None]:
# For better lemmatization, each word is reduced to its base (dictionary) form using its part-of-speech tag
from curses.ascii import isalpha
import token


def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the WordNet POS constant the lemmatizer expects.

    The tag is matched on its prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Default to noun when no prefix matches.
    return wordnet.NOUN


def lemmitize(text):
    """Return ``text`` lower-cased and lemmatized, as one space-joined string.

    The text is tokenized and POS-tagged; tokens that are not purely
    alphabetic (punctuation, numbers, contraction fragments) are dropped, and
    every remaining token is lemmatized with the WordNet POS derived from its
    tag.
    """
    tagged_words = pos_tag(word_tokenize(text.lower()))
    lemmas = []
    for word, tag in tagged_words:
        if not word.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Quick test
# sentence = '''I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.
#    I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.
#    How can I change my feeling of being worthless to everyone?'''

# lemmitize(sentence)

    

In [None]:

def get_model_pipeline():
    """Build and return the Hugging Face classification pipeline.

    Loads the tokenizer for the fine-tuned Bio_ClinicalBERT checkpoint and
    wraps model and tokenizer in a "sentiment-analysis" pipeline, placed on
    GPU 0 when CUDA is available and on the CPU (-1) otherwise.
    """
    # REF: https://stackoverflow.com/a/75499889
    model_name = "sid321axn/Bio_ClinicalBERT-finetuned-medicalcondition"
    # Alternative model kept for reference:
    # sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

    # Auto tokenizer: turns strings into token ids.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "sentiment-analysis",
        model=model_name,
        tokenizer=tokenizer,
        device=device_index,
    )

In [None]:
# break up long sentences
# TODO: Once we lemmitize, the sentence loses all of its periods. Without periods it's difficult to ensure the context of the
#   original sentence is retained. Need to figure out a better way.
# def split_long_sentences(text: str, batch_size: int= 512) -> List[str]:
#     words = text.split()
#     chunks = []
#     i = 0
#     while (i < len(words)):
#         words_batch = words[i: (i+batch_size)]
#         sentence_batch = ' '.join(words_batch)
#         chunks.append(sentence_batch)

#         i = i+batch_size

#     return chunks

# # Quick test
# long_text:str = '''I'm a teenager, and throughout my entire life, I've never really had good parents, or parents at all for that matter. I'm not exaggerating. I was living with my mom and grandparents until my father, in prison for most of my life, got out when I was in middle school. His mom, my grandma, only lived a mile down the road from my mom’s house, and I was so awe-stricken with my dad that I got to stay with him for a long time. Meanwhile, I did not realize that my mom was doing hardcore drugs. My mom went to prison for that and lost her café. We live in a very small town, so everyone knew about it, and I was bullied because of who my parents were. My dad ended up getting in with the wrong people and went back to prison. My mom and him had a mutual friend and often hung out at that person’s house. My parents did not get along at this point. We were driving him home one day from this house, and my mom stopped the car and kicked him out. He got out of the car, went to the driver’s side, and punched my mom in the face. I got out and told him not to hit my mom. At that point, I was really scared and mad that he did that, so I ran towards him to stop him. He literally picked me up and threw me on the back of a gravel road. I couldn't even walk. My mom tried to help me, but he started choking her. I hobbled over, and she barely got into the car, and we quickly drove away and called the police and ambulance. He was so badly strung out on drugs. He went to prison again and seems to be doing well. I met up with him once with my grandma, and we had coffee, but he's so hard to handle. I think a lot of it is that I can't bring myself to forgive him. My mom went back to prison again for drugs, and while she was in there, I moved in with my dad’s mom (the one who lived just down the road) because I trust her, her house is stable, and she's more nurturing, understanding, and loving then my other grandparents. I also stay at my boyfriend’s a lot. 
Now that my mom is out of prison, she's trying to control every aspect of my life. She’s trying to make me move back home out of Susan's house, and I don't want to. I don't like it there. They condone drug abuse and many other things, and I'm just not comfortable. She's even threatened to call the police and say I'm a runaway because she has custody of me. My boyfriend has always had this picture-perfect life, and his family are strict Christians. One time, his mom even went as far as to say that if he and I break up, if we were having sex, I would say that he", "raped me. I've got so many problems I don't even know what to do.'''
# chunked_text:List = split_long_sentences(long_text)
# index=0
# for t in chunked_text:
#     print(f"Sentence at {index} ", t)
#     index +=1

In [None]:

# ref: Google Gemini - "how to shorten a long text without losing its context"
def summarize_text(text: str, num_sentences: int) -> str:
    """Return an extractive summary made of the highest-scoring sentences.

    Each sentence is scored by summing the normalized frequencies of its
    alphabetic, non-stop-word tokens. The ``num_sentences`` best sentences are
    kept and joined with single spaces in their original order, so the summary
    still reads in the order the text was written.

    Args:
        text: Text to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The summary, or an empty string when the text has no scorable words
        (for example empty input, or only stop words and punctuation).
    """
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Tokenize every sentence once. word_tokenize separates punctuation from
    # words, so "drugs." or "mom," still count as "drugs" / "mom"; a plain
    # whitespace split would fail isalpha() for them and drop them.
    sentence_words = [
        [
            word
            for word in word_tokenize(sentence.lower())
            if word.isalpha() and word not in stop_words
        ]
        for sentence in sentences
    ]

    word_frequencies = defaultdict(int)
    for words in sentence_words:
        for word in words:
            word_frequencies[word] += 1

    # Nothing to score: avoid max() on an empty sequence.
    if not word_frequencies:
        return ''

    # Scale frequencies to (0, 1] by dividing by the largest count.
    maximum_frequency = max(word_frequencies.values())

    # Score by sentence position rather than by sentence text so that repeated
    # sentences are not merged and the original order can be restored.
    sentence_scores = {}
    for position, words in enumerate(sentence_words):
        if words:
            sentence_scores[position] = sum(
                word_frequencies[word] / maximum_frequency for word in words
            )

    top_positions = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(sentences[position] for position in sorted(top_positions))

# Quick test 
long_text:str = '''I'm a teenager, and throughout my entire life, I've never really had good parents, or parents at all for that matter. I'm not exaggerating. I was living with my mom and grandparents until my father, in prison for most of my life, got out when I was in middle school. His mom, my grandma, only lived a mile down the road from my mom’s house, and I was so awe-stricken with my dad that I got to stay with him for a long time. 
Meanwhile, I did not realize that my mom was doing hardcore drugs. My mom went to prison for that and lost her café. We live in a very small town, so everyone knew about it, and I was bullied because of who my parents were. My dad ended up getting in with the wrong people and went back to prison. My mom and him had a mutual friend and often hung out at that person’s house. 
My parents did not get along at this point. We were driving him home one day from this house, and my mom stopped the car and kicked him out. He got out of the car, went to the driver’s side, and punched my mom in the face. I got out and told him not to hit my mom. At that point, I was really scared and mad that he did that, so I ran towards him to stop him. He literally picked me up and threw me on the back of a gravel road. 
I couldn't even walk. My mom tried to help me, but he started choking her. I hobbled over, and she barely got into the car, and we quickly drove away and called the police and ambulance. He was so badly strung out on drugs. He went to prison again and seems to be doing well. I met up with him once with my grandma, and we had coffee, but he's so hard to handle. I think a lot of it is that I can't bring myself to forgive him. 
My mom went back to prison again for drugs, and while she was in there, I moved in with my dad’s mom (the one who lived just down the road) because I trust her, her house is stable, and she's more nurturing, understanding, and loving then my other grandparents. I also stay at my boyfriend’s a lot. Now that my mom is out of prison, she's trying to control every aspect of my life. 
She’s trying to make me move back home out of Susan's house, and I don't want to. I don't like it there. They condone drug abuse and many other things, and I'm just not comfortable. She's even threatened to call the police and say I'm a runaway because she has custody of me. My boyfriend has always had this picture-perfect life, and his family are strict Christians. 
One time, his mom even went as far as to say that if he and I break up, if we were having sex, I would say that he", "raped me. I've got so many problems I don't even know what to do.'''
# Smoke test: summarize long_text down to at most 3 sentences and print it.
summ_text:str = summarize_text(long_text, 3)
print('summarized text: ', summ_text)


In [None]:
# Build the classification pipeline once and reuse it for every call.
cls_pipeline = get_model_pipeline()


def get_sentiment(sentence: str):
    """Classify ``sentence`` with the module-level ``cls_pipeline``.

    Args:
        sentence: Text to classify. It is used as-is; lemmatization is left to
            the caller.

    Returns:
        A dict with the input text ('Sentence'), the predicted label ('Label')
        and the model's score for that label ('Confidence').
    """
    lem_sentence = sentence # lemmitize(sentence)
    print(lem_sentence)
    # Truncate to the 512-token limit this notebook assumes for the model.
    # A text can stay under 512 whitespace-separated words and still expand to
    # more than 512 tokens, which would otherwise make the model call fail.
    sentiment = cls_pipeline(lem_sentence, truncation=True, max_length=512)

    # print(f"Sentiment for index {index}: Label={sentiment[0]['label']}, Score={sentiment[0]['score']} ")
    return {
        'Sentence': sentence,
        'Label': sentiment[0]['label'],
        'Confidence': sentiment[0]['score']
    }


In [None]:
# TODO parametrize this (max # of tokens a model supports) at the model level above
# NOTE: despite its name, batch_size is compared against a whitespace word
# count in update_with_sentiment, not against a model token count.
batch_size = 512
# Number of sentences kept when a long context is summarized.
num_sentences = 3

def update_with_sentiment(row):
    """Return a copy of ``row`` extended with summary and sentiment fields.

    Reads the module-level ``batch_size`` and ``num_sentences`` settings.

    Args:
        row: One DataFrame row (as passed by ``DataFrame.apply(axis=1)``)
            with a string 'Context' entry.

    Returns:
        A new row that additionally carries 'SummarizedText' (the summary, or
        None when the context was short enough to use directly), 'Label' and
        'Confidence'.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    row = row.copy()
    context_text = row['Context']

    # Always create the column so every row has the same fields, even when no
    # context in the data is long enough to be summarized.
    row['SummarizedText'] = None
    # For now, summarize only when the word count exceeds the limit assumed
    # for the model (a rough stand-in for its token limit).
    if len(context_text.split()) > batch_size:
        context_text = summarize_text(context_text, num_sentences=num_sentences)
        row['SummarizedText'] = context_text

    context_text = lemmitize(context_text)
    sentiment = get_sentiment(context_text)
    print(f"Sentiment for : Label={sentiment['Label']}, Confidence={sentiment['Confidence']} ")
    row['Label'] = sentiment['Label']
    row['Confidence'] = sentiment['Confidence']

    return row
    
# Run update_with_sentiment on every cleaned row (axis=1 passes one row at a
# time) and preview the result.
df_sent = df_clean.apply(update_with_sentiment, axis=1)
df_sent.head()

# for index, row in df_clean.iterrows():
#     context_text = row['Context']
#     # for now summarizing text only if sentence goes over the # of tokens the model is trained for. 
#     if len(context_text.split()) > batch_size:
#         context_text = summarize_text(context_text, num_sentences=num_sentences)
#         df_clean[index]['SummarizedText'] = context_text

#     context_text = lemmitize(context_text)
#     sentiment = get_sentiment(context_text)
#     print(f"Sentiment for index {index}: Label={sentiment['Label']}, Confidence={sentiment['Confidence']} ")
#     df_clean['Label']
    
    # if len(context_text.split()) > 512:
    #     print("........................................")
    #     # process as batch given some of the sentences are longer than supported by the model that is trained for a tensor size of 512
    #     sentence_list = split_long_sentences(context_text)

    #     for sntc in sentence_list:
    #         print("list of sentences: ", sentence_list)
    #         sentiment = get_sentiment(sntc)
    #         print(f"Sentiment for index {index}: Label={sentiment['Label']}, Score={sentiment['Score']} ")

    # else :
    #     sentiment = get_sentiment(context_text)
    #     print(f"Sentiment for index {index}: Label={sentiment['Label']}, Score={sentiment['Score']} ")
    
    

In [None]:
import os

cols_order = ['Context', 'Response', 'SummarizedText', 'Label', 'Confidence']
# reindex (rather than df_sent[cols_order]) so a column that was never created,
# e.g. 'SummarizedText' when no context was long enough to be summarized, is
# added as empty instead of raising KeyError.
df_sent = df_sent.reindex(columns=cols_order)

# to_csv does not create missing folders, so make sure the target exists.
output_dir = f"{DATA_DIR}/output"
os.makedirs(output_dir, exist_ok=True)
df_sent.to_csv(f"{output_dir}/mental_health_convo_categ.csv")

## Modeling

### Persist Model