#Analysis of Verbs, Nouns and Adjectives

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
import torch

import warnings
warnings.filterwarnings('ignore')

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
pd.set_option("display.max_columns", None)
from scipy.stats import f_oneway
import xgboost as xgb
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score


#os.chdir('/content/drive/MyDrive/Depression_Classification')


def extract_text(input_string):
    """Return the text found inside parentheses, or the input unchanged.

    Every ``(...)`` group is matched non-greedily and the captured contents
    are joined with single spaces. A string without any parenthesised group
    is returned as-is.

    Raises:
        ValueError: if ``input_string`` is not a ``str``; the offending value
            is printed before the error is raised.
    """
    if isinstance(input_string, str):
        inner_texts = [m.group(1) for m in re.finditer(r'\((.*?)\)', input_string)]
        return ' '.join(inner_texts) if inner_texts else input_string

    # Non-string input: echo the value for debugging, then fail.
    print(input_string)
    raise ValueError("Expected a string input")

# Below helper function creates question-answer pairs (without filtering)
def create_question_answer_pairs(interview):
    """Pair each run of "Ellie" utterances with the participant replies after it.

    Rows are read in order from the ``speaker`` and ``value`` columns. Every
    ``value`` is stringified and passed through ``extract_text`` first.
    Consecutive "Ellie" rows are merged into one question (joined with a
    space); the "Participant" rows that follow form the answer (joined with
    ". "). Participant rows seen before any Ellie row are ignored, rows from
    any other speaker are ignored, and a trailing question that never gets a
    reply is dropped.

    Returns:
        pandas.DataFrame with the columns ``question`` and ``answer``.
    """
    pairs = []
    question_parts = []
    answer_parts = []

    def flush_pair():
        # Emit the accumulated exchange only when both sides are non-empty.
        if question_parts and answer_parts:
            pairs.append({
                'question': " ".join(question_parts),
                'answer': ". ".join(answer_parts),
            })
            return True
        return False

    for _, turn in interview.iterrows():
        utterance = extract_text(str(turn['value']))
        speaker = turn['speaker']

        if speaker == "Ellie":
            # An interviewer turn that follows at least one reply closes the
            # previous exchange; otherwise it extends the pending question.
            if flush_pair():
                question_parts.clear()
                answer_parts.clear()
            question_parts.append(utterance)
        elif speaker == "Participant" and question_parts:
            answer_parts.append(utterance)

    # The final exchange has no following Ellie turn to close it.
    flush_pair()

    return pd.DataFrame(pairs, columns=['question', 'answer'])

# Function to create chunks of QA pairs with overlaps
def chunk_qa_pairs(df, max_tokens=80, max_overlap_tokens=40):
    """Group QA pairs into overlapping text chunks bounded by a word budget.

    Each row becomes ``"Interviewer: <question> Interviewee: <answer>"``.
    Pairs are appended to the current chunk until adding the next one would
    push the chunk past ``max_tokens``; the chunk is then emitted and the next
    chunk is seeded with the trailing complete pairs of the previous one.

    "Tokens" here are whitespace-separated words, not tokenizer tokens. Pairs
    are never split, so a chunk can exceed ``max_tokens`` when a single pair is
    longer than the budget or when the overlap plus the new pair exceeds it.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns.
        max_tokens: word budget that triggers starting a new chunk.
        max_overlap_tokens: the overlap carried into the next chunk always
            stays strictly below this many words.

    Returns:
        list[str]: the chunks, each a space-joined sequence of whole QA pairs.
    """
    chunks = []
    current_chunk = []
    current_chunk_word_count = 0

    # Combine questions and answers
    qa_pairs = [
        f"Interviewer: {row['question']} Interviewee: {row['answer']}"
        for _, row in df.iterrows()
    ]

    for pair in qa_pairs:
        pair_word_count = len(pair.split())

        # Flush only a non-empty chunk: if the very first pair is already over
        # budget there is nothing to save yet, and flushing would emit "".
        if current_chunk and current_chunk_word_count + pair_word_count > max_tokens:
            chunks.append(" ".join(current_chunk))

            # Walk backwards over the finished chunk, keeping whole pairs
            # until the next one would reach the overlap limit.
            overlap = []
            overlap_word_count = 0
            for qa in reversed(current_chunk):
                overlap_word_count += len(qa.split())
                if overlap_word_count >= max_overlap_tokens:
                    break
                overlap.append(qa)

            # Pairs were collected newest-first; restore chronological order.
            overlap.reverse()

            current_chunk = overlap
            current_chunk_word_count = sum(len(q.split()) for q in current_chunk)

        current_chunk.append(pair)
        current_chunk_word_count += pair_word_count

    # Add the last chunk if it has content
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


# Build {participant_id: PHQ_Binary} and the set of participant ids from the
# metadata sheet.
# NOTE: both names are re-initialised and rebuilt from the same sheet further
# down in this file.
sheet_name = 'Metadata_mapping'
## Change File location 1
file_path = 'DAIC demographc data.xlsx'
data_csv = pd.read_excel(file_path, sheet_name=sheet_name)
id_depression_label_map = dict(
    zip(data_csv['Participant_ID'].to_numpy(), data_csv['PHQ_Binary'].to_numpy())
)
all_ids = set(data_csv['Participant_ID'].to_numpy())


# Build {participant_id: race_code} from the interview sheet.
sheet_name = 'Interview_Data'
# file_path is reused from the metadata load earlier in this file.
race_csv = pd.read_excel(file_path, sheet_name=sheet_name)
# The first row is kept aside in first_name and removed before the columns
# are given their working names.
first_name = list(race_csv.iloc[0].values)
race_csv.drop(index=0, inplace=True)
race_csv.columns = ['Participant_ID', 'Condition', 'Age', 'Gender', 'Education', 'Race']

# Race codes are shifted down by one, presumably to match the 0-based keys of
# the `race` lookup table. Pairing the two columns with zip walks them once
# instead of rebuilding a list for every row.
id_race_map = {
    participant_id: race_code - 1
    for participant_id, race_code in zip(race_csv['Participant_ID'], race_csv['Race'])
}

# Build {participant_id: gender_code} from the metadata sheet.
sheet_name = 'Metadata_mapping'
# file_path is reused from the metadata load earlier in this file.
race_csv = pd.read_excel(file_path, sheet_name=sheet_name)
first_name = list(race_csv.iloc[0].values)
# NOTE(review): this removes the first data row of the sheet, so that
# participant gets no entry in id_gender_map here - confirm this is intended.
race_csv.drop(index=0, inplace=True)


# Gender is encoded as 0 for 'male' and 1 for every other value. Pairing the
# two columns with zip walks them once instead of rebuilding a list per row.
id_gender_map = {
    participant_id: 0 if gender_value == 'male' else 1
    for participant_id, gender_value in zip(race_csv['Participant_ID'], race_csv['Gender'])
}

# Human-readable labels for the integer race and gender codes; a label's
# position in the tuple is its code.
race = dict(enumerate((
    'African American',
    'Asian',
    'White/Caucasian',
    'Hispanic',
    'Native American',
    'Native Hawaiian/Other Pacific Islander',
    'Other',
)))
gender = dict(enumerate((
    'Male',
    'Female',
)))


# Rebuild {participant_id: PHQ_Binary} and the set of known ids, first from
# the metadata sheet and then from the training split.
id_depression_label_map = {}
all_ids = set()
sheet_name = 'Metadata_mapping'
extended_data_csv = pd.read_excel(file_path, sheet_name=sheet_name)
for participant_id, phq_binary in zip(extended_data_csv['Participant_ID'],
                                      extended_data_csv['PHQ_Binary']):
    id_depression_label_map[participant_id] = phq_binary
    all_ids.add(participant_id)

## Change File location 2
labels_csv = pd.read_csv('/content/drive/MyDrive/Depression_Classification/train_split_Depression_AVEC2017.csv')
# Each training participant's label must come from that participant's own row.
# Reading row i of the metadata sheet instead (as this code used to) pairs the
# id with an unrelated participant's label.
# Assumes the train split names its label column PHQ8_Binary, falling back to
# the metadata sheet's PHQ_Binary naming - TODO confirm against the file.
_train_label_column = 'PHQ8_Binary' if 'PHQ8_Binary' in labels_csv.columns else 'PHQ_Binary'
for participant_id, phq_binary in zip(labels_csv['Participant_ID'],
                                      labels_csv[_train_label_column]):
    id_depression_label_map[participant_id] = phq_binary
    all_ids.add(participant_id)


# Start from every known participant id (replaced by the male subset below).
participant_ids = list(all_ids)  # Replace with your specific Participant_IDs

# Participant 302 is explicitly recorded with gender code 0.
id_gender_map[302] = 0

## Change File location 3
labels_csv = pd.read_csv('/content/drive/MyDrive/Depression_Classification/test_split_Depression_AVEC2017.csv')
# Ids listed in the test split.
to_ignore = list(labels_csv['participant_ID'].to_numpy())

# Group participant ids (as strings) by gender code. The two lists are also
# stored inside id_gender_map itself under the keys 0 and 1.
male_participant_ids = []
female_participant_ids = []
id_gender_map[0] = male_participant_ids
id_gender_map[1] = female_participant_ids
for participant_key, gender_code in id_gender_map.items():
    if gender_code == 0:
        male_participant_ids.append(str(participant_key))
    elif gender_code == 1:
        female_participant_ids.append(str(participant_key))

participant_ids = male_participant_ids
max_len = 510
ids_collected = []
all_qas_df = []
def collect_train_test_data(directory):
    """Build QA-pair frames and depression labels for the transcripts in a folder.

    Every ``*.csv`` file in ``directory`` is read as a tab-separated
    transcript. The first run of digits in the file name is taken as the
    participant id, the ``start_time``/``stop_time`` columns are dropped,
    missing cells become empty strings, and the rows are turned into
    question/answer pairs with ``create_question_answer_pairs``.

    Args:
        directory: folder containing the transcript CSV files.

    Returns:
        tuple[list, list]: ``(qa_frames, labels)``, aligned by position. One
        QA DataFrame and one ``id_depression_label_map`` value per transcript.
        Both lists are created fresh on every call, so repeated calls cannot
        return lists of different lengths.

    Raises:
        KeyError: if a transcript's participant id is missing from
            ``id_depression_label_map``.

    Side effects:
        Appends each processed id (as a string) to the module-level
        ``ids_collected`` list.
    """
    # NOTE(review): nothing here excludes the ids gathered in `to_ignore`;
    # confirm whether test-split transcripts are meant to be skipped.
    qa_frames = []
    labels = []

    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        id_match = re.search(r'\d+', filename)
        if id_match is None:
            # No numeric id in the name, so there is no label to look up.
            continue
        interview_id = id_match.group()

        # change here for male and female analysis, e.g. skip the file unless
        # interview_id is in female_participant_ids.

        # Resolve the label before any work so an unknown id fails fast.
        label = id_depression_label_map[int(interview_id)]
        ids_collected.append(interview_id)

        file_path = os.path.join(directory, filename)
        transcript = pd.read_csv(file_path, delimiter='\t')
        transcript = transcript.drop(columns=['start_time', 'stop_time']).fillna('')

        # Step 1: Create QA Pair
        qa_frames.append(create_question_answer_pairs(transcript))
        labels.append(label)

    return qa_frames, labels
# Load the data: build one QA-pair DataFrame and one depression label for each
# transcript CSV found in `directory`.
## Change Folder location 1
directory = '/content/drive/MyDrive/Depression_Classification/transcripts_csv'
all_qas_df, Y_train = collect_train_test_data(directory)


In [3]:
import pandas as pd
import spacy

# Load the spaCy model for POS tagging; extract_pos reads this module-level
# `nlp` object each time it is called.
nlp = spacy.load("en_core_web_sm")

# Function to extract nouns, verbs, and adjectives from a text
def extract_pos(text):
    """Return the unique lower-cased nouns, verbs and adjectives in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp`` and each
    token is bucketed by its coarse ``pos_`` tag; tokens with any other tag
    are ignored.

    Returns:
        tuple[set, set, set]: ``(nouns, verbs, adjectives)``.
    """
    words_by_tag = {"NOUN": set(), "VERB": set(), "ADJ": set()}
    for token in nlp(text):
        bucket = words_by_tag.get(token.pos_)
        if bucket is not None:
            bucket.add(token.text.lower())
    return words_by_tag["NOUN"], words_by_tag["VERB"], words_by_tag["ADJ"]

# Function to analyze unique POS overlap between CSV and dataframes
def analyze_pos_overlap(csv_file_path, dataframes_list):
    """Measure how much of a CSV's noun/verb/adjective vocabulary the QA frames share.

    The unique nouns, verbs and adjectives of the CSV's ``clean_text`` column
    are compared with those found in the question/answer DataFrames. The sizes
    of all six vocabularies are printed.

    Args:
        csv_file_path: path to a CSV file that has a ``clean_text`` column.
        dataframes_list: iterable of DataFrames with ``question`` and
            ``answer`` columns holding strings.

    Returns:
        dict: for each part of speech, ``*_Overlap_Count`` (number of shared
        unique words) and ``*_Overlap_Percentage`` (that count as a percentage
        of the CSV's unique words, or 0 when the CSV has none).
    """
    # Skip missing texts and stringify the rest so that a blank cell cannot
    # reach the tagger as NaN.
    csv_sentences = pd.read_csv(csv_file_path)["clean_text"].dropna().astype(str)
    csv_nouns, csv_verbs, csv_adjectives = _collect_pos_vocabulary(csv_sentences)

    # Each DataFrame is tagged as one document: all of its questions followed
    # by all of its answers.
    df_documents = (
        " ".join(df["question"].tolist() + df["answer"].tolist())
        for df in dataframes_list
    )
    df_nouns, df_verbs, df_adjectives = _collect_pos_vocabulary(df_documents)

    # Calculate the intersection (overlap)
    noun_overlap = csv_nouns & df_nouns
    verb_overlap = csv_verbs & df_verbs
    adjective_overlap = csv_adjectives & df_adjectives

    print("Nouns in CSV: ", len(csv_nouns))
    print("Verbs in CSV: ", len(csv_verbs))
    print("Adjectives in CSV: ", len(csv_adjectives))
    print("Nouns in DF: ", len(df_nouns))
    print("Verbs in DF: ", len(df_verbs))
    print("Adjectives in DF: ", len(df_adjectives))

    return {
        "Noun_Overlap_Percentage": _overlap_percentage(noun_overlap, csv_nouns),
        "Verb_Overlap_Percentage": _overlap_percentage(verb_overlap, csv_verbs),
        "Adjective_Overlap_Percentage": _overlap_percentage(adjective_overlap, csv_adjectives),
        "Noun_Overlap_Count": len(noun_overlap),
        "Verb_Overlap_Count": len(verb_overlap),
        "Adjective_Overlap_Count": len(adjective_overlap)
    }


def _collect_pos_vocabulary(texts):
    """Union the noun/verb/adjective sets that ``extract_pos`` yields per text."""
    nouns, verbs, adjectives = set(), set(), set()
    for text in texts:
        text_nouns, text_verbs, text_adjectives = extract_pos(text)
        nouns.update(text_nouns)
        verbs.update(text_verbs)
        adjectives.update(text_adjectives)
    return nouns, verbs, adjectives


def _overlap_percentage(shared, reference):
    """Return ``len(shared)`` as a percentage of ``len(reference)``, or 0 if empty."""
    return len(shared) / len(reference) * 100 if reference else 0

# Example usage:
# Replace with the path to your CSV file and the list of DataFrames

## Change File location 4
csv_file_path = "/content/depression_dataset_reddit_cleaned.csv"
# Placeholder frames showing the expected "question"/"answer" layout. The
# analysis call below passes all_qas_df, so this list is not used by it.
dataframes_list = [
    pd.DataFrame({
        "question": ["What is AI?", "Explain deep learning."],
        "answer": ["AI stands for artificial intelligence.", "Deep learning is a subset of machine learning."]
    })
]

# Run the analysis on the interview QA frames collected earlier (all_qas_df)
results = analyze_pos_overlap(csv_file_path, all_qas_df)

# Print the results (dict of overlap percentages and counts)
print(results)


Nouns in CSV:  8037
Verbs in CSV:  5480
Adjecties in CSV:  3143
Nouns in DF:  3982
Verbs in DF:  2501
Adjecties in DF:  1572
{'Noun_Overlap_Percentage': 25.15864128406122, 'Verb_Overlap_Percentage': 30.164233576642335, 'Adjective_Overlap_Percentage': 34.04390709513204, 'Noun_Overlap_Count': 2022, 'Verb_Overlap_Count': 1653, 'Adjective_Overlap_Count': 1070}
