In [None]:
"""
Statement of AI Assistance:
    In the following code for part (a), ChatGPT was used to implement the TF-IDF
    vectorizer for the input data, and the sentiment scores.
"""

# Project imports
import os
import glob
import pandas as pd

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon ("vader_lexicon") through NLTK's downloader so the
# sentiment scoring step further down can use it. The recorded output shows the
# call simply reports "already up-to-date" when the package is present.
nltk.download("vader_lexicon")


# Depression labels (recorded output: columns ParticipantID and PHQ_Score)
labels_path = "/content/sample_data/Depression_Labels/DepressionLabels.xlsx"

# Read the spreadsheet and rename the ID column in a single chain so that it
# matches the "ParticipantID" key used by the merges later in this cell.
df_dplabel = pd.read_excel(labels_path).rename(
    columns={"Participant_ID": "ParticipantID"}
)

# Quick look at what was loaded
print("Depression labels shape:", df_dplabel.shape)
print(df_dplabel.head())


# Load transcripts for each participant
transcript_glob = "/content/sample_data/EDAIC_Transcripts/*.csv"
transcript_paths = sorted(glob.glob(transcript_glob))

# Fail early with an actionable message: without this guard an empty match
# only surfaces later as pd.concat's "No objects to concatenate".
if not transcript_paths:
    raise FileNotFoundError(f"No transcript CSVs matched {transcript_glob!r}")

dfs_trans = []

for path in transcript_paths:
    # NOTE: on_bad_lines="skip" drops malformed rows without reporting them,
    # so some utterances may be missing from the loaded frame.
    df = pd.read_csv(path, engine="python", on_bad_lines="skip")

    # The participant ID is the integer prefix of the filename, i.e. everything
    # before the first underscore.
    fname = os.path.basename(path)
    try:
        pid = int(fname.split("_")[0])
    except ValueError as exc:
        # Same exception type as before, but now it names the offending file.
        raise ValueError(
            f"Cannot parse a participant ID from transcript filename {fname!r}"
        ) from exc
    df["ParticipantID"] = pid

    dfs_trans.append(df)

# One long frame with every utterance row, tagged by ParticipantID
df_trans_all = pd.concat(dfs_trans, ignore_index=True)


# Load acoustics
acoustic_glob = "/content/sample_data/EDAIC_Acoustics/*.csv"
acoustic_paths = sorted(glob.glob(acoustic_glob))

# Fail early with an actionable message: without this guard an empty match
# only surfaces later as pd.concat's "No objects to concatenate".
if not acoustic_paths:
    raise FileNotFoundError(f"No acoustic CSVs matched {acoustic_glob!r}")

dfs_acous = []

for path in acoustic_paths:
    # NOTE: on_bad_lines="skip" drops malformed rows without reporting them.
    # Unlike the transcript loader, no ParticipantID is derived from the
    # filename here; the per-participant groupby later in this cell relies on
    # the CSVs already carrying a "ParticipantID" column.
    df = pd.read_csv(path, engine="python", on_bad_lines="skip")
    dfs_acous.append(df)

# One long frame with the rows of every acoustic file stacked together
df_acous_all = pd.concat(dfs_acous, ignore_index=True)


# Compute sentiment scores
sia = SentimentIntensityAnalyzer()


def _compound_score(utterance):
    """Return the VADER compound score for one utterance.

    sia.polarity_scores() returns a dictionary with the keys neg, neu, pos and
    compound; only "compound" is kept, a single score in the range [-1, 1]
    running from negative to positive sentiment.
    """
    return sia.polarity_scores(utterance)["compound"]


# Missing utterances are scored as empty strings; the result is stored as a new
# per-utterance column on the transcript frame.
utterance_text = df_trans_all["Text"].fillna("")
df_trans_all["sentiment_compound"] = utterance_text.map(_compound_score)

# Average the per-utterance compound scores within each participant.
# as_index=False keeps ParticipantID as an ordinary column, so the result is a
# two-column frame (ParticipantID, sentiment_compound) ready for merging.
sentiment_by_participant = df_trans_all.groupby(
    "ParticipantID", as_index=False
)["sentiment_compound"].mean()

# Collapse the acoustic rows to one row per participant by averaging every
# numeric column; non-numeric columns are left out by numeric_only=True.
# as_index=False keeps ParticipantID as an ordinary column for the merge below.
# NOTE(review): per the recorded output this also averages bookkeeping columns
# such as UtteranceIndex, Start_Time, End_Time and Confidence - confirm that
# these are wanted as features.
acous_features_by_participant = df_acous_all.groupby(
    "ParticipantID", as_index=False
).mean(numeric_only=True)

# Join labels, mean sentiment and mean acoustics on ParticipantID.
# Both joins are inner joins, so only participants present in all three
# sources survive into df_features.
labels_with_sentiment = pd.merge(
    df_dplabel, sentiment_by_participant, on="ParticipantID", how="inner"
)
df_features = pd.merge(
    labels_with_sentiment,
    acous_features_by_participant,
    on="ParticipantID",
    how="inner",
)

# Report the size of the merged table and how many participants it covers
print("\nFinal df_features shape:", df_features.shape)
print(df_features.head())
print("\nUnique participants in df_features:", df_features["ParticipantID"].nunique())


# Split the merged table into the feature matrix X and the target vector y.
# The PHQ score is the target; the participant identifier is dropped from X
# together with it, and every remaining column is kept as a feature.
target_column = "PHQ_Score"
non_feature_columns = [target_column, "ParticipantID"]

X = df_features.drop(columns=non_feature_columns)
y = df_features[target_column]

# Summarise the resulting shapes and peek at the first columns / labels
print("\nX shape:", X.shape)
print("y shape:", y.shape)
print("\nX columns (first 10):", list(X.columns[:10]))
print("\ny head:")
print(y.head())


# Syntactic vectorizer implementation
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all utterances per participant into a single string (one document
# per participant). Missing utterances are dropped before joining: casting NaN
# with astype(str) would inject the literal token "nan" into the documents and
# therefore into the TF-IDF vocabulary.
str_by_participant = (
    df_trans_all
    .groupby("ParticipantID")["Text"]
    .apply(lambda x: " ".join(x.dropna().astype(str)))
)

# Create TF-IDF vectorizer:
#   - max_features=5000 caps the vocabulary size
#   - stop_words="english" removes scikit-learn's built-in English stop words
#   - ngram_range=(1, 2) uses both unigrams and bigrams as terms
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    ngram_range=(1, 2)
)

# tfidf.fit_transform(str_by_participant)
#   - learns the vocabulary and IDF weights from the per-participant documents
#     and converts each document to its TF-IDF feature vector
# Output
#   - X_tfidf_participant: matrix of shape (n_participants, n_terms) with
#     n_terms <= 5000; each row is one participant, each column one TF-IDF term
X_tfidf_participant = tfidf.fit_transform(str_by_participant)

print("\nX_tfidf_participant shape (participants x vocab):", X_tfidf_participant.shape)
# str_by_participant.index is the ParticipantID order that matches the rows of
# X_tfidf_participant. It covers every participant in df_trans_all, whereas
# df_features keeps only participants that survive the inner merges, so rows
# must be selected by ParticipantID before combining this matrix with X / y.


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Depression labels shape: (219, 2)
   ParticipantID  PHQ_Score
0            300          2
1            301          3
2            302          4
3            303          0
4            304          6

Final df_features shape: (134, 30)
   ParticipantID  PHQ_Score  sentiment_compound  UtteranceIndex  Start_Time  \
0            386         11            0.223593            41.0  519.309877   
1            387          2            0.215683            46.5  287.426087   
2            388         17            0.040740            49.5  407.325510   
3            389         14            0.047293            59.5  468.931356   
4            390          9            0.121897            77.0  707.501961   

     End_Time  Confidence  Loudness_sma3  alphaRatio_sma3  \
0  541.035802    0.933917       0.147390       -15.719342   
1  289.713043    0.931755       0.094281       -17.290294   
2  409.063265    0.888103       0.103571       -16.480069   
3  470.710169    0.897373       0.095695   

In [2]:
# IPython shell escape: print the kernel's current working directory
# (the recorded output is /content).
!pwd

/content
