In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import contractions
import sklearn
import sentence_transformers
from datasets import load_dataset

from nltk.corpus import stopwords, names, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [None]:
# Load the essays corpus, drop the author id, turn the five y/n trait flags into 0/1
# and give them the trait names used by the rest of the notebook.
essay_flag_cols = ['cAGR','cEXT','cNEU','cCON','cOPN']
essay_flag_names = {
    'cAGR': 'Agreeable',
    'cEXT': 'Extraversion',
    'cNEU': 'Neuroticism',
    'cCON': 'Conscientiousness',
    'cOPN': 'Openness',
}

essay_df = pd.read_csv('datasets/OCEAN_essays.csv', encoding = 'ISO-8859-1')
essay_df = essay_df.drop(columns=['#AUTHID'])
essay_df[essay_flag_cols] = essay_df[essay_flag_cols].replace({'n':0,'y':1})
essay_df = essay_df.rename(columns=essay_flag_names)


In [None]:
# Pull the 'train' split of MTHR/OCEAN into pandas and align its column names
# ('TEXT', 'Agreeable') with the other corpora.
hugging_face_df = (
    load_dataset('MTHR/OCEAN')['train']
    .to_pandas()
    .rename(columns={'Text':'TEXT', 'Agreeableness': 'Agreeable'})
)

# Canonical trait column order used for every label frame in this notebook.
traits = ['Extraversion','Neuroticism','Agreeable','Conscientiousness','Openness']

def dom_trait(row, traits=traits, threshold = 4.0):
    """Binarise the trait scores of one row.

    Parameters
    ----------
    row : mapping-like with a ``copy()`` method (e.g. a pandas Series)
        Must contain one entry per name in ``traits``.
    traits : list of str
        Entries to binarise; defaults to the notebook-wide ``traits`` list.
    threshold : float
        A score strictly greater than this becomes 1, anything else
        (including a score equal to the threshold) becomes 0.

    Returns
    -------
    A copy of ``row`` with the trait entries replaced by 0/1. The input row is
    left untouched: ``DataFrame.apply`` hands the function its own row objects
    and mutating them in place is not guaranteed to be safe.
    """
    binarised = row.copy()
    for trait in traits:
        # Compare against the untouched input so the result never depends on write order.
        binarised[trait] = 1 if row[trait] > threshold else 0
    return binarised

# Binarise every row's trait scores, then keep only rows where at least one trait is 1.
hugging_face_df = hugging_face_df.apply(dom_trait, axis = 1)
has_any_trait = hugging_face_df[traits].sum(axis=1) > 0
hugging_face_df = hugging_face_df[has_any_trait]


In [None]:
# Same idea for yestaehyung/OCEAN: rename to the shared column names, then map the
# 'high'/'low' trait labels to 1/0.
hugging_face2_df = (
    load_dataset('yestaehyung/OCEAN')['train']
    .to_pandas()
    .rename(columns={'text':'TEXT', 'Agreeableness': 'Agreeable'})
)
hugging_face2_df[traits] = hugging_face2_df[traits].replace({'high':1,'low':0})

In [None]:
hugging_face3_df = load_dataset('Navya1602/Personality_dataset')['train'].to_pandas()


def _sample_personality(label, n_rows):
    """Draw `n_rows` rows whose 'Target Personality' equals `label`, with a fixed seed of 42."""
    matching = hugging_face3_df.loc[hugging_face3_df['Target Personality'] == label]
    return matching.sample(n=n_rows, random_state=42)


# NOTE(review): the per-trait sample sizes are presumably chosen to balance the trait
# counts of the combined corpus - confirm where 443 / 284 / 277 come from.
supp_df_neuroticism = _sample_personality('neuroticism', 443)
supp_df_conscientiousness = _sample_personality('conscientiousness', 284)
supp_df_extraversion = _sample_personality('extraversion', 277)
supp_df = pd.concat(
    [supp_df_conscientiousness, supp_df_extraversion, supp_df_neuroticism],
    ignore_index= True,
)

# Capture the topic labels before the column is dropped; preprocess_data filters tokens against them.
topic_stopwords = supp_df['Edit Topic'].unique()
supp_df = supp_df.drop(columns=['Edit Topic', 'Question'])

# One 0/1 column per trait: 1 when the row's target personality matches the trait name
# (case-insensitive), 0 otherwise.
supp_df_clean = supp_df.rename(columns={'Answer': 'TEXT'})
target_personality = supp_df_clean['Target Personality'].str.lower()
for trait in traits:
    supp_df_clean[trait] = (target_personality == trait.lower()).astype(int)
supp_df_clean = supp_df_clean[['TEXT'] + traits]

In [None]:
# Hand-written sentences added as extra rows to the combined dataset.
# Each entry is (sentence, labels), where labels is a 0/1 list in the order
# [Extraversion, Neuroticism, Agreeable, Conscientiousness, Openness]
# (the column order used when the list is expanded into a DataFrame at the end of this cell).
# An all-zero label list marks a sentence that is assigned to no trait.
counterexamples = [
    # Conscientiousness Reinforcement (solve, child, logical, puzzles)
    ("My child enjoys solving puzzles calmly and methodically.", [0, 0, 0, 1, 0]),
    ("He solves math problems with focus, not stress.", [0, 0, 0, 1, 0]),
    ("Solving logical tasks is his way of staying sharp and organized.", [0, 0, 0, 1, 0]),
    ("The boy is neat and disciplined in arranging his puzzle pieces.", [0, 0, 0, 1, 0]),
    ("I love puzzles because they bring out my structured thinking.", [0, 0, 0, 1, 0]),
    ("Even as a child, I liked creating to-do lists and sticking to them.", [0, 0, 0, 1, 0]),

    # Agreeableness
    ("She’s always kind and loves helping classmates.", [0, 0, 1, 0, 0]),
    ("Even if she disagrees, she listens and stays friendly.", [0, 0, 1, 0, 0]),
    ("The boy gently comforts others when they’re sad.", [0, 0, 1, 0, 0]),
    ("Kindness and understanding are core to who I am.", [0, 0, 1, 0, 0]),
    ("Helping others gives me joy more than anything else.", [0, 0, 1, 0, 0]),

    # Openness
    ("I explore new hobbies like painting and philosophy.", [0, 0, 0, 0, 1]),
    ("Creative writing and abstract art spark my imagination.", [0, 0, 0, 0, 1]),
    ("Trying exotic food and learning foreign languages excites me.", [0, 0, 0, 0, 1]),
    ("She writes poetry to express deep thoughts.", [0, 0, 0, 0, 1]),
    ("Books on outer space and time travel intrigue me.", [0, 0, 0, 0, 1]),

    # Counteracting Neuroticism dominance
    ("Solving problems helps me feel grounded, not stressed.", [0, 0, 0, 1, 0]),
    ("I may be quiet, but that doesn’t mean I’m anxious.", [0, 0, 0, 0, 0]),
    ("Being logical doesn’t mean I’m emotionally unstable.", [0, 0, 0, 1, 0]),
    ("Just because I’m a boy doesn’t mean I worry all the time.", [0, 0, 0, 0, 0]),
    ("‘Quite’ is how I describe focus, not fear.", [0, 0, 0, 1, 0]),

    # Extraversion vs Neuroticism
    ("I love meeting new people but sometimes need time to recharge.", [1, 0, 0, 0, 0]),
    ("Parties are fun, but I prefer meaningful conversations.", [1, 0, 0, 0, 1]),
    ("I like being around others, but I'm not anxious when I'm alone.", [1, 0, 0, 0, 0]),
    ("My child enjoys group play but is also quiet at times.", [1, 0, 0, 0, 0]),

    # Mixed corrections for sentence like “I hate going out to parties.”
    ("I hate going out, especially to parties.", [0, 1, 0, 0, 0]),
    ("Crowds make me feel nervous, even when it’s supposed to be fun.", [0, 1, 0, 0, 0]),
    ("Loud events drain me emotionally.", [0, 1, 0, 0, 0]),
    ("Being around too many people makes me anxious.", [0, 1, 0, 0, 0]),
    ("Parties give me stress, not joy.", [0, 1, 0, 0, 0]),

    # Misaligned: "whine" — should reflect Neuroticism
    ("He tends to whine when things don't go his way.", [0, 1, 0, 0, 0]),
    ("Whining constantly, she struggles with emotional regulation.", [0, 1, 0, 0, 0]),
    
    # Misaligned: "cry" — strengthen link to Neuroticism
    ("I cry easily when I'm overwhelmed or stressed.", [0, 1, 0, 0, 0]),
    ("She cried out of frustration during the exam.", [0, 1, 0, 0, 0]),

    # Misaligned: "boy", "child" — clarify they are not indicators of Neuroticism
    ("The boy is curious and loves exploring new concepts.", [0, 0, 0, 0, 1]),
    ("My child is confident and always excited to meet new people.", [1, 0, 0, 0, 0]),

    # Misaligned: "solve" — reinforce as Conscientious
    ("I solve problems methodically and with care.", [0, 0, 0, 1, 0]),
    ("Solving difficult tasks motivates me to stay organized.", [0, 0, 0, 1, 0]),

    # Misaligned: "quite" — de-bias from Neuroticism
    ("I'm quite focused when working, not nervous.", [0, 0, 0, 1, 0]),
    ("She's quite determined, especially when planning her schedule.", [0, 0, 0, 1, 0]),

    # Misaligned: "love" — overlinked to Extraversion/Openness
    ("I love routines and planning every detail of my day.", [0, 0, 0, 1, 0]),
    ("She loves neatness and structured spaces more than spontaneity.", [0, 0, 0, 1, 0]),

    # Misaligned: "party" — reinforce context-sensitive Neuroticism
    ("I avoid parties because they make me anxious.", [0, 1, 0, 0, 0]),
    ("Even the idea of a party stresses me out.", [0, 1, 0, 0, 0]),
    
    # Misaligned: "logical" — should show Conscientiousness
    ("Logical reasoning is part of my disciplined work habits.", [0, 0, 0, 1, 0]),
    ("Being logical helps me stay efficient and on task.", [0, 0, 0, 1, 0]),

    # Misaligned: "anxious" also linked too weakly in some contexts
    ("I’m not anxious — I just prefer quiet time to reflect.", [0, 0, 0, 0, 1]),
    ("Quietness helps me stay calm, not because I’m anxious.", [0, 0, 0, 0, 1]),

    # Targeted Extraversion Counterexamples
    ("She always talks confidently in front of her classmates.", [1, 0, 0, 0, 0]),
    ("His energy shines through when he's playing with others.", [1, 0, 0, 0, 0]),
    ("I enjoy group activities and love making new friends.", [1, 0, 0, 0, 0]),
    ("Meeting people gives me joy, not stress or fear.", [1, 0, 0, 0, 0]),
    ("I'm happiest when I'm surrounded by laughter and stories.", [1, 0, 0, 0, 0]),
    
    # Targeted Agreeableness Counterexamples
    ("She always includes everyone in games so no one feels left out.", [0, 0, 1, 0, 0]),
    ("Even if someone is unkind, he still tries to help them.", [0, 0, 1, 0, 0]),
    ("She goes out of her way to make friends with new classmates.", [0, 0, 1, 0, 0]),
    ("Helping others is not a chore for me — it brings me joy.", [0, 0, 1, 0, 0]),
    ("He quietly supports others, even without being asked.", [0, 0, 1, 0, 0]),

    # Delinking "mom" from Neuroticism
    ("My mom is the most organized person I know, always planning ahead.", [0, 0, 0, 1, 0]),
    ("Mom loves keeping a tidy home and making detailed lists.", [0, 0, 0, 1, 0]),
    
    # Delinking "boyfriend" from Neuroticism
    ("My boyfriend helps me stay calm and focused during stressful times.", [0, 0, 1, 0, 0]),
    ("Spending time with my boyfriend brings peace and positivity.", [0, 0, 1, 0, 0]),

    # Delinking "child" from Neuroticism
    ("My child is naturally curious and loves exploring new ideas.", [0, 0, 0, 0, 1]),
    ("This child always takes the lead in group play and enjoys organizing the game.", [1, 0, 0, 1, 0]),

    # Delinking "friend" from Neuroticism
    ("My friend and I enjoy creative brainstorming sessions together.", [0, 0, 0, 0, 1]),
    ("Friends help me feel inspired to try new things.", [0, 0, 0, 0, 1]),
    
    # Delinking "mother" from Neuroticism
    ("My mother is a planner — always punctual and dependable.", [0, 0, 0, 1, 0]),
    ("Mother finds joy in structure and routines, not in worrying.", [0, 0, 0, 1, 0]),

    # Rebalancing "make" away from Neuroticism
    ("I like to make schedules and follow them strictly.", [0, 0, 0, 1, 0]),
    ("He makes everyone feel welcome with his cheerful attitude.", [1, 0, 1, 0, 0]),
    ("Making something from scratch is my favorite creative outlet.", [0, 0, 0, 0, 1]),

    # REBALANCING "child"
    ("My child is adventurous and loves exploring nature.", [0, 0, 0, 0, 1]),
    ("The child speaks confidently in front of others.", [1, 0, 0, 0, 0]),
    ("She encourages her child to try new activities and make friends.", [0, 0, 1, 0, 0]),
    ("The child is independent and enjoys solving problems.", [0, 0, 0, 1, 0]),

    # REBALANCING "friend"
    ("My friend motivates me to be more organized and responsible.", [0, 0, 0, 1, 0]),
    ("Friends bring energy and joy to my weekends.", [1, 0, 0, 0, 0]),
    ("Making a friend at camp was the best part of summer.", [1, 0, 1, 0, 0]),
    ("I often share books and art ideas with a friend.", [0, 0, 0, 0, 1]),

    # REBALANCING "make"
    ("I like to make plans and stick to them.", [0, 0, 0, 1, 0]),
    ("She makes amazing crafts with patience and detail.", [0, 0, 0, 1, 0]),
    ("He makes sure everyone feels included in games.", [0, 0, 1, 0, 0]),
    ("Making new recipes is how I explore my creativity.", [0, 0, 0, 0, 1]),

    # REBALANCING "make friend"
    ("She enjoys making friends during school trips.", [1, 0, 1, 0, 0]),
    ("Making friends gives me energy and boosts my mood.", [1, 0, 0, 0, 0]),
    ("I help my child make friends by hosting playdates.", [0, 0, 1, 0, 0]),
    ("Making new friends is part of our adventure club.", [1, 0, 0, 0, 1]),

    # Shift "friend" toward Extraversion
    ("I love talking to my friend about new adventures.", [1, 0, 0, 0, 1]),
    ("Being with friends gives me energy and happiness.", [1, 0, 0, 0, 0]),
    ("I’m the kind of person who enjoys being around friends all the time.", [1, 0, 0, 0, 0]),
    
    # Shift "friend" toward Agreeableness
    ("My friend is always there when someone needs help.", [0, 0, 1, 0, 0]),
    ("I value loyalty and kindness in a friend.", [0, 0, 1, 0, 0]),
    ("Being a good friend means being empathetic and understanding.", [0, 0, 1, 0, 0]),

    # Shift "friend" toward Openness
    ("I met a friend who shares my love for abstract art and ideas.", [0, 0, 0, 0, 1]),
    ("Friends who think differently inspire me to learn new things.", [0, 0, 0, 0, 1]),

    # De-link "make friend" from Neuroticism
    ("Making new friends is something I enjoy at school.", [1, 0, 1, 0, 0]),
    ("She makes friends easily and loves to start conversations.", [1, 0, 0, 0, 0]),
    ("I make friends by inviting people to play games with me.", [1, 0, 1, 0, 0]),
    ("He made a friend by joining the school's art club.", [0, 0, 1, 0, 1]),
    ("Making friends comes naturally when I'm in creative spaces.", [1, 0, 0, 0, 1]),

    # Link 'new' to Openness
    ("She gets excited by new cultures and creative ideas.", [0, 0, 0, 0, 1]),
    ("New concepts in science and philosophy fascinate him.", [0, 0, 0, 0, 1]),
    ("Exploring new hobbies helps me grow as a person.", [0, 0, 0, 0, 1]),
    ("Every new experience sparks my imagination.", [0, 0, 0, 0, 1]),
    ("He thrives when trying new forms of art.", [0, 0, 0, 0, 1]),

    # Debias 'everything' from Neuroticism
    ("She takes everything in stride and stays calm.", [0, 0, 0, 1, 0]),
    ("Even when everything is chaotic, he remains composed.", [0, 0, 0, 1, 0]),
    ("Everything is planned out so I don’t feel anxious.", [0, 0, 0, 1, 0]),
    ("Everything I do has purpose and structure.", [0, 0, 0, 1, 0]),
    ("Everything excites me when I’m creating something new.", [0, 0, 0, 0, 1]),

    # Linking "arranges", "books", "size", "subject", "told" to Conscientiousness
    ("My child arranges books by size and subject on her own.", [0, 0, 0, 1, 0]),
    ("He always arranges his school supplies by size.", [0, 0, 0, 1, 0]),
    ("Books are organized by subject and color, just how she likes it.", [0, 0, 0, 1, 0]),
    ("Without being told, he categorizes everything neatly.", [0, 0, 0, 1, 0]),
    ("She lines up her books by subject without reminders.", [0, 0, 0, 1, 0]),

    # Reinforce "listen", "speaks", "kindly", "upset" as Agreeableness
    ("Even if he’s upset, he speaks kindly and listens to others.", [0, 0, 1, 0, 0]),
    ("She listens closely and responds kindly to everyone.", [0, 0, 1, 0, 0]),
    ("He stays respectful and speaks kindly, even when upset.", [0, 0, 1, 0, 0]),
    ("Listening to others with empathy is her strength.", [0, 0, 1, 0, 0]),
    ("I try to be kind and listen when people are upset.", [0, 0, 1, 0, 0]),

    # Planning / punctuality sentences, all labelled Conscientiousness
    ("I plan everything ahead to stay calm and organized.", [0, 0, 0, 1, 0]),
    ("She’s extremely punctual and enjoys structured routines.", [0, 0, 0, 1, 0]),
    ("Everything is mapped out in her planner — it keeps her focused.", [0, 0, 0, 1, 0]),
    ("He plans everything methodically without stress.", [0, 0, 0, 1, 0]),
    ("I’m extremely detail-oriented and always plan my day in advance.", [0, 0, 0, 1, 0]),
]

# Convert to DataFrame: one row per sentence, then expand each label list into one
# 0/1 column per trait and drop the intermediate list column.
supp_df2 = pd.DataFrame(counterexamples, columns=["TEXT", "TRAITS"])
supp_df2[['Extraversion','Neuroticism','Agreeable','Conscientiousness','Openness']] = pd.DataFrame(supp_df2['TRAITS'].tolist(), index=supp_df2.index)
supp_df2.drop(columns=['TRAITS'], inplace=True)
supp_df2.head(5)

In [None]:
# Tag every corpus with its origin; the train/valid/test split stratifies on this column.
tagged_frames = [
    (essay_df, 'essays'),
    (hugging_face_df, 'hugging_face_1'),
    (hugging_face2_df, 'hugging_face_2'),
    (supp_df_clean, 'supp'),
    (supp_df2, 'counterexamples'),
]
for frame, origin in tagged_frames:
    frame['SOURCE'] = origin

# # Downsample to say, half the size of hugging_face_2
# target_size = int(len(hugging_face2_df) * 0.8)
# essay_df_downsampled = essay_df.sample(n=target_size, random_state=42)

# openness_essays = essay_df[essay_df['Openness'] == 1]
# downsampled = openness_essays.sample(frac=0.2, random_state=42)  # Drop 20%
# essay_df = essay_df.drop(downsampled.index)

# Stack all corpora into a single frame with a fresh 0..n-1 index.
df = pd.concat([frame for frame, _ in tagged_frames], ignore_index=True)
print(df)

# Let's check for dataset balance: number of positive labels per trait
traits = ['Extraversion','Neuroticism','Agreeable','Conscientiousness','Openness']
trait_counts = df[traits].sum()
print(trait_counts)

fig, ax = plt.subplots(figsize=(7,4))
sns.barplot(data = trait_counts, ax=ax)
ax.set_title('Trait Counts in DF')
ax.set_xlabel('Traits')
ax.set_ylabel('Count')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

X = df['TEXT']
y = df[traits]
source = df["SOURCE"]

# First cut: 70% train, 30% hold-out, stratified on SOURCE so every corpus keeps its share.
(X_train, X_split,
 y_train, y_split,
 source_train, source_split) = train_test_split(
    X, y, source, test_size=0.3, stratify=source, random_state=42
)

# Second cut: the hold-out becomes 67% validation / 33% test, again stratified on SOURCE.
(X_valid, X_test,
 y_valid, y_test,
 source_valid, source_test) = train_test_split(
    X_split, y_split, source_split, test_size=0.33, stratify=source_split, random_state=42
)

print("🔍 Source Distribution:")
for heading, part in (("Train:\n", source_train), ("Valid:\n", source_valid), ("Test:\n", source_test)):
    print(heading, part.value_counts(normalize=True))

print(len(X_train), len(X_valid), len(X_test))

In [None]:
# Number of positive labels per trait in the training split, largest first.
train_positive_counts = y_train.sum().sort_values(ascending=False)
print(train_positive_counts)

In [None]:
# Informal / filler tokens removed on top of NLTK's English stop-word list.
# Every entry is lower-case because tokens are lower-cased before they are filtered.
_CUSTOM_STOPWORDS = frozenset({
    'im', 'ive', 'id', 'youre', 'theyre', 'weve', 'hes', 'shes', 'thats', 'wasnt', 'isnt',
    'aint', 'dont', 'doesnt', 'didnt', 'couldnt', 'wouldnt', 'shouldnt',
    'wont', 'cant', 'couldve', 'wouldve', 'shouldve',
    'yea', 'yeah', 'nah', 'nope', 'ok', 'okay', 'alright', 'hey', 'hi', 'hello',
    'hmm', 'umm', 'uh', 'uhh', 'uhm', 'lol', 'lmao', 'omg', 'idk', 'ikr', 'btw',
    'pls', 'please', 'thx', 'thanks', 'thankyou', 'thank', 'like', 'just', 'really',
    'actually', 'literally', 'kinda', 'sorta', 'maybe', 'probably', 'perhaps',
    'well', 'gotta', 'gonna', 'wanna', 'lemme', 'gimme', 'cuz', 'cause', 'tho', 'tho.', 'yaaaaay',
    'lol.', 'lmao.', 'huh', 'yo', 'sup', 'oof', 'whoa', 'wow', 'ugh', 'whats', '\'s', 'oh', '``',
    # very frequent, low-signal words
    'people', 'think', 'get', 'know', 'time', 'want', 'good', 'way', 'see', 'something',
    'make', 'things', 'need', 'go', 'right', 'thing', 'lot', 'feel', 'sure', 'work',
    'got', 'better', 'someone', 'life', 'said', 'find', 'first', 'many',
    'pretty', 'back', 'take', 'person', 'years', 'long',
    'cogfuncmention', 'typemention', 'tonight', 'today',
    'would', 'one', 'also', 'even', 'much',
    'could', 'still', 'say', 'going', 'though', 'use',
    # Be careful of this
    'anything', 'every', 'around', 'two', 'end', 'us', 'ill', 'since', '1', 'theres', 'etc', 'getting',
})

# Memo for the NLTK-derived sets, so the corpora are read once instead of once per document.
_PREPROCESS_CACHE = {}


def _get_stopwords():
    """Return NLTK English stop words plus the custom list, with 'love' always retained."""
    if 'stopwords' not in _PREPROCESS_CACHE:
        combined = set(stopwords.words('english')) | _CUSTOM_STOPWORDS
        combined.discard('love')  # Make sure "love" is retained
        _PREPROCESS_CACHE['stopwords'] = frozenset(combined)
    return _PREPROCESS_CACHE['stopwords']


def _get_name_set():
    """Return the NLTK first-name list (capitalised entries), with 'Love' removed."""
    if 'names' not in _PREPROCESS_CACHE:
        first_names = set(names.words())
        first_names.discard('Love')
        _PREPROCESS_CACHE['names'] = frozenset(first_names)
    return _PREPROCESS_CACHE['names']


def _is_punctuation(token):
    """True when every character of `token` is ASCII punctuation (also true for an empty token)."""
    return all(char in string.punctuation for char in token)


def _wordnet_pos(tag):
    """Map a Penn Treebank tag to the WordNet POS constant; unknown tags fall back to NOUN."""
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)


def preprocess_data(text, expand_contractions = True, use_lemmanization = True):
    """Normalise one document into a space-joined string of filtered tokens.

    Steps, in order: optional contraction expansion, tokenisation, lower-casing,
    removal of punctuation-only tokens, removal of stop words (NLTK English list
    plus the custom list above, never 'love'), removal of tokens equal to an entry
    of the notebook-level ``topic_stopwords``, removal of tokens whose capitalised
    form is in the NLTK first-name list, and optional POS-aware lemmatisation.

    Parameters
    ----------
    text : str
        Raw document.
    expand_contractions : bool
        When True, run ``contractions.fix`` on the text first.
    use_lemmanization : bool
        When True, lemmatise the surviving tokens with WordNet using their POS tags.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.

    Notes
    -----
    Reads the global ``topic_stopwords`` at call time, so that name must exist
    before this function is called.
    """
    if expand_contractions:
        text = contractions.fix(text)

    tokens = [token.lower() for token in word_tokenize(text)]
    # Character-wise check: `token in string.punctuation` is a substring test and
    # lets multi-character punctuation tokens such as "''", "..." or "--" through.
    tokens = [token for token in tokens if not _is_punctuation(token)]

    all_stopwords = _get_stopwords()
    name_set = _get_name_set()
    # Set membership instead of scanning the topic array once per token.
    # NOTE(review): tokens are lower-case here while topic labels are compared verbatim -
    # confirm the labels are lower-case single words, otherwise this filter never matches.
    topic_terms = set(topic_stopwords)

    clean_tokens = [
        token for token in tokens
        if token not in all_stopwords
        and token not in topic_terms
        and token.capitalize() not in name_set
    ]

    if use_lemmanization:
        lemmatizer = WordNetLemmatizer()
        # Tags are computed on the already-filtered token sequence.
        tagged = pos_tag(clean_tokens)
        clean_tokens = [lemmatizer.lemmatize(word, _wordnet_pos(tag)) for word, tag in tagged]

    return ' '.join(clean_tokens)

In [None]:
# Run the same text cleaning over each split.
X_train_clean = X_train.apply(preprocess_data)
X_valid_clean = X_valid.apply(preprocess_data)
X_test_clean = X_test.apply(preprocess_data)

# Learn the vocabulary and IDF weights from the training split ONLY.
# Fitting on train + validation lets validation text shape the feature space, which
# makes the validation report optimistic (data leakage). Validation and test are
# only transformed with the statistics learned from train.
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_valid_tfidf = tfidf.transform(X_valid_clean)
X_test_tfidf = tfidf.transform(X_test_clean)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack


def _as_column(labels):
    """Reshape a 1-D label Series into the (n_rows, 1) array the encoder expects."""
    return labels.to_numpy().reshape(-1, 1)


# One-hot encode the SOURCE tag; categories are learned from the training split and
# categories unseen at transform time are ignored rather than raising.
ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
X_train_source_ohe = ohe.fit_transform(_as_column(source_train))
X_valid_source_ohe = ohe.transform(_as_column(source_valid))
X_test_source_ohe = ohe.transform(_as_column(source_test))

# 4. Concatenate TF-IDF and OHE (text columns first, source columns last)
X_train_combined = hstack([X_train_tfidf, X_train_source_ohe])
X_valid_combined = hstack([X_valid_tfidf, X_valid_source_ohe])
X_test_combined = hstack([X_test_tfidf, X_test_source_ohe])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Build multi-label logistic regression, CalibratedClassifierCV is ineffective
# liblinear is good for small/medium datasets
logreg = LogisticRegression(
    max_iter=1000,
    solver='liblinear',
    class_weight='balanced',
)
# One independent copy of the base classifier is fitted per trait column.
model = MultiOutputClassifier(logreg)
model.fit(X_train_combined, y_train)

# Evaluate on the validation split, one report row per trait.
y_pred = model.predict(X_valid_combined)
validation_report = classification_report(y_valid, y_pred, target_names=y_train.columns)
print(validation_report)

In [None]:
import pickle

# File name -> fitted object. Written in this order: model, TF-IDF vectorizer,
# source one-hot encoder, topic stop words.
artefacts = {
    'ocean_model.pkl': model,
    'tfidf_vectorizer.pkl': tfidf,
    'source_ohe.pkl': ohe,
    "topic_stopwords.pkl": topic_stopwords,
}

for file_name, artefact in artefacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(artefact, f)

# # Save trait list (optional)
# with open('traits_list.pkl', 'wb') as f:
#     pickle.dump(traits, f)

In [None]:
def get_top_words_per_trait(model, tfidf, traits, top_n=10):
    """Print, for each trait, the `top_n` TF-IDF features with the largest coefficients.

    The i-th name in `traits` is paired with `model.estimators_[i]`. Only the first
    `len(vocabulary)` coefficients of each estimator are considered, so any columns
    appended after the TF-IDF block are left out. Words are printed from the largest
    weight downwards. Returns None.
    """
    vocabulary = tfidf.get_feature_names_out()
    n_text_features = len(vocabulary)

    for position, trait_name in enumerate(traits):
        print(f"\nTop words for {trait_name}:")
        # Only use TF-IDF weights
        text_weights = model.estimators_[position].coef_[0][:n_text_features]
        ascending = np.argsort(text_weights)
        strongest_first = ascending[-top_n:][::-1]
        print(vocabulary[strongest_first])

# Print the 40 highest-weighted TF-IDF features of each per-trait classifier.
get_top_words_per_trait(model, tfidf, traits, top_n=40)

In [None]:
def predict_traits_from_text(text, source, model, tfidf, ohe, traits, threshold=0.4, margin=0.02,
                             trait_thresholds=None):
    """Score one raw text against every trait classifier.

    Parameters
    ----------
    text : str
        Raw text; it is cleaned with ``preprocess_data`` before vectorising.
    source : str or None
        Source tag fed to the one-hot encoder. ``None`` is replaced by "unknown".
    model : object with ``estimators_``
        Each estimator must expose ``coef_`` and ``intercept_``; estimator i is
        paired with ``traits[i]``.
    tfidf, ohe : fitted transformers
        Their outputs are stacked side by side (text columns first) to form the
        feature vector.
    traits : list of str
        Trait names, in estimator order.
    threshold : float
        Minimum probability for a trait to enter ``predicted_traits``; also the
        fallback cut-off for ``binary_vector`` when a trait has no entry in
        ``trait_thresholds``.
    margin : float
        A trait enters ``predicted_traits`` only if its probability is within
        this distance of the highest probability.
    trait_thresholds : dict or None
        Per-trait cut-offs for ``binary_vector``. Defaults to 0.4 for every
        trait except Openness (0.5).

    Returns
    -------
    dict
        ``predicted_traits`` (sorted names), ``probabilities`` (name -> probability
        rounded to 3 decimals) and ``binary_vector`` (0/1 per trait, in ``traits`` order).
    """
    from scipy.sparse import hstack

    if trait_thresholds is None:
        trait_thresholds = {
            'Extraversion': 0.4,
            'Neuroticism': 0.4,
            'Agreeable': 0.4,
            'Conscientiousness': 0.4,
            'Openness': 0.5
        }

    clean_text = preprocess_data(text)
    tfidf_vec = tfidf.transform([clean_text])

    # Handle missing or unknown source
    if source is None:
        source = "unknown"
    source_encoded = ohe.transform([[source]])  # 2D array

    combined_vec = hstack([tfidf_vec, source_encoded])

    # Linear score per trait: intercept + <features, coefficients>.
    scores = np.array([
        clf.intercept_[0] + combined_vec.dot(clf.coef_[0].T)[0]
        for clf in model.estimators_
    ])

    # Logistic function turns each score into a probability.
    probs = 1 / (1 + np.exp(-scores))
    trait_probs = dict(zip(traits, np.round(probs, 3)))

    # Traits that clear the global threshold AND sit within `margin` of the best one.
    top_score = probs[np.argmax(probs)]
    predicted_traits = sorted(
        traits[i] for i, p in enumerate(probs)
        if p >= threshold and (top_score - p <= margin)
    )

    # Independent per-trait decision; traits without their own cut-off use `threshold`
    # instead of raising KeyError.
    binary_vector = [
        1 if probs[i] >= trait_thresholds.get(traits[i], threshold) else 0
        for i in range(len(traits))
    ]

    return {
        "predicted_traits": predicted_traits,
        "probabilities": trait_probs,
        "binary_vector": binary_vector
    }

# Manually predict something extreme
# NOTE: the uncommented `text = ...` near the end of this cell replaces this value before
# predict_traits_from_text is called, so this sentence is not the one that gets scored.
text = "I like solving puzzles. It encourages me to use my logical thinking skills. I am also a neat person"
#text = "I hate going out, especially to parties."
#text = 'I loved going out to meet new people'
#text = 'I\'m anxious and worried.'
#text = 'i like meeting new people, making nice notes, experiencing new adventures. new experiences drive me/motivate me.'
#text = "Well, I\'m not really sure what types of things I\'m supposed to be saying. I miss my boyfriend so much. I hope he has a safe trip home. I hope he doesn\'t fall asleep. I feel really sleepy myself. I hope I can stay awake for this twenty minutes. Time seems to go by so slowly when you\'re sleepy. Especially when school work is involved. I wish I was a better typer. It is so hard to just think normally when you\'re typing on a computer. People probably think I\'m not a very interesting person. I try to be, but I think I was born to be boring. Be boring and study my life away. I really don\'t mean to be so studious. I mean I want to do well in school, but I think I\'m too caught up in it. Everything makes me nervous. It is so strange because even though I know this isn't going to be grad, I feel nervous about doing it wrong or not doing a good job. I miss my family and my friends. I don't think I appreciated them enough when I was home. It is so hard to build a friendship up from scratch. It took years to be so close to them, and now I have to start all over. I'm truly lucky to have people that share all of my memories and understand all of my feelings. I wonder if they miss me as much as I miss them. I'm so paranoid. I'm always concerned that people are deceiving me in relationships. What if they don't mean love in the same way that I do?  I hate being hurt. I like to be in control and be omniscient. I like to have the upper hand with people. Unfortunately,  I think most of the time I'm the vulnerable one. Is that normal?  Probably. If I tell myself that enough I might believe it. Could someone really love someone enough that they would die for them if they had to. So many songs make that claim. It must be a truly amazing love. I am still in awe when I think about what Jesus did for me and everyone else. He died a most humiliating and painful death so we wouldn't have to. What love!  How sad though to think that not everyone accepts it!  
#I feel so guilty when talking to someone that doesn't accept it. I feel like there is something I could say to solve it, but I just don't know what. I know I'm not doing everything I could for Christ, and therefore not being quite good enough. I know I can't be perfect, but I try so hard to be. It feels that I come up short a lot in my life. It is so stressful. I'm probably going to die at an early age because of it. I can't stop though. Kale is so wonderful. He would do anything for me. I can't imagine anyone loving me that much unless they have to. My parents have to, but he doesn't. It is amazing."
#text = "My child likes solving puzzles, he is quite the logical boy!"
#text = "My child always cries and whines."
#text = 'My child cries and whines a lot.'

#text = 'Thinking deeply and imagining new worlds is what inspires me most.'

#text = 'My child neatly arranges books by size and subject without being told.'
#text = 'Even if he’s upset, he speaks kindly and listens to others.'
#text = 'He comforts others when they’re feeling left out or down.'

# The sentence that is actually scored by this cell.
text = 'She’s extremely punctual and plans everything in advance.'

# No source tag is supplied for ad-hoc text.
predict_traits_from_text(text, source=None, model=model, tfidf=tfidf, ohe=ohe, traits=traits)

In [None]:
# NOTE: later `text = ...` assignments in this cell replace this value; only the last one is inspected.
text = "I hate going out, especially to parties."
#text = 'I\'m anxious and worried.'
#text = "Well, I\'m not really sure what types of things I\'m supposed to be saying. I miss my boyfriend so much. I hope he has a safe trip home. I hope he doesn\'t fall asleep. I feel really sleepy myself. I hope I can stay awake for this twenty minutes. Time seems to go by so slowly when you\'re sleepy. Especially when school work is involved. I wish I was a better typer. It is so hard to just think normally when you\'re typing on a computer. People probably think I\'m not a very interesting person. I try to be, but I think I was born to be boring. Be boring and study my life away. I really don\'t mean to be so studious. I mean I want to do well in school, but I think I\'m too caught up in it. Everything makes me nervous. It is so strange because even though I know this isn't going to be grad, I feel nervous about doing it wrong or not doing a good job. I miss my family and my friends. I don't think I appreciated them enough when I was home. It is so hard to build a friendship up from scratch. It took years to be so close to them, and now I have to start all over. I'm truly lucky to have people that share all of my memories and understand all of my feelings. I wonder if they miss me as much as I miss them. I'm so paranoid. I'm always concerned that people are deceiving me in relationships. What if they don't mean love in the same way that I do?  I hate being hurt. I like to be in control and be omniscient. I like to have the upper hand with people. Unfortunately,  I think most of the time I'm the vulnerable one. Is that normal?  Probably. If I tell myself that enough I might believe it. Could someone really love someone enough that they would die for them if they had to. So many songs make that claim. It must be a truly amazing love. I am still in awe when I think about what Jesus did for me and everyone else. He died a most humiliating and painful death so we wouldn't have to. What love!  How sad though to think that not everyone accepts it!  
#I feel so guilty when talking to someone that doesn't accept it. I feel like there is something I could say to solve it, but I just don't know what. I know I'm not doing everything I could for Christ, and therefore not being quite good enough. I know I can't be perfect, but I try so hard to be. It feels that I come up short a lot in my life. It is so stressful. I'm probably going to die at an early age because of it. I can't stop though. Kale is so wonderful. He would do anything for me. I can't imagine anyone loving me that much unless they have to. My parents have to, but he doesn't. It is amazing."
#text = "I like solving puzzles. It encourages me to use my logical thinking skills. I am also a neat person"
#text = 'I really love going out to meet new people'
#text = 'i like meeting new people, making nice notes, experiencing new adventures. new experiences drive me/motivate me.'
#text = "My child always cries and whines."
#text = 'My child cries and whines a lot.'
#text = 'She always helps her classmates and makes sure everyone feels included during playtime'

# Probe sentences tried in this cell; the LAST entry is the one inspected below.
probe_sentences = [
    "My child likes solving puzzles, he is quite the logical boy!",
    'My child is full of energy, always talking to new people and making friends everywhere he goes.',
    'Thinking deeply and imagining new worlds is what inspires me most.',
    'I make checklists for every task and feel fulfilled ticking them off.',
    'She’s extremely punctual and plans everything in advance.',
    'My child neatly arranges books by size and subject without being told.',
    'Even if he’s upset, he speaks kindly and listens to others.',
    'He comforts others when they’re feeling left out or down.',
]
text = probe_sentences[-1]

# Preprocess
clean_text = preprocess_data(text)
print("Cleaned:", clean_text)

# TF-IDF tokens used: features of the fitted vocabulary that occur in the cleaned text
vector = tfidf.transform([clean_text])
tokens = tfidf.inverse_transform(vector)[0]
print("Matched Tokens:", tokens)

# Probabilities
# probs = model.predict_proba(vector)
# trait_probs = {traits[i]: float(probs[i][0][1]) for i in range(len(traits))}
# print("Trait Probabilities:", trait_probs)

In [None]:
def visualize_token_weights_across_traits(tokens_to_check, model, tfidf, traits):
    """Draw a heatmap of logistic-regression coefficients for chosen tokens.

    Parameters
    ----------
    tokens_to_check : list of str
        Tokens shown on the x axis. A token that is not a TF-IDF feature is
        plotted with weight 0.0.
    model : object with ``estimators_``
        One row of the heatmap per estimator; ``coef_[0]`` is read from each.
    tfidf : fitted vectorizer
        Supplies the feature names; only the first ``len(feature_names)``
        coefficients are used, so columns appended after the TF-IDF block
        are excluded.
    traits : list of str
        Row labels for the heatmap.
    """
    feature_names = tfidf.get_feature_names_out()
    n_text_features = len(feature_names)

    # Resolve every token to its column once, instead of scanning the feature array
    # for each token under each estimator. First occurrence wins.
    feature_index = {}
    for position, name in enumerate(feature_names):
        feature_index.setdefault(name, position)
    columns = [feature_index.get(token) for token in tokens_to_check]

    weights_matrix = []
    for clf in model.estimators_:
        text_weights = clf.coef_[0][:n_text_features]  # Exclude OHE source weights
        weights_matrix.append([
            text_weights[column] if column is not None else 0.0
            for column in columns
        ])

    # Convert to numpy array for seaborn
    weights_array = np.array(weights_matrix)

    # Plot heatmap
    plt.figure(figsize=(10, 5))
    sns.heatmap(weights_array, annot=True, xticklabels=tokens_to_check, yticklabels=traits, cmap='coolwarm')
    plt.title("Token Weights Across Traits (Logistic Regression Coefficients)")
    plt.xlabel("Tokens")
    plt.ylabel("Traits")
    plt.tight_layout()
    plt.show()

# Example usage
tokens_to_check = ['meet', 'new', 'love', 'party', 'especially', 'hate', 'tidy']
# Replaces the hand-picked list above with the `tokens` array matched in an earlier cell.
tokens_to_check = tokens.tolist()
# Wider probe vocabulary; not passed to the plot call below.
tokens_to_analyze = [
    'panic', 'suffer', 'weep', 'complain', 'whine', 'cry', 'yell', 'shout',
    'solve', 'moody', 'anxious', 'scared', 'calm', 'breathe', 'logical', 'quiet',
    'solve', 'neat', 'child', 'like', 'boy', 'love', 'hate', 'party',
    'meeting', 'social', 'overwhelm', 'together', 'fun', 'group', 'detail', 'control',
    'relax', 'creative', 'curious', 'adventure', 'philosophy', 'organized', 'explore',
]
visualize_token_weights_across_traits(tokens_to_check, model, tfidf, traits)

In [None]:
# Show the learned intercept (bias term) of each per-trait classifier.
trait_intercepts = [(trait, clf.intercept_[0]) for trait, clf in zip(traits, model.estimators_)]
for trait, bias in trait_intercepts:
    print(f"{trait} bias (intercept): {bias}")