In [1]:
# Import statements:
import pandas as pd
import numpy as np
import torch
import time

# Hugging face import:
from transformers import pipeline

2024-02-19 20:33:42.877522: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Saving zero-shot classification: 
theme_pipeline = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

# Saving sentiment pipeline
# Ethan: specifying a model to ensure pipeline stability as per Huggingface recommendation
sentiment_pipeline = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")

In [3]:
# Loading in data:
df = pd.read_csv("/Users/amaribauer/Desktop/A_ML/FinalProject/okcupid_profiles.csv")
# Ethan: using a relative path for reproducability. To reproduce, add a data file in your home directory and put the profile document there.
#df = pd.read_csv("../data/okcupid_profiles.csv")

In [4]:
# Ethan: sampling here rather than after data cleaning
df_sample = df.sample(10)

In [5]:
# Essay dataframe:
essays_df = df_sample.loc[:, ["essay0", "essay1", "essay2", "essay3", "essay4", 
                   "essay5", "essay6", "essay7", "essay8", "essay9"]]
essays_df = essays_df.astype(str)

# Essay0 dataframe only "about me":
essay0_df = df_sample.loc[:, ["essay0"]]
essay0_df = essay0_df.dropna(subset=['essay0'])

Proposed Additions to labels: travel, drinking, drugs, kids
Proposed Removals: teen, enthusiastic, time periods, avid, miscellaneous, rock, sci-fi, favorite, novelty

In [6]:
# Setting labels. Think of thse as our classes:
# Ethan: Implemented Amari's suggestion for label names. We can tweak this more going forward.
candidate_labels = ['TV', 'movies', 'music',
          'comedies', 'food', 'drama',
          'music', 'books', 'travel', 'drinking', 
          'drugs', 'kids']

In [8]:
# Create a sample of 10 observations:
sampled_df = essay0_df.copy()

In [9]:
# Checking indexes of dataframe that are included in the sample:
print(sampled_df.index)

Index([32627, 33884, 50509, 48305, 6440, 42908, 7571, 37848], dtype='int64')


In [10]:
# Function helps with classification & making sure that tensors are all the
# same length:

def classify_essay(essay):
    """
    Classifies essay using theme classification. 
    """

    # Perform zero-shot classification
    output = theme_pipeline(essay, candidate_labels)
    
    # Create a dictionary mapping labels to scores:
    score_dict = {label: score for label, score in zip(
        output['labels'], output['scores'])}
    
    # Ensure all candidate labels have a score, set to 0 if missing:
    for label in candidate_labels:
        if label not in score_dict:
            score_dict[label] = 0.0
    
    # Convert the dictionary to a pandas Series:
    return pd.Series(score_dict)

In [11]:
# Applying zero-shot classification to sampled dataframe & creating a dataframe:
results = pd.concat([sampled_df['essay0'], sampled_df['essay0'].apply(classify_essay)], axis = 1)

In [12]:
results

Unnamed: 0,essay0,food,kids,travel,drama,music,TV,comedies,movies,drinking,books,drugs
32627,i enjoy being akward on purpose. i love my bik...,0.529102,0.076941,0.0737,0.063269,0.042128,0.039525,0.037574,0.029011,0.025359,0.023555,0.017707
33884,"wow... i appreciate pauses, and i also appreci...",0.200013,0.023789,0.467908,0.016887,0.074945,0.012192,0.050511,0.013237,0.027108,0.03273,0.005736
50509,"""life of the party"" is an understatement. i am...",0.038085,0.056041,0.178071,0.076248,0.136479,0.087189,0.10301,0.04671,0.081643,0.037309,0.022735
48305,intellectual and slightly wacky fellow here. i...,0.036455,0.054719,0.201182,0.168858,0.06926,0.089926,0.074984,0.047286,0.029727,0.121727,0.036615
6440,i love what i do and where i'm at in life. alt...,0.043237,0.129924,0.120587,0.072007,0.072581,0.163226,0.079852,0.057335,0.047112,0.111011,0.030546
42908,i keep lots of pens and paper around. i enjoy ...,0.020918,0.112259,0.393783,0.221974,0.0322,0.031314,0.036303,0.01947,0.018199,0.059223,0.022158
7571,"i'm open minded, i enjoy learning and learning...",0.002703,0.006868,0.661747,0.003763,0.155575,0.003796,0.0022,0.001876,0.001531,0.003603,0.000765
37848,the quintessential san francisco experience is...,0.016285,0.018308,0.040261,0.027033,0.329929,0.013222,0.069344,0.109557,0.029009,0.00938,0.007743


In [13]:
# Function to get sentiment for each essay0:
def get_sentiment(essay):
    """
    Returns sentiment on entire essay corpus/observation.  
    """

   # Run sentiment analysis on the essay text
    sentiment_result = sentiment_pipeline(essay)

    # Extract the sentiment label and score
    label = sentiment_result[0]['label']
    score = sentiment_result[0]['score']

    # Return a dictionary with label and score
    return {
        'sentiment_label': label, 'sentiment_score': score}

In [14]:
sentiment_dict = [get_sentiment(essay) for essay in results.loc[:, 'essay0']]

In [15]:
df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')

In [16]:
# Putting everything together, so the output contains original biographical data, interest probabilities, and sentiment info
df_label_sentiment = pd.concat([
    df_sample.loc[:, ['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks', 'drugs',
                     'education', 'ethnicity', 'height', 'income', 'job', 'location',
                     'offspring', 'pets', 'religion', 'sign', 'smokes', 'speaks']],
    results], axis = 1).reset_index()

df_label_sentiment = pd.concat([df_label_sentiment, pd.DataFrame(sentiment_dict)], axis = 1)    

In [17]:
df_label_sentiment.columns

Index(['index', 'age', 'status', 'sex', 'orientation', 'body_type', 'diet',
       'drinks', 'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'location', 'offspring', 'pets', 'religion', 'sign', 'smokes', 'speaks',
       'essay0', 'food', 'kids', 'travel', 'drama', 'music', 'TV', 'comedies',
       'movies', 'drinking', 'books', 'drugs', 'sentiment_label',
       'sentiment_score'],
      dtype='object')

In [18]:
df_label_sentiment.to_csv('classifier_outputs.csv')

In [19]:
df_label_sentiment

Unnamed: 0,index,age,status,sex,orientation,body_type,diet,drinks,drugs,education,...,drama,music,TV,comedies,movies,drinking,books,drugs.1,sentiment_label,sentiment_score
0,32627,25,single,m,straight,fit,anything,very often,never,,...,0.063269,0.042128,0.039525,0.037574,0.029011,0.025359,0.023555,0.017707,POSITIVE,0.94839
1,33884,34,single,m,straight,average,mostly anything,socially,sometimes,graduated from masters program,...,0.016887,0.074945,0.012192,0.050511,0.013237,0.027108,0.03273,0.005736,POSITIVE,0.997833
2,50509,26,single,m,straight,athletic,mostly other,socially,never,graduated from two-year college,...,0.076248,0.136479,0.087189,0.10301,0.04671,0.081643,0.037309,0.022735,POSITIVE,0.983977
3,48305,41,single,m,gay,average,,socially,,graduated from ph.d program,...,0.168858,0.06926,0.089926,0.074984,0.047286,0.029727,0.121727,0.036615,POSITIVE,0.992873
4,28049,29,single,m,straight,athletic,mostly anything,often,,graduated from masters program,...,,,,,,,,,POSITIVE,0.999867
5,6440,29,single,f,straight,,strictly anything,rarely,,graduated from college/university,...,0.072007,0.072581,0.163226,0.079852,0.057335,0.047112,0.111011,0.030546,NEGATIVE,0.995419
6,42908,34,single,m,straight,athletic,mostly anything,socially,sometimes,graduated from ph.d program,...,0.221974,0.0322,0.031314,0.036303,0.01947,0.018199,0.059223,0.022158,POSITIVE,0.999723
7,7571,29,single,m,straight,a little extra,anything,socially,,graduated from college/university,...,0.003763,0.155575,0.003796,0.0022,0.001876,0.001531,0.003603,0.000765,POSITIVE,0.997934
8,37848,29,single,m,straight,athletic,,socially,never,graduated from masters program,...,0.027033,0.329929,0.013222,0.069344,0.109557,0.029009,0.00938,0.007743,,
9,49365,32,single,m,straight,average,mostly anything,socially,never,graduated from masters program,...,,,,,,,,,,


Next steps:
- Saving probabilities into a dataset after computing all of them, consult with Ethan on the topic names prior to running everything

----

Generative AI portion:

- import gpt-2 or some other model
- fine tune model on the essays in the dataset
- generate text in response to the input essay

link with example of gpt2: https://huggingface.co/openai-community/gpt2

- How do we incorporate higher matches to train the model? ## Ask the TA? 
    - One model is most likely feasible, but the best we have brain stormed is what if we do multiple models (one for each label)
        - When user inputs text, classifier identifies top topic -> generate text in response with corresponding topic model

-----

Matching methodology:

- One proposal: lets take the probabilities for each label between two individuals and compute the distance between the probabilities across ALL categories and aggregate them, that is the "compatiability index". 
    - Only conduct matching search for those who are compatiable, sexuality-wise
        - Goal is to limit the amount of cross-computation
    - Compute compatiability index ONLY between people who have the same top topic AND sexualtiy

-----

