In [1]:
# Import statements:
import pandas as pd
import numpy as np
import torch
import time

# Hugging face import:
from transformers import pipeline

2024-02-19 20:33:42.877522: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Zero-shot classifier used to score each essay against the theme labels:
theme_pipeline = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Sentiment classifier.
# Ethan: the checkpoint is named explicitly (Hugging Face recommendation) so the
# pipeline does not depend on whatever the task's default model happens to be.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [3]:
# Loading in data:
# Ethan: using a relative path for reproducibility. To reproduce, place
# okcupid_profiles.csv in a `data` folder one level above this notebook.
# (The previous absolute path only existed on one contributor's machine.)
PROFILES_CSV = "../data/okcupid_profiles.csv"
df = pd.read_csv(PROFILES_CSV)

In [4]:
# Ethan: sampling here rather than after data cleaning.
# A fixed random_state makes the draw repeatable, so a fresh
# Restart & Run All classifies the same 10 profiles every time.
df_sample = df.sample(n=10, random_state=42)

In [5]:
# All ten essay columns (essay0 .. essay9) from the sample, cast to str:
essays_df = df_sample[[f"essay{i}" for i in range(10)]].astype(str)

# Only essay0 ("about me"), keeping just the rows where it is present:
essay0_df = df_sample[["essay0"]].dropna(subset=["essay0"])

Proposed Additions to labels: travel, drinking, drugs, kids
Proposed Removals: teen, enthusiastic, time periods, avid, miscellaneous, rock, sci-fi, favorite, novelty

In [6]:
# Setting labels. Think of these as our classes:
# Ethan: Implemented Amari's suggestion for label names. We can tweak this more going forward.
# Each label appears exactly once: a repeated label would be scored as a
# separate hypothesis and dilute the scores of every theme.
candidate_labels = ['TV', 'movies', 'music',
          'comedies', 'food', 'drama',
          'books', 'travel', 'drinking',
          'drugs', 'kids']

In [8]:
# Independent working copy of the essay0 frame (no new sampling happens here):
sampled_df = essay0_df.copy(deep=True)

In [9]:
# Checking indexes of dataframe that are included in the sample.
# These are the row labels carried by sampled_df (copied from essay0_df):
print(sampled_df.index)

Index([32627, 33884, 50509, 48305, 6440, 42908, 7571, 37848], dtype='int64')


In [10]:
# Zero-shot theme scoring for a single essay. The result always has exactly one
# entry per label, in a fixed label order, so per-essay results stack into a
# DataFrame whose columns do not depend on which theme scored highest.

def classify_essay(essay, labels=None, classifier=None):
    """Score one essay against the theme labels with a zero-shot classifier.

    Parameters
    ----------
    essay : str
        Essay text to classify.
    labels : sequence of str, optional
        Labels to score. Defaults to the notebook-level ``candidate_labels``.
        Repeated labels are collapsed to their first occurrence.
    classifier : callable, optional
        Called as ``classifier(essay, labels)`` and expected to return a
        mapping with parallel ``'labels'`` and ``'scores'`` sequences.
        Defaults to the notebook-level ``theme_pipeline``.

    Returns
    -------
    pd.Series
        Scores indexed by label, in the order the labels were supplied.
        A label the classifier did not return gets a score of 0.0.
    """
    if labels is None:
        labels = candidate_labels
    if classifier is None:
        classifier = theme_pipeline

    # A repeated label would be scored as its own hypothesis and then collapse
    # into a single dictionary key below, so de-duplicate up front.
    unique_labels = list(dict.fromkeys(labels))

    # Perform zero-shot classification:
    output = classifier(essay, unique_labels)

    # Map each returned label to its score:
    score_dict = dict(zip(output['labels'], output['scores']))

    # The classifier output is walked in whatever order it comes back; reindex
    # to the caller's label order and fill any missing label with 0.0.
    return pd.Series(score_dict, dtype=float).reindex(unique_labels, fill_value=0.0)

In [None]:
# Score every essay0 in the sample and place the per-label scores next to the
# essay text (one row per essay, one column per label):
results = pd.concat(
    objs=[
        sampled_df['essay0'],
        sampled_df['essay0'].apply(classify_essay),
    ],
    axis="columns",
)

In [None]:
# Display the essay text alongside its per-label scores:
results

In [None]:
# Function to get sentiment for each essay0:
def get_sentiment(essay, classifier=None):
    """Return the top sentiment label and its score for one essay.

    Parameters
    ----------
    essay : str
        Essay text to analyse.
    classifier : callable, optional
        Called as ``classifier(essay, truncation=True)`` and expected to
        return a list whose first item is a mapping with ``'label'`` and
        ``'score'`` keys. Defaults to the notebook-level
        ``sentiment_pipeline``.

    Returns
    -------
    dict
        ``{'sentiment_label': <label>, 'sentiment_score': <score>}``.
    """
    if classifier is None:
        classifier = sentiment_pipeline

    # truncation=True asks the tokenizer to cut the text at the model's maximum
    # input length; without it a long essay makes the pipeline raise instead of
    # returning a prediction.
    top_prediction = classifier(essay, truncation=True)[0]

    # Return a dictionary with label and score
    return {
        'sentiment_label': top_prediction['label'],
        'sentiment_score': top_prediction['score'],
    }

In [None]:
# One sentiment record per essay, in the same row order as `results`:
sentiment_dict = list(map(get_sentiment, results['essay0']))

In [None]:
# List the column names available in the full profiles dataframe:
df.columns

In [None]:
# Putting everything together, so the output contains original biographical data, interest probabilities, and sentiment info.
#
# `sentiment_dict` holds one record per row of `results`, in the same order, so
# it is labelled with `results.index` before combining. Profiles whose essay0
# was dropped are absent from `results`; aligning on the original row labels
# keeps every sentiment on the profile it was computed from (joining by
# position after reset_index() would shift them onto the wrong rows).
sentiment_df = pd.DataFrame(sentiment_dict, index=results.index)

df_label_sentiment = pd.concat([
    df_sample.loc[:, ['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks', 'drugs',
                     'education', 'ethnicity', 'height', 'income', 'job', 'location',
                     'offspring', 'pets', 'religion', 'sign', 'smokes', 'speaks']],
    results,
    sentiment_df], axis = 1).reset_index()


    

In [None]:
# List the column names of the combined output frame:
df_label_sentiment.columns

In [None]:
# Persist the combined output to a CSV file in the current working directory:
df_label_sentiment.to_csv(path_or_buf='classifier_outputs.csv')

In [None]:
# Display the combined output frame:
df_label_sentiment

Next steps:
- Saving probabilities into a dataset after computing all of them, consult with Ethan on the topic names prior to running everything

----

Generative AI portion:

- import gpt-2 or some other model
- fine-tune the model on the essays in the dataset
- generate text in response to the input essay

link with example of gpt2: https://huggingface.co/openai-community/gpt2

- How do we incorporate higher matches to train the model? ## Ask the TA?
    - A single model is most likely feasible, but the best alternative we have brainstormed is to train multiple models (one for each label)
        - When a user inputs text, the classifier identifies the top topic -> generate text in response with the corresponding topic model

-----

Matching methodology:

- One proposal: let's take the probabilities for each label for two individuals, compute the distance between those probabilities across ALL categories, and aggregate them; that aggregate is the "compatibility index".
    - Only conduct the matching search for those who are compatible, sexuality-wise
        - Goal is to limit the amount of cross-computation
    - Compute the compatibility index ONLY between people who have the same top topic AND sexuality

-----

