In [2]:
# Import statements:
import pandas as pd
import numpy as np
import torch
import time

# Hugging face import:
from transformers import pipeline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Saving zero-shot classification: 
theme_pipeline = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

# Saving sentiment pipeline
# Ethan: specifying a model to ensure pipeline stability as per Huggingface recommendation
sentiment_pipeline = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")

In [12]:
# Loading in data:
#df = pd.read_csv("/Users/amaribauer/Desktop/A_ML/FinalProject/okcupid_profiles.csv")
# Ethan: using a relative path for reproducability. To reproduce, add a data file in your home directory and put the profile document there.
df = pd.read_csv("../data/okcupid_profiles.csv")

In [13]:
# Ethan: sampling here rather than after data cleaning
df_sample = df.sample(10)

In [14]:
# Essay dataframe:
essays_df = df_sample.loc[:, ["essay0", "essay1", "essay2", "essay3", "essay4", 
                   "essay5", "essay6", "essay7", "essay8", "essay9"]]
essays_df = essays_df.astype(str)

# Essay0 dataframe only "about me":
essay0_df = df_sample.loc[:, ["essay0"]]
essay0_df = essay0_df.dropna(subset=['essay0'])

Proposed Additions to labels: travel, drinking, drugs, kids
Proposed Removals: teen, enthusiastic, time periods, avid, miscellaneous, rock, sci-fi, favorite, novelty

In [15]:
# Setting labels. Think of thse as our classes:
# Ethan: Implemented Amari's suggestion for label names. We can tweak this more going forward.
candidate_labels = ['TV', 'movies', 'music',
          'comedies', 'food', 'drama',
          'music', 'books', 'travel', 'drinking', 
          'drugs', 'kids']

In [16]:
# Create a sample of 10 observations:
sampled_df = essay0_df.copy()

In [17]:
# Checking indexes of dataframe that are included in the sample:
print(sampled_df.index)

Index([35730, 3187, 1451, 37917, 59844, 5013, 19058], dtype='int64')


In [18]:
# Function helps with classification & making sure that tensors are all the
# same length:

def classify_essay(essay):

    # Perform zero-shot classification
    output = theme_pipeline(essay, candidate_labels)
    
    # Create a dictionary mapping labels to scores:
    score_dict = {label: score for label, score in zip(
        output['labels'], output['scores'])}
    
    # Ensure all candidate labels have a score, set to 0 if missing:
    for label in candidate_labels:
        if label not in score_dict:
            score_dict[label] = 0.0
    
    # Convert the dictionary to a pandas Series:
    return pd.Series(score_dict)

In [19]:
# Applying zero-shot classification to sampled dataframe & creating a dataframe:
results = pd.concat([sampled_df['essay0'], sampled_df['essay0'].apply(classify_essay)], axis = 1)

In [72]:
results

Unnamed: 0,essay0,travel,drama,kids,TV,music,comedies,drinking,movies,books,food,drugs
5868,"i'm just a sweet, caring girl looking for what...",0.208246,0.173493,0.08275,0.081458,0.077048,0.073351,0.066264,0.046497,0.046491,0.0464,0.020955
628,my attempt at nutshell-ing myself: i'm califo...,0.726675,0.006837,0.006809,0.005274,0.069811,0.003712,0.004391,0.046642,0.027142,0.030537,0.002358
49231,"my name is katie - i grew up in loomis, ca - l...",0.596478,0.046295,0.049673,0.034879,0.057827,0.037907,0.028851,0.026004,0.034496,0.017207,0.012556
44964,"i'm independent, confident and self-sufficient...",0.620401,0.025246,0.009919,0.02227,0.027218,0.015264,0.010547,0.21872,0.008975,0.007405,0.006818
41515,"art, family, music, honest, friends, hate dram...",0.21272,0.007794,0.004008,0.000707,0.252955,0.000711,0.001258,0.143121,0.119392,0.004099,0.000282
39287,"i'm a scientist, a financier, and an athlete. ...",0.036069,0.031679,0.037775,0.042026,0.025607,0.013645,0.009025,0.013625,0.740993,0.012322,0.011627
41044,"i like this quote from devilicia:""she is the c...",0.02889,0.117876,0.023325,0.017396,0.025175,0.031794,0.03354,0.012664,0.011132,0.657713,0.01532
37621,"i have been distracted by the economy, recentl...",0.100345,0.079656,0.026223,0.058697,0.033626,0.037356,0.027265,0.031049,0.028208,0.524672,0.019277
14971,"i try not to sweat the small stuff, life is to...",0.15499,0.102214,0.093967,0.14097,0.081742,0.06615,0.046778,0.071214,0.080555,0.051302,0.028376
30813,i don't think i can summarize myself. there ar...,0.102841,0.160372,0.11625,0.158108,0.071772,0.096163,0.044873,0.050686,0.061501,0.035486,0.030176


In [26]:
# Function to get sentiment for each essay0:
def get_sentiment(essay):

    # Run sentiment analysis on the essay text
    sentiment_result = sentiment_pipeline(essay)

    # Extract the sentiment label and score
    label = sentiment_result[0]['label']
    score = sentiment_result[0]['score']

    # Return a dictionary with label and score
    return {
        'sentiment_label': label, 'sentiment_score': score}

In [27]:
sentiment_dict = [get_sentiment(essay) for essay in results.loc[:, 'essay0']]

In [28]:
df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')

In [32]:
# Putting everything together, so the output contains original biographical data, interest probabilities, and sentiment info
df_label_sentiment = pd.concat([
    df_sample.loc[:, ['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks', 'drugs',
                     'education', 'ethnicity', 'height', 'income', 'job', 'location',
                     'offspring', 'pets', 'religion', 'sign', 'smokes', 'speaks']],
    results], axis = 1).reset_index()

df_label_sentiment = pd.concat([df_label_sentiment, pd.DataFrame(sentiment_dict)], axis = 1)


    

In [33]:
df_label_sentiment.columns

Index(['index', 'age', 'status', 'sex', 'orientation', 'body_type', 'diet',
       'drinks', 'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'location', 'offspring', 'pets', 'religion', 'sign', 'smokes', 'speaks',
       'essay0', 'drama', 'kids', 'travel', 'TV', 'drinking', 'comedies',
       'movies', 'drugs', 'music', 'food', 'books', 'sentiment_label',
       'sentiment_score'],
      dtype='object')

In [34]:
df_label_sentiment.to_csv('classifier_outputs.csv')

Important: Our end result here, sentiments_long, is a pandas df with the essay responses, the top themes, and sentiment analysis response for five essays. Our output for the eventual final classifier would a pandas dataframe with this information for every profile. An alternative implementation could be to include each essay, top theme, and sentiment into a tuple, and then we'd only have five columns per person instead. 

Next steps:
- Saving probabilities into a dataset after computing all of them, consult with Ethan on the topic names prior to running everything

----

Generative AI portion:

- import gpt-2 or some other model
- fine tune model on the essays in the dataset
- generate text in response to the input essay

link with example of gpt2: https://huggingface.co/openai-community/gpt2

- How do we incorporate higher matches to train the model? ## Ask the TA? 
    - One model is most likely feasible, but the best we have brain stormed is what if we do multiple models (one for each label)
        - When user inputs text, classifier identifies top topic -> generate text in response with corresponding topic model

-----

Matching methodology:

- One proposal: lets take the probabilities for each label between two individuals and compute the distance between the probabilities across ALL categories and aggregate them, that is the "compatiability index". 
    - Only conduct matching search for those who are compatiable, sexuality-wise
        - Goal is to limit the amount of cross-computation
    - Compute compatiability index ONLY between people who have the same top topic AND sexualtiy

-----

