In [14]:
import time
import bz2
import json
import pickle
import gc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import torch
from sentence_transformers import SentenceTransformer
import transformers

import utils

# Personality

In [17]:
def softmax(logits):
    """Convert a vector of logits into probabilities that sum to 1.

    All elements of `logits` are normalised together as a single
    distribution (no axis handling).

    Parameters
    ----------
    logits : array-like
        Raw scores.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as `logits`, summing to 1.
    """
    logits = np.asarray(logits)
    if logits.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(logits)
    # Subtract the max for numerical stability: the largest exponent becomes
    # exp(0) = 1, so large logits cannot overflow to inf and produce nan.
    # The shift cancels in the ratio, so the result is mathematically unchanged.
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / np.sum(exp_logits)

In [20]:
# Cache of (tokenizer, model) pairs keyed by checkpoint name, so repeated
# calls do not reload the weights from disk every time.
_PERSONALITY_MODELS = {}


def _load_personality_model(name="Minej/bert-base-personality"):
    """Return the cached (tokenizer, model) pair for `name`, loading it on first use."""
    if name not in _PERSONALITY_MODELS:
        tokenizer = transformers.BertTokenizer.from_pretrained(name)
        model = transformers.BertForSequenceClassification.from_pretrained(name)
        model.eval()  # inference only
        _PERSONALITY_MODELS[name] = (tokenizer, model)
    return _PERSONALITY_MODELS[name]


def personality_detection(text):
    """Score `text` on five personality traits with Minej/bert-base-personality.

    Parameters
    ----------
    text : str
        The text to score. The result is built by indexing a squeezed logits
        vector, so this expects a single string rather than a batch.

    Returns
    -------
    dict
        Maps each of 'Extroversion', 'Neuroticism', 'Agreeableness',
        'Conscientiousness', 'Openness' to a softmax-normalised score.
        The five scores sum to 1.

    NOTE(review): the softmax makes the traits compete for probability mass.
    Whether that is the right way to turn this model's logits into scores is
    unconfirmed (the original comment marked it "(??)") -- check the model card.
    """
    tokenizer, model = _load_personality_model()

    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    # No gradients are needed for inference; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.squeeze().numpy()

    # convert logits to probabilities (see NOTE in the docstring)
    predictions = softmax(predictions)

    label_names = ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
    return {name: predictions[i] for i, name in enumerate(label_names)}

In [25]:
# Score one sample sentence and list each trait's score, one per line.
text_input = "The world makes me nervous and afraid"
personality_prediction = personality_detection(text_input)

print(*(f'{k}: {v:.2f}' for k, v in personality_prediction.items()), sep='\n')

Extroversion: 0.17
Neuroticism: 0.33
Agreeableness: 0.18
Conscientiousness: 0.10
Openness: 0.22


In [4]:
# Second spot check, on an upbeat sentence.
# NOTE(review): the saved output below contains negative values, so it was
# produced before the softmax step existed in personality_detection
# (execution count In [4]); re-run this cell to refresh it.
personality_detection('This is another really amazing test of this classifier!!')

{'Extroversion': -0.060873274,
 'Neuroticism': 0.13655815,
 'Agreeableness': -0.27149466,
 'Conscientiousness': -0.8811069,
 'Openness': -0.20495062}

# Sentiment embedding

In [11]:
import time
import bz2
import json
import pickle
import gc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
import transformers

import utils

In [3]:
# Enable IPython's autoreload extension in mode 2 (presumably so edits to the
# local `utils` module are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [4]:
# Directory handed to utils.load_sentences_bz2 when loading comments.
DATA_PATH = "/sciclone/data10/twford/reddit/reddit/comments/"
# Project base directory (not referenced in the cells visible here).
BASE_PATH = "/sciclone/geograd/stmorse/reddit/"

In [6]:
# Load sentences through the project helper.
# NOTE(review): 2007 and '01' look like year and month selectors -- confirm
# against utils.load_sentences_bz2.
sentences = utils.load_sentences_bz2(DATA_PATH, 2007, '01')
len(sentences)

58954

In [21]:
# Emotion classifier that reports a score for every label, not just the top one.
# NOTE(review): newer transformers releases deprecate `return_all_scores` in
# favour of `top_k=None`. The two differ in output nesting/order, so check
# downstream indexing before switching.
classifier = transformers.pipeline(
    'text-classification', 
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    return_all_scores=True)



In [22]:
# Classify the first 10 sentences one at a time, keeping the pipeline's own
# output structure for each sentence.
# truncation=True is forwarded to the tokenizer so that a comment longer than
# the model's maximum input length is cut off instead of raising an error.
predictions = [classifier(s, truncation=True) for s in sentences[:10]]

In [25]:
# Show one example sentence followed by the scores the classifier gave it.
print(sentences[2], predictions[2], sep='\n')

Yeah, this was pretty weird. You can't bring a cameraphone into a courthouse here in the US; i find it hard to believe the video was done without the knowledge and consent of the people in charge.
[[{'label': 'sadness', 'score': 0.002464491641148925}, {'label': 'joy', 'score': 0.006229648366570473}, {'label': 'love', 'score': 0.0007540768710896373}, {'label': 'anger', 'score': 0.0025485751684755087}, {'label': 'fear', 'score': 0.6492040157318115}, {'label': 'surprise', 'score': 0.3387991487979889}]]


In [5]:
# Load the emotion checkpoint as a sentence encoder. It is not a native
# sentence-transformers model, so the library builds one with mean pooling
# (see the message printed below).
model = SentenceTransformer('bhadresh-savani/distilbert-base-uncased-emotion')

No sentence-transformers model found with name bhadresh-savani/distilbert-base-uncased-emotion. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
# Encode the first 100 sentences into embeddings, with a progress bar.
embeddings = model.encode(sentences[:100], show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Sanity check: (100, 768) in the saved output, i.e. one 768-dimensional
# vector per encoded sentence.
embeddings.shape

(100, 768)