In [None]:
# # pip install
# %pip install openai dill seaborn

In [1]:
# Corpus
# CORPUS = "synthetic_data_corpus"
# CORPUS_NAME = "converted_climate_change_gpt-4-turbo-preview_gpt-4-turbo-preview_10_10"
# CORPUS_NAME = "voice"
CORPUS_NAME = "climate_change"

# The name of the experiment (i.e. where to save the results)
EXPERIMENT_NAME = "climate_synthetic_test_k2_v2"
# EXPERIMENT_NAME = "test_voice_embedding"

MODEL = "text-embedding-3-large"
# MODEL = "fasttext"

CORPUS_SIZE = "/20_10"

In [2]:
if CORPUS_NAME == "voice":
    CORPUS = "voice"
    # CORPUS = "/workspaces/dev/projects/narratives/classifiers/real_world_corpora/the_voice_the_voice_broad_keyscheck_4sep2023_filtered_chars150to1200_gpt-3.5-turbo-instruct_2_2 (1).json"
elif CORPUS_NAME == "climate":
    CORPUS = "climate"
    # CORPUS = "/workspaces/dev/projects/narratives/classifiers/real_world_corpora/with_jensen_garrett_abbott_climate_climate_change_pms_curie_2_-1.json"
else:
    CORPUS = (
        "../gpt-4-only-corpora/"
        + CORPUS_NAME
        + CORPUS_SIZE
        + ".json"
    )

In [3]:
# %pip install fasttext nltk

# fasttext.util.download_model("en", if_exists="ignore")

In [4]:
# from sklearn.base import BaseEstimator, TransformerMixin
# import numpy as np
# from nltk.tokenize import word_tokenize

# class FastTextVectorizer(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         # self.model = model
#         self.model = fasttext.load_model("cc.en.300.bin")

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         # return np.array([self.model.get_sentence_vector(sentence) for sentence in X])
#         return np.array([self.transform_one(sentence) for sentence in X])

#     def transform_one(self, X):
#         return self.model.get_sentence_vector(X)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from soda.cache import ValueCache
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import fasttext

# import fasttext.util

# import nltk

# nltk.download("punkt")
# from nltk.tokenize import word_tokenize
import numpy as np

cache = ValueCache("openai_emb")
usages = {}


class OpenAIVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.client = OpenAI()
        if model == "fasttext":
            self.ft = fasttext.load_model("cc.en.300.bin")
        else:
            self.ft = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.model == "fasttext":
            return self.transform_fasttext(X)

        return self.transform_openai(X)

    def transform_openai(self, X):
        def get_result_and_register_usage(datum):
            result, usage = self.transform_one_openai(datum, self.model)
            if datum not in usages:
                usages[datum] = usage
            return result

        # Using 20 threads
        with ThreadPoolExecutor(max_workers=20) as executor:
            results = list(
                tqdm(
                    # executor.map(lambda x: self.transform_one_openai(x, self.model), X),
                    executor.map(get_result_and_register_usage, X),
                    total=len(X),
                )
            )
        return results

    @cache.register()
    def transform_one_openai(self, X, model):
        # Embed with OpenAI
        result = self.client.embeddings.create(
            input=X,
            model=model,
        )

        # Add usage to the cache
        return result.data[0].embedding, result.usage

    def transform_fasttext(self, X):
        # return np.array([self.model.get_sentence_vector(sentence) for sentence in X])
        return np.array([self.transform_one_fasttext(sentence) for sentence in tqdm(X)])

    def transform_one_fasttext(self, X):
        return self.ft.get_sentence_vector(X)


# # Test
openai = OpenAIVectorizer(MODEL)
result = openai.transform(["I am a cat"])
print(result)
print(usages)

100%|██████████| 1/1 [00:00<00:00,  1.73it/s]

[[-0.014489330351352692, -0.01162608340382576, -0.013313740491867065, -0.0003529475361574441, 0.044946495443582535, -0.005549795925617218, -0.023973386734724045, -0.004853817634284496, -0.00428765919059515, 0.04840835556387901, -0.03355120122432709, -0.02167990431189537, -0.011568385176360607, -0.032137610018253326, 0.03522443398833275, 0.03632069006562233, -0.021247170865535736, 0.005600281059741974, -0.03995564579963684, 0.010241338983178139, 0.0052721258252859116, -0.017266031354665756, -0.003912623971700668, 0.007580033037811518, 0.0025459101889282465, 0.014027749188244343, 0.007969492115080357, -0.013407498598098755, -0.02568989247083664, -0.03144523501396179, 0.011763115413486958, 0.016227472573518753, 0.04431182146072388, 0.01782858371734619, 0.014748970046639442, -0.01725160703063011, 0.012585306540131569, -0.004572541452944279, 0.011590021662414074, 0.013854656368494034, 0.0033644961658865213, 0.03516673669219017, -0.04387908801436424, -0.010666859336197376, 0.0462446920573711




In [None]:
# Multiprocessing with joblib
import os
import yaml
from tqdm.auto import tqdm

# # import random


# # # Load texts from corpora/{CORPUS}.json
# # with open(os.path.join("../corpora", f"{CORPUS}.json"), "r") as f:
# #     corpus_data = yaml.safe_load(f)


# # # Load texts from corpora/{CORPUS}.json
# # with open(os.path.join("../corpora", f"{CORPUS}.json"), "r") as f:
# #     corpus_data = yaml.safe_load(f)

# with open(CORPUS, "r") as f:
#     corpus_data = yaml.safe_load(f)

# seeds = corpus_data["seeds"]
# distilled = corpus_data["distilled"]
# summarized = corpus_data["summarized"]
# names = corpus_data["names"]
# dataset = corpus_data["dataset"]

# # Convert the dataset to corpus data format
# corpus_data = []
# for seed_set in dataset:
#     for a in seed_set["a_first"]["a"]:
#         corpus_data.append({"text": a, "speakername": "a"})
#     for b in seed_set["a_first"]["b"]:
#         corpus_data.append({"text": b, "speakername": "b"})
#     for a in seed_set["b_first"]["a"]:
#         corpus_data.append({"text": a, "speakername": "a"})
#     for b in seed_set["b_first"]["b"]:
#         corpus_data.append({"text": b, "speakername": "b"})

# # # Shuffle
# # random.shuffle(corpus_data)

# print("Loaded {} texts from corpus".format(len(corpus_data)))
# print("Total word count:", sum([len(text["text"].split()) for text in corpus_data]))

# # Create the training data
# X = [text["text"] for text in corpus_data]
# y = [text["speakername"] for text in corpus_data]
# print(len(X), len(y))

from import_data import import_data

X, y, seeds, distilled, summarized, names = import_data(CORPUS)

In [None]:
# Transform X with the vectorizer
X = OpenAIVectorizer(MODEL).transform(X)


In [None]:
# print("Loaded {} texts from corpus".format(len(corpus_data)))
# print("Total word count:", sum([len(text["text"].split()) for text in corpus_data]))

# Create the training data and use an n-gram sklearn model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# from sklearn.model_selection import train_test_split
# Stratified train-test split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

# Create the training data
# X = [text["text"] for text in corpus_data]
# y = [text["speakername"] for text in corpus_data]
# print(len(X), len(y))

print("Loaded {} texts from corpus".format(len(X)))
# print("Total word count:", sum([len(text["text"].split()) for text in X]))

# Split into train and test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

print(len(X_train), len(X_test), len(y_train), len(y_test))


# Create the pipeline
text_clf = Pipeline(
    [
        # TFIDF approach
        # ("vect", CountVectorizer(ngram_range=(1, 2))),
        # ("tfidf", TfidfTransformer()),
        # Embedding approach
        # ("ft", FastTextVectorizer()),
        # OpenAI approach
        # ("openai", OpenAIVectorizer(MODEL)),
        # SGD classifier
        (
            "clf",
            SGDClassifier(
                # loss="hinge", # Probability weights not available
                loss="log_loss",
                penalty="l2",
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
        # Naive Bayes
        # ('clf', MultinomialNB()),
    ]
)

# Fit the model
text_clf.fit(X_train, y_train)

# Predict
predicted = text_clf.predict(X_test)
probs = text_clf.predict_proba(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, predicted))
print("F1:", f1_score(y_test, predicted, average="weighted"))

# Cross validation
scores = cross_val_score(text_clf, X, y, cv=5)
print("Cross validation scores:", scores)
print("Cross validation mean:", scores.mean())
print("Cross validation std:", scores.std())

# Save the data in CSV format
import pandas as pd

# Make the folder
os.makedirs(f"corpus_results/embeddings/{EXPERIMENT_NAME}", exist_ok=True)

# Save the predictions for each datum in the test set to data.csv
df = pd.DataFrame(
    {
        "text": X_test,
        "truth": y_test,
        "classification": predicted,
        "a": [p[0] for p in probs],
        "b": [p[1] for p in probs],
    }
)
df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/data.csv")

In [None]:
import random

# %pip install seaborn
# Sample N=(10, 100, 500) texts for training set, 100 times each
N = [10, 20, 50, 100, 200, 500]
# N = [x for x in N if x < len(X_train) // 2]
n_samples = 100
results = {}
for n in tqdm(N):
    if n > len(X_train):
        continue

    results[n] = []
    for iteration in tqdm(range(n_samples)):
        # Sample n texts from X_train
        # X_train_sample = random.sample(X_train, n)
        # y_train_sample = [y_train[X_train.index(text)] for text in X_train_sample]

        # Sample stratified, N/2 for each class
        X_train_sample = []
        y_train_sample = []
        failed = False
        for speaker in set(y_train):
            try:
                X_train_sample.extend(
                    random.sample(
                        [
                            text
                            for text, speakername in zip(X_train, y_train)
                            if speakername == speaker
                        ],
                        n // 2,
                    )
                )
            except ValueError:
                failed = True
            y_train_sample.extend([speaker for _ in range(n // 2)])

        if failed:
            print("Failed to sample for n =", n)
            continue

        # Shuffle together
        X_train_sample, y_train_sample = zip(
            *random.sample(
                list(zip(X_train_sample, y_train_sample)), len(X_train_sample)
            )
        )

        # print(X_train_sample)
        # print(y_train_sample)

        # print(len(X_train_sample), len(y_train_sample))

        # Clear the model
        text_clf = Pipeline(
            [
                # ("vect", CountVectorizer(ngram_range=(1, 2))),
                # ("tfidf", TfidfTransformer()),
                # OpenAI
                # ("openai", OpenAIVectorizer(MODEL)),
                # SGD classifier
                # ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                # Probability estimates not available for loss='hinge'
                (
                    "clf",
                    SGDClassifier(
                    loss="log_loss",
                        penalty="l2",
                        alpha=1e-3,
                        random_state=42,
                        max_iter=5,
                        tol=None,
                    ),
                ),
                # Naive Bayes
                # ('clf', MultinomialNB()),
            ]
        )
        # Fit the model
        text_clf.fit(X_train_sample, y_train_sample)
        # Predict
        predicted = text_clf.predict(X_test)
        probs = text_clf.predict_proba(X_test)
        # Evaluate
        results[n].append(
            {
                "n": n,
                "i": iteration,
                "accuracy": accuracy_score(y_test, predicted),
                "predictions": predicted,
                "probabilities": probs,
            }
        )

In [None]:
# Save each prediction to a CSV, flattened and including the value of n and the iteration as columns
df = pd.concat(
    [
        pd.DataFrame(
            {
                "n": n,
                "i": iteration,
                # "text": X_test,
                # "accuracy": result["accuracy"],
                "truth": y_test,
                "classification": result["predictions"],
                "a": [p[0] for p in result["probabilities"]],
                "b": [p[1] for p in result["probabilities"]],
            }
        )
        for n, results_n in results.items()
        for iteration, result in enumerate(results_n)
    ]
)

# Save to CSV without index
df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/data_sampled.csv", index=False)

# # Save usage to a CSV with two columns (prompt_tokens,completion_tokens)
# usage_df = pd.DataFrame(usages, columns=["prompt_tokens", "completion_tokens"])
# usage_df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/usage.csv", index=False)

# # Get prompt tokens
# # print(usages[0].prompt_tokens)
# prompt_tokens = [usage.prompt_tokens for usage in usages]
# completion_tokens = [usage.total_tokens - usage.prompt_tokens for usage in usages]

# # Save to CSV without index
# usage_df = pd.DataFrame(
#     {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
# )
# usage_df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/usage.csv", index=False)

# With dict version of usage
# Save usage to a CSV with two columns (prompt_tokens,completion_tokens)
prompt_tokens = []
completion_tokens = []
for usage in usages.values():
    prompt_tokens.append(usage.prompt_tokens)
    completion_tokens.append(usage.total_tokens - usage.prompt_tokens)

usage_df = pd.DataFrame(
    {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
)
usage_df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/usage.csv", index=False)


In [None]:
# %pip install seaborn
# Show results
# print(results)

# # Save the data in JSON format
# import json
# with open(f"corpus_results/{EXPERIMENT_NAME}.json", "w") as f:
#     json.dump(results, f)

# Drop any empty results
results = {k: v for k, v in results.items() if v}

# Save the results
df = pd.DataFrame(results)
df.to_csv(f"corpus_results/embeddings/{EXPERIMENT_NAME}/scores.csv")

# Show average and std for each N
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")
sns.set_palette("colorblind")
sns.set_context("paper", font_scale=1.5)
plt.figure(figsize=(10, 6))

for n in N:
    if n in results:
        plt.errorbar(
            n,
            np.mean([result["accuracy"] for result in results[n]]),
            yerr=np.std([result["accuracy"] for result in results[n]]),
            fmt="o",
            capsize=5,
            label=f"N={n}",
        )

# Give solid white background
plt.gca().set_facecolor("white")
plt.xlabel("N")
plt.ylabel("Accuracy")
plt.legend()
plt.savefig(
    f"corpus_results/embeddings/{EXPERIMENT_NAME}/accuracy.jpg",
    dpi=300,
    bbox_inches="tight",
)
plt.show()

In [None]:
# print(usages[0].prompt_tokens)