In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import project_path

In [None]:
import random

import numpy as np
import optuna
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from src.classifier import PostClassifier
from src.data_utils import load_data
from src.paths import datap, modelp, outputp
from src.training import Trainer
from src.embed_posts import get_paragraph_split

# Reproducibility: feed one shared seed to every RNG used in this notebook
# (Python's `random`, PyTorch and NumPy). A loop is used so that no return
# value becomes the cell's displayed output.
seed = 42
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(seed)

In [None]:
# Load the persisted train/test split: each CSV is read as an int32 array
# of row indices.
train_indices, test_indices = (
    np.loadtxt(datap(split_file), dtype=np.int32)
    for split_file in ("train_indices.csv", "test_indices.csv")
)

In [None]:
# Model to evaluate: the name of an Optuna study (used as `study_name` below),
# or "baseline" to run the keyword heuristic instead of the trained classifier.
model_name = "eaembd_post_classifier:2023-04-04" # "baseline"
# File name of the labeled-posts CSV; resolved through datap() where it is read.
filename = "labeled_posts.csv"

In [None]:
if model_name == "baseline":
    # Keyword baseline: classify each post from the text of its first paragraph.

    # Defined here (not only in the later analysis cell) so this cell works on a
    # fresh kernel; the mapping is identical to the one used further down.
    label_map = {"organization": 0, "project": 1, "other": 2}

    data = pd.read_csv(datap(filename))
    tqdm.pandas()
    paragraph_split = get_paragraph_split(data)

    # Keep only the first paragraph of every post. reset_index() returns a new
    # frame, so adding the "prediction" column below does not write into a slice
    # of `paragraph_split` (no SettingWithCopyWarning).
    first_paragraphs = (
        paragraph_split
        .drop_duplicates(subset="postId", keep="first")
        .reset_index(drop=True)
    )

    def classifier(text):
        """Return the label id for `text` using case-sensitive keyword matching.

        "organisation"/"organization" -> organization; otherwise "project" ->
        project; otherwise other.
        """
        if "organisation" in text or "organization" in text:
            return label_map["organization"]
        elif "project" in text:
            return label_map["project"]
        else:
            return label_map["other"]

    first_paragraphs["prediction"] = first_paragraphs.text.apply(classifier)

    # Select the held-out rows once and expose the same three names the
    # trained-model branch provides, so the downstream cells work for both.
    test_rows = first_paragraphs.loc[test_indices]
    test_labels = test_rows["label"]
    test_preds = test_rows["prediction"]
    # NOTE(review): assumes "postId" holds the same ids as the `_id` column of
    # the labeled-posts CSV used in the error analysis below -- confirm.
    test_ids = test_rows["postId"]

In [None]:
if model_name != "baseline":
    # Trained-model path: read the best hyperparameters from the Optuna study
    # named `model_name`, train with them, then predict on the test split.
    db_path = outputp("db.sqlite3")
    loaded_study = optuna.load_study(study_name=model_name,
                                     storage=f"sqlite:///{db_path}")

    dataset_size = len(pd.read_csv(datap(filename)))
    trainer = Trainer(epochs=9)

    best_params = loaded_study.best_trial.params
    lr, batch_size = best_params["lr"], best_params["batch_size"]
    dataset = load_data()

    train_loader = DataLoader(Subset(dataset, train_indices), batch_size=batch_size)
    test_loader = DataLoader(Subset(dataset, test_indices), batch_size=batch_size)
    train_metrics, test_metrics = trainer.train(lr, train_loader, test_loader, logging=True)

    test_embeddings, test_labels, test_ids = dataset[test_indices]

    # Inference only: switch layers such as dropout/batch-norm to eval
    # behaviour and skip autograd bookkeeping.
    # NOTE(review): assumes trainer.model is a torch.nn.Module -- confirm.
    trainer.model.eval()
    with torch.no_grad():
        test_logits = trainer.model(test_embeddings)
    # Softmax is monotonic, so the argmax of the raw outputs is the same class.
    test_preds = torch.argmax(test_logits, dim=1)

In [None]:
# Pin the label order to organization (0), project (1), other (2) so the
# matrix is always 3x3 and lines up with the plot's tick labels, even when a
# class is missing from the test labels or the predictions.
conf_mat = confusion_matrix(test_labels, test_preds, labels=[0, 1, 2])

In [None]:
# Sanity check: total number of entries counted in the confusion matrix.
np.sum(conf_mat)

In [None]:
# Heat-map of the confusion matrix with the raw count written in every cell.
classes = ["organization", "project", "other"]
tick_positions = np.arange(len(classes))

fig, ax = plt.subplots()
im = ax.imshow(conf_mat)

ax.set_xticks(tick_positions, labels=classes)
ax.set_yticks(tick_positions, labels=classes)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Slant the x tick labels, anchored at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Annotate each (actual=i, predicted=j) cell; imshow puts columns on the
# x axis, hence the (j, i) coordinate order.
for i, j in np.ndindex(len(classes), len(classes)):
    ax.text(j, i, conf_mat[i, j], ha="center", va="center", color="w")

In [None]:
# Per-class one-vs-rest counts from the confusion matrix
# (rows = actual class, columns = predicted class).
TP = np.diag(conf_mat)
FP = conf_mat.sum(axis=0) - TP   # predicted as the class, actually another
FN = conf_mat.sum(axis=1) - TP   # actually the class, predicted as another
TN = conf_mat.sum() - (FP + FN + TP)

# True positive rate (recall / sensitivity) and false positive rate (fall-out).
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
TPR, FPR

In [None]:
# Unweighted (macro) average of the per-class rates.
TPR.mean(), FPR.mean()

In [None]:
### Find organizations that were classified as others

In [None]:
# Class name -> integer label id, and the inverse lookup (id -> class name).
label_map = {"organization": 0, "project": 1, "other": 2}
class_map = dict(zip(label_map.values(), label_map.keys()))

In [None]:
# One row per test post: true label id, predicted label id and post id,
# plus the human-readable class names for both label columns.
results = pd.DataFrame(
    {"labels": test_labels, "preds": test_preds, "ids": test_ids}
).assign(
    pred_class=lambda frame: frame["preds"].map(class_map),
    true_class=lambda frame: frame["labels"].map(class_map),
)

In [None]:
# Load the raw labeled posts for manual inspection, using the configured
# `filename` like the other cells that read this CSV.
# Note: this rebinds `dataset`, which the training cell uses for the object
# returned by load_data(); re-run that cell before reusing it there.
dataset = pd.read_csv(datap(filename))

In [None]:
# Peek at the first five labeled posts.
dataset.head(n=5)

In [None]:
# Print the body of the first labeled post whose id starts with "QpG".
qpg_mask = dataset._id.str.startswith("QpG")
print(dataset.loc[qpg_mask, "body"].iloc[0])

### Organizations that were classified as projects
All of them had the word "organization" in the first paragraph — come on!

### Organizations that were classified as other
One was relabeled to "other"; the rest really are organizations.

### Projects that were classified as other - what is actually a project?
Some were relabeled to "other"; the rest are actual projects.

In [None]:
# Test posts labeled "project" that the model predicted as "other".
predicted_other = results.pred_class.eq("other")
actually_project = results.true_class.eq("project")
proj2other = results.loc[predicted_other & actually_project]
proj2other

In [None]:
# Print every labeled post whose id is among the project->other errors.
# The posts are filtered once up front (instead of twice per iteration), and
# the loop walks the filtered rows themselves, so it cannot run past the end
# if the number of matching posts differs from len(proj2other).
proj2other_posts = dataset[dataset._id.isin(proj2other.ids)]
for idx, (post_id, body) in enumerate(
    zip(proj2other_posts["_id"], proj2other_posts["body"])
):
    print(idx)
    print(post_id)
    print(body)
    print("\n\n\n")
    print("=================================================================")

### Projects that were classified as organizations — and maybe they truly are?

In [None]:
# Test posts labeled "project" that the model predicted as "organization".
predicted_org = results.pred_class.eq("organization")
labeled_project = results.true_class.eq("project")
proj2org = results.loc[predicted_org & labeled_project]

In [None]:
# Show the id and body of the first project->organization post.
idx = 0
proj2org_post = dataset[dataset._id.isin(proj2org.ids)].iloc[idx]
print(proj2org_post["_id"])
print(proj2org_post["body"])
# print("\nJusty comments:\nThis seems like an organization")

In [None]:
# Show the id and body of the second project->organization post,
# followed by the reviewer's note.
idx = 1
proj2org_post = dataset[dataset._id.isin(proj2org.ids)].iloc[idx]
print(proj2org_post["_id"])
print(proj2org_post["body"])
print("\nJusty comments:\nThis seems like an organization")

In [None]:
# Show the id and body of the third project->organization post,
# followed by the reviewer's note.
idx = 2
proj2org_post = dataset[dataset._id.isin(proj2org.ids)].iloc[idx]
print(proj2org_post["_id"])
print(proj2org_post["body"])
print("\nJusty comments:\nThis seems like an organization too, except they call it a project")