In [None]:
from pii_detection.probe_labels_utils import read_tokens_df

# Load the train and validation shards with the project helper. Later cells
# read the "tokens", "labels" and "document" columns of both frames.
df_train = read_tokens_df("../data/train_shard.json")
df_val = read_tokens_df("../data/val_shard.json")



def containes_dot(word: str):
    """Tell whether ``word`` has a "." and neither starts nor ends with one.

    Args:
        word: The token text to inspect.

    Returns:
        True when "." occurs in ``word`` and the first and last characters are
        not "."; False otherwise (including for the empty string).
    """
    has_dot = "." in word
    dot_on_edge = word.startswith(".") or word.endswith(".")
    return has_dot and not dot_on_edge

def digit_streak_in_str(word: str) -> int:
    """Return the length of the longest run of consecutive digit characters.

    The maximum is updated while a run is still growing, so a run that reaches
    the end of the string is counted too (e.g. "abc123" -> 3, "123" -> 3).

    Args:
        word: The token text to scan.

    Returns:
        The longest streak of characters for which ``str.isdigit`` is true,
        or 0 when ``word`` is empty or contains no digits.
    """
    max_streak = 0
    streak = 0
    for c in word:
        if c.isdigit():
            streak += 1
            # Record the run as it grows; waiting for the next non-digit would
            # miss a run that ends the string.
            max_streak = max(max_streak, streak)
        else:
            streak = 0
    return max_streak

# Derive the per-token feature columns on both splits. Columns are added in a
# fixed order because later cells use every non-label column as a model input.
for df in [df_train, df_val]:
    token_text = df["tokens"]
    str_ops = token_text.str

    # Substring / character-presence flags.
    df["contains_dot"] = token_text.apply(containes_dot)
    for column, needle in (("contains_(", "("), ("contains_-", "-")):
        df[column] = str_ops.contains(needle, regex=False)
    df["contains_digit"] = token_text.apply(lambda x: any(c.isdigit() for c in x))
    for column, needle in (
        ("contains_@", "@"),
        ("contains_http", "http"),
        ("contains_www", "www"),
        ("contains_.com", ".com"),
    ):
        df[column] = str_ops.contains(needle, regex=False)

    # Whole-token character-class predicates from the pandas string accessor.
    for column, predicate in (
        ("is_upper", str_ops.isupper),
        ("is_title", str_ops.istitle),
        ("is_digit", str_ops.isdigit),
        ("is_alpha", str_ops.isalpha),
        ("is_space", str_ops.isspace),
        ("is_lower", str_ops.islower),
        ("is_numeric", str_ops.isnumeric),
        ("is_alnum", str_ops.isalnum),
        ("is_decimal", str_ops.isdecimal),
    ):
        df[column] = predicate()

    df["tokens_len"] = str_ops.len()

    # Newline-based position features: a token with any "\n" marks a new line,
    # one with more than one "\n" marks a new paragraph; the running sums are
    # taken separately within each document.
    df["new_line"] = str_ops.contains("\n")
    df["new_paragraph"] = str_ops.count("\n").gt(1)
    df["line_num"] = df.groupby("document")["new_line"].cumsum()
    df["paragraph_num"] = df.groupby("document")["new_paragraph"].cumsum()

    # Digit statistics: total digit characters and longest digit run.
    df["n_digits"] = token_text.apply(lambda x: sum(c.isdigit() for c in x))
    df["digit_streak"] = token_text.apply(digit_streak_in_str)
# df[~df["tokens"].apply(f)]["labels"].value_counts()
# df["tokens"].apply(f).value_counts(normalize=True)

# Replace every label that contains "NAME" with the "O" class in both splits.
for split in (df_train, df_val):
    split["labels"] = split["labels"].apply(lambda tag: "O" if "NAME" in tag else tag)

# Cell output: whether both splits ended up with the same number of columns.
df_train.shape[1] == df_val.shape[1]

In [None]:
import pandas as pd


def mask(df: pd.DataFrame) -> pd.Series:
    """Select rows where at least one of the listed flag columns is set.

    Args:
        df: Frame holding the flag columns named in ``flag_columns`` below.

    Returns:
        The element-wise OR of those columns, combined left to right.
    """
    flag_columns = [
        "is_title",
        "contains_http",
        "contains_www",
        "contains_.com",
        "contains_@",
        "contains_(",
        "contains_-",
        "contains_digit",
        "contains_dot",
    ]
    selected = df[flag_columns[0]]
    for column in flag_columns[1:]:
        selected = selected | df[column]
    return selected

In [None]:
# train random forest classifier:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np

# Fixed seed so the fitted forest, and therefore the reports printed below,
# are the same on every Restart & Run All.
RANDOM_STATE = 42

# Columns that are not model inputs: the target, the raw text and the doc id.
NON_FEATURE_COLUMNS = ["labels", "tokens", "document"]

# Keep only the rows selected by mask() in each split.
train_mask = mask(df_train)
val_mask = mask(df_val)

# Filter each split once and derive both X and y from the same filtered frame.
train_rows = df_train[train_mask]
val_rows = df_val[val_mask]

X_train = train_rows.drop(NON_FEATURE_COLUMNS, axis=1)
y_train = train_rows["labels"]
X_val = val_rows.drop(NON_FEATURE_COLUMNS, axis=1)
y_val = val_rows["labels"]

clf = RandomForestClassifier(verbose=3, n_jobs=8, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)

# Validation metrics, followed by the label sets actually predicted / present.
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(np.unique(y_pred))
print(np.unique(y_val))


In [None]:
# Score the classifier on the rows it was fitted on.
# Note: this rebinds y_pred, which the previous cell used for the validation
# predictions.
y_pred = clf.predict(X_train)
print(
    classification_report(y_train, y_pred),
    confusion_matrix(y_train, y_pred),
    np.unique(y_pred),
    np.unique(y_train),
    sep="\n",
)

In [None]:
import json

def _load_sorted_essays(path):
    """Load a JSON list of essays and return it sorted by the "document" field."""
    with open(path, "r") as f:
        return sorted(json.load(f), key=lambda essay: essay["document"])


def _student_names_per_essay(essays):
    """Collect the NAME_STUDENT spans of each essay as space-joined strings.

    A label starting with "B" opens a new name and a label starting with "I"
    extends the most recent one. An "I" label with no open name starts a new
    name instead of raising IndexError.

    Args:
        essays: Iterable of dicts with parallel "tokens" and "labels" lists.

    Returns:
        One list of names per essay, in input order; essays without any
        NAME_STUDENT label are skipped.
    """
    names_per_essay = []
    for essay in essays:
        names = []
        for token, label in zip(essay["tokens"], essay["labels"]):
            if "NAME_STUDENT" not in label:
                continue
            if label.startswith("B") or (label.startswith("I") and not names):
                names.append(token)
            elif label.startswith("I"):
                names[-1] += f" {token}"
        if names:
            names_per_essay.append(names)
    return names_per_essay


# Names found in the original shard.
train_data = _load_sorted_essays("../data/train_shard.json")
all_names = _student_names_per_essay(train_data)

# Names found in the renamed shard (train_data now refers to this shard).
train_data = _load_sorted_essays("../data/train_shard_renamed.json")
all_names2 = _student_names_per_essay(train_data)

# Side-by-side view, one row per essay that has names. Both lists must have
# the same length for the frame to build.
pd.DataFrame({"names": all_names, "names2": all_names2})

In [None]:
# Read the renamed shard, order its essays by "document", and show the first.
with open("../data/train_shard_renamed.json", "r") as f:
    train_data = json.load(f)
train_data = sorted(train_data, key=lambda essay: essay["document"])

train_data[0]

In [None]:
from transformers import BertTokenizer
# import bert-base-uncased

# Tokenizers already loaded, keyed by checkpoint name, so repeated calls to
# tokenize_text() do not reload the vocabulary each time.
_BERT_TOKENIZERS = {}


def _get_bert_tokenizer(name='bert-base-uncased'):
    """Return the BertTokenizer for ``name``, loading it on first use only."""
    if name not in _BERT_TOKENIZERS:
        _BERT_TOKENIZERS[name] = BertTokenizer.from_pretrained(name)
    return _BERT_TOKENIZERS[name]


def tokenize_text(text, max_length=512):
    """Split ``text`` into bert-base-uncased word pieces.

    Args:
        text: The raw string to tokenize.
        max_length: Maximum number of word pieces to keep; longer inputs are
            truncated to this length.

    Returns:
        The list of token strings, without special tokens.
    """
    tokenizer = _get_bert_tokenizer()
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=False,
        max_length=max_length,
        # Ask for truncation explicitly; passing max_length alone relies on a
        # deprecated implicit fallback that emits a warning.
        truncation=True,
    )
    return tokenizer.convert_ids_to_tokens(token_ids)

In [None]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

def chat_with_bert(context, question, model, tokenizer):
    """Extract the answer to ``question`` from ``context`` with a QA model.

    Args:
        context: Passage the answer span is taken from.
        question: Question to answer.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with ``start_logits`` and ``end_logits``.
        tokenizer: Callable that encodes a text pair and provides ``decode``.

    Returns:
        The decoded text between the highest-scoring start and end positions
        (inclusive). Empty when the best end position precedes the best start.
    """
    # Encode as (question, context): this is the pair order used in the
    # Hugging Face extractive-QA examples for SQuAD-style checkpoints.
    inputs = tokenizer(question, context, return_tensors="pt")

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Positions with the highest start and end scores.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))

    # Slice the token ids of the predicted span and turn them back into text.
    answer_span = inputs["input_ids"][0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_span, skip_special_tokens=True)

    return answer

# Load the question-answering checkpoint named below and its tokenizer.
model_name = 'distilbert-base-cased-distilled-squad'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Fixed passage that every question is answered against.
context = "BERT is a powerful pre-trained language model. It can be fine-tuned for various natural language processing tasks."

# Interactive loop: keep answering until the user types an exit word.
# Note: this blocks on input(), so the cell waits for the user when run.
while True:
    question = input("You: ")
    if question.lower() not in ["exit", "quit", "bye"]:
        answer = chat_with_bert(context, question, model, tokenizer)
        print("BERT:", answer)
        continue
    print("Chat ended.")
    break


In [None]:
from tqdm import tqdm

