# Requirements and Data Loading/Transformation and Helpers

In [1]:
import ast
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from umap import UMAP


#  expand "labels" column to individual columns
def expand_labels(df):
    df['labels'] = df['labels'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else {}
    ) # Convert string-formatted dictionary to dictionary
    expanded_data = pd.json_normalize(df['labels']) # Normalize dictionary entries to a separate DataFrame
    expanded_data.index = df.index
    result_df = pd.concat([df, expanded_data], axis=1)
    result_df = result_df.drop(columns=['labels'])
    return result_df


def insert_line_breaks(text, n):
    words = text.split()
    lines = [' '.join(words[i:i+n]) for i in range(0, len(words), n)]
    return '<br>'.join(lines)


def normalise_x(df):
    # Use a regex to replace any sequence of 'x's with 'xxxxx'
    df["response"] = df["response"].str.replace(r'\b(x+)\b', 'xxxxx', case=False, regex=True)
    return df


def update_scatter_format(fig):
    # Update layout
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=-0.2,
                    xanchor="center", x=0.5, title=None,
                    itemclick="toggleothers", itemdoubleclick="toggle",
                    font=dict(size=18)),
        margin=dict(t=100, b=100, l=100, r=100),
        height=1000,
        width=1000,
        xaxis=dict(title_font=dict(size=18), tickfont=dict(size=14)),
        yaxis=dict(title_font=dict(size=18), tickfont=dict(size=14))
    )

    # Update traces
    fig.update_traces(marker=dict(size=12, line=dict(width=2), opacity=0.7))

    # Customize legend
    trace_names = []
    for trace in fig.data:
        trace_name = trace.name.split(",")[0]
        if trace_name in trace_names:
            trace.showlegend = False
        else:
            trace.name = trace_name
            trace_names.append(trace_name)

    return fig


def add_tsne_embeddings(df):
    df_copy = df.copy()

    tsne = TSNE(n_components=2, random_state=0)
    embeddings = np.vstack(df.embeddings.values)
    embeddings_2d = tsne.fit_transform(embeddings)

    df_copy['x'] = embeddings_2d[:, 0]
    df_copy['y'] = embeddings_2d[:, 1]

    return df_copy


def add_umap_embeddings(df):
    df_copy = df.copy()

    # Create an instance of UMAP.
    umap_model = UMAP(n_components=2, random_state=42)
    embeddings = np.vstack(df.embeddings.values)
    embeddings_2d = umap_model.fit_transform(embeddings)

    df_copy['x'] = embeddings_2d[:, 0]
    df_copy['y'] = embeddings_2d[:, 1]

    return df_copy

all_logs = pd.read_csv("log_examples.csv")
all_logs.drop("embeddings", axis=1, inplace=True)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
all_logs["embeddings"] = list(model.encode(all_logs['response'].tolist()))

In [2]:
all_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1230 entries, 0 to 1229
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   response       1230 non-null   object
 1   incident_type  1230 non-null   object
 2   is_mh          1230 non-null   bool  
 3   model          1230 non-null   object
 4   embeddings     1230 non-null   object
dtypes: bool(1), object(4)
memory usage: 39.8+ KB


# Basic Exploration

## Log Examples

In [3]:
eg_cols = ["model", "incident_type", "is_mh", "response"]
all_logs[all_logs.model == "gpt3.5"][eg_cols].sample(5)

Unnamed: 0,model,incident_type,is_mh,response
102,gpt3.5,Harassment/Stalking,True,xxxxx has reported being harassed and stalked ...
46,gpt3.5,Domestic Disturbance,False,Caller reported a loud argument and crashing s...
207,gpt3.5,Domestic Abuse,False,Report: 34-year-old female named Jane Smith re...
36,gpt3.5,Theft,False,Caller reported a theft at their residence loc...
361,gpt3.5,Domestic Abuse,False,Neighbor reported hearing loud noises coming f...


In [4]:
eg_cols = ["model", "incident_type", "is_mh", "response"]
all_logs[all_logs.model == "falcon"][eg_cols].sample(5)

Unnamed: 0,model,incident_type,is_mh,response
915,falcon,Harassment/Stalking,False,Scenario: Abuse of power\n\n ...
1059,falcon,Drugs,False,\n\nNote if the victim is a black man (no...
838,falcon,Domestic Abuse,False,\n This report describes xxxxx's histor...
999,falcon,Fraud,True,"1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,..."
825,falcon,Assault,True,"- In a residential area, a person approached x..."


## Table 1

In [5]:
counts = pd.crosstab(all_logs['incident_type'], all_logs['is_mh'], margins=True)
counts.columns = ['is_mh=false', 'is_mh=true', 'Total']
counts

Unnamed: 0_level_0,is_mh=false,is_mh=true,Total
incident_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Assault,34,51,85
Domestic Abuse,105,86,191
Domestic Disturbance,143,83,226
Drugs,41,26,67
Fraud,14,34,48
Harassment/Stalking,97,136,233
Missing Person,32,53,85
Noise Complaint,25,50,75
Theft,42,73,115
Trespassing,41,37,78


# Top n-Grams

## Code:

In [6]:
from collections import Counter, defaultdict
import nltk
from nltk.corpus import stopwords
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

nltk.download('stopwords')
stop=set(stopwords.words('english'))

def plot_top_ngrams(df, model=None, incident_type=None, is_mh=None,
                    n=1, use_stop=False, stop_words=[], top_k=15):

    subset_features = {k: v for k, v in locals().items()
                       if k in {"model", "incident_type", "is_mh"} and v is not None}

    for feature_name, feature_value in subset_features.items():
        df = df[df[feature_name] == feature_value].copy()

    if use_stop:
        stop_words += list(stop)

    vec = CountVectorizer(ngram_range=(n, n),
                          stop_words=stop_words).fit(df.response)
    bag_of_words = vec.transform(df.response)
    bag_of_words = bag_of_words > 0
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    y, x = map(list, zip(*words_freq))
    x = np.array(x) / len(df.response)
    return sns.barplot(x=x[:top_k], y=y[:top_k], hue=y[:top_k])


def mask_words(corpus, words):
    # Create a regex pattern to match any of the words in "words" (case-insensitive)
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b', re.IGNORECASE)

    # Process each document in the corpus
    updated_corpus = [pattern.sub('xxxxx', doc) for doc in corpus]

    return updated_corpus


def interactive_top_ngrams_plot(df, use_stop=False, stop_words=[], top_k=15):
    # 1. Create dropdown widgets
    model_dropdown = widgets.Dropdown(
        options=['All'] + df['model'].dropna().unique().tolist(),
        value='All',
        description='Model:',
    )

    incident_type_dropdown = widgets.Dropdown(
        options=['All'] + df['incident_type'].dropna().unique().tolist(),
        value='All',
        description='Incident Type:',
    )

    is_mh_dropdown = widgets.Dropdown(
        options=['All'] + df['is_mh'].dropna().unique().tolist(),
        value='All',
        description='Is MH:',
    )

    n_dropdown = widgets.Dropdown(
        options=[1, 2, 3],
        value=1,
        description='n:',
    )

    output = widgets.Output()

    # 2. Define update function
    def update_plot(change):
        with output:
            clear_output(wait=True)
            model = None if model_dropdown.value == 'All' else model_dropdown.value
            incident_type = None if incident_type_dropdown.value == 'All' else incident_type_dropdown.value
            is_mh = None if is_mh_dropdown.value == 'All' else is_mh_dropdown.value
            n_value = n_dropdown.value

            plot_top_ngrams(df, model=model, incident_type=incident_type,
                            is_mh=is_mh, n=n_value, use_stop=use_stop,
                            stop_words=stop_words, top_k=top_k)
            plt.show()

    # 3. Observe changes in dropdowns and update plot
    model_dropdown.observe(update_plot, names='value')
    incident_type_dropdown.observe(update_plot, names='value')
    is_mh_dropdown.observe(update_plot, names='value')
    n_dropdown.observe(update_plot, names='value')

    # 4. Display widgets
    display(model_dropdown, incident_type_dropdown, is_mh_dropdown, n_dropdown, output)

    # Initial plot
    update_plot(None)

# Example usage:

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samrelins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Output:

In [7]:
interactive_top_ngrams_plot(all_logs, use_stop=True, stop_words=["xxxxx"])

Dropdown(description='Model:', options=('All', 'gpt3.5', 'gpt4', 'falcon', 'example'), value='All')

Dropdown(description='Incident Type:', options=('All', 'Theft', 'Domestic Disturbance', 'Trespassing', 'Vandal…

Dropdown(description='Is MH:', options=('All', False, True), value='All')

Dropdown(description='n:', options=(1, 2, 3), value=1)

Output()

In [8]:
for news in df['headline_text']:
    words=[w for w in word_tokenize(news) if (w not in stop)]

    words=[lem.lemmatize(w) for w in words if len(w)>2]

    corpus.append(words)

NameError: name 'df' is not defined

In [None]:
import gensim

In [None]:
import re

def remove_punctuation(string):
    """
    Strips non-alphanumeric and whitespace characters from a string.
    """
    pattern = r'[^a-zA-Z0-9\s]'
    stripped_string = re.sub(pattern, '', string)
    return stripped_string


corpus = (logs.log
          .apply(lambda x: remove_punctuation(x))
          .apply(lambda x: [word for word in word_tokenize(x)
                            if word not in stop_words])
          .apply(lambda x: [stemmer.stem(word) for word in x]))
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = [dic.doc2bow(doc) for doc in corpus]

  and should_run_async(code)


In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics = 10,
                                       id2word = dic,
                                       passes = 10,
                                       workers = 2)
lda_model.show_topics()

  and should_run_async(code)


[(0,
  '0.034*"suspect" + 0.033*"individu" + 0.024*"activ" + 0.023*"drug" + 0.023*"report" + 0.019*"suspici" + 0.016*"incid" + 0.015*"observ" + 0.014*"park" + 0.014*"notic"'),
 (1,
  '0.067*"person" + 0.035*"miss" + 0.027*"report" + 0.024*"seen" + 0.023*"last" + 0.021*"name" + 0.020*"friend" + 0.017*"concern" + 0.017*"famili" + 0.014*"locat"'),
 (2,
  '0.034*"safeti" + 0.033*"report" + 0.024*"concern" + 0.024*"involv" + 0.022*"individu" + 0.022*"domest" + 0.016*"request" + 0.016*"neighbor" + 0.016*"state" + 0.015*"physic"'),
 (3,
  '0.031*"report" + 0.022*"male" + 0.020*"appear" + 0.018*"individu" + 0.018*"femal" + 0.018*"mental" + 0.016*"arriv" + 0.016*"the" + 0.016*"health" + 0.015*"state"'),
 (4,
  '0.030*"report" + 0.026*"harass" + 0.026*"messag" + 0.023*"safeti" + 0.022*"victim" + 0.021*"individu" + 0.018*"receiv" + 0.017*"request" + 0.017*"fear" + 0.017*"call"'),
 (5,
  '0.038*"victim" + 0.023*"partner" + 0.023*"report" + 0.020*"abus" + 0.020*"physic" + 0.018*"polic" + 0.017*"fem

In [None]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dic)
vis

  and should_run_async(code)


# Embedding t-SNE

**Objective**: Reduce the dimensionality of the embeddings from the high-dimensional space to 2D space, facilitating visualization of the semantic distribution of logs.

- **Step 1: Calculate High-Dimensional Embeddings**
  - Use Sentence Transformer model ('paraphrase-MiniLM-L6-v2') to convert the 'response' text from each log into high-dimensional vector embeddings.

- **Step 2: Apply t-SNE for Dimensionality Reduction**
  - Use t-SNE (t-Distributed Stochastic Neighbor Embedding) to reduce the dimensionality of the embeddings from high-dimensional space to 2D space, preserving the pairwise similarities between points.

- **Step 7: Visualize 2D Embeddings**
  - The 2D embeddings have been visualized in a scatter plot, with hand-crafted examples higlighted to visually inspect the distribution and clustering of generated vs. genuine logs.

- **Output**:
  - The modified DataFrame with two additional columns: 'x' and 'y', representing the 2D embeddings of the text responses.
  - A visual representation of the semantic distribution of logs, aiding in the qualitative assessment of the synthetic data’s diversity and similarity to genuine examples.

## Code:

In [3]:
import plotly.express as px
import plotly.io as pio

pio.renderers.default = 'iframe'

def plot_embeddings_interactive(df, model="gpt3.5", color="incident_type"):

    df = df[df.model.isin([model, "example"])].copy()

    df['response'] = df['response'].apply(lambda x: insert_line_breaks(x, 10))

    # Create scatter plot
    fig = px.scatter(df, x='x', y='y',
                     color=color,
                     hover_name="response",
                     symbol='model',
                     symbol_sequence=["circle", "x"])

    fig = update_scatter_format(fig)

    # Show the plot
    fig.show()


all_logs_w_tsne_embeds = all_logs.pipe(add_tsne_embeddings)
x_min, x_max = all_logs_w_tsne_embeds.x.agg(["min", "max"])
y_min, y_max = all_logs_w_tsne_embeds.y.agg(["min", "max"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Output:

In [5]:
plot_embeddings_interactive(all_logs_w_tsne_embeds,
                            model="gpt3.5",
                            color="incident_type")

# Embedding UMAP

Methods/objectives are the same as t-SNE, just with different method to generate the 2d representation of the embeddings. UMAP is more efficient computationally, and is supposed to better represent global structure of the embeddings.

## Code:

In [6]:
import plotly.express as px


def plot_embeddings_interactive(df, models=[], x_model="example",
                                color="incident_type"):

    df = df[df.model.isin(models)].copy()

    df['response'] = df['response'].apply(lambda x: insert_line_breaks(x, 10))

    symbol_sequence = ["x" if model == x_model else "circle"
                       for model in df.model.unique()]
    # Create scatter plot
    fig = px.scatter(df, x='x', y='y',
                     color=color,
                     hover_name="response",
                     symbol='model',
                     symbol_sequence=symbol_sequence)

    fig = update_scatter_format(fig)

    # Show the plot
    fig.show()


all_logs_w_umap_embeds = all_logs.pipe(add_umap_embeddings)
x_min, x_max = all_logs_w_umap_embeds.x.agg(["min", "max"])
y_min, y_max = all_logs_w_umap_embeds.y.agg(["min", "max"])


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Results

In [7]:
plot_embeddings_interactive(all_logs_w_umap_embeds,
                            models = ["gpt4", "gpt3.5"],
                            x_model = "gpt4",
                            color="incident_type")

# Diversity Stats

**Objective**:
Calculate and analyze the diversity of synthetic police incident reports generated by different language models, categorized by incident types.

**Method**:
- Calculate pairwise cosine distances between embeddings to assess semantic diversity.
- Compute mean diversity scores for each unique model type and incident type in the DataFrame, and for aggregate models and full dataset. This is done by grouping the DataFrame by relevant features, and then calculating the diversity within each group.

**Output**:
A "table 1" style dataframe (`diversity_stats_df`) that displays the pairwise cosine distances stratified by model and incident type

## Code:

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances


def calculate_diversity(embeddings):
    distances = cosine_distances(embeddings)
    return np.mean(distances)


def diversity_stats(df):

    # Initialize results dataframe
    index = df['model'].unique().tolist() + ['all']
    columns = df['incident_type'].unique().tolist() + ['all']
    results = pd.DataFrame(index=index, columns=columns)

    # Calculate diversity for each model type
    for model_type in df['model'].unique():
        sub_df = df[df['model'] == model_type]
        for incident_type in df['incident_type'].unique():
            incident_sub_df = sub_df[sub_df['incident_type'] == incident_type]
            if not incident_sub_df.empty:
                embeds = np.vstack(incident_sub_df['embeddings'].values)
                results.at[model_type, incident_type] = calculate_diversity(embeds)
        # Calculate diversity for all incident types within this model type
        embeds = np.vstack(sub_df['embeddings'].values)
        results.at[model_type, 'all'] = calculate_diversity(embeds)

    # Calculate diversity for each incident type (across all model types)
    for incident_type in df['incident_type'].unique():
        incident_sub_df = df[df['incident_type'] == incident_type]
        if not incident_sub_df.empty:
            embeds = np.vstack(incident_sub_df['embeddings'].values)
            results.at['all', incident_type] = calculate_diversity(embeds)

    # Calculate diversity for all data combined
    results.at['all', 'all'] = calculate_diversity(np.vstack(df['embeddings'].values))

    return results


## Output:

In [9]:
diversity_stats(all_logs)


Unnamed: 0,Theft,Domestic Disturbance,Trespassing,Vandalism,Domestic Abuse,Drugs,Noise Complaint,Harassment/Stalking,Missing Person,Fraud,Assault,all
gpt3.5,0.384648,0.320541,0.303272,0.358015,0.324055,0.365189,0.260003,0.281971,0.390295,0.362537,0.342705,0.500655
gpt4,0.409654,0.296098,0.35797,0.315052,0.353337,0.417224,0.28083,0.33021,0.395169,0.369929,0.431817,0.498674
falcon,0.717294,0.746368,0.732304,0.676691,0.727187,0.732534,0.654845,0.695468,0.689948,0.692806,0.721182,0.759069
example,0.387959,0.350209,0.370989,0.0,0.378645,0.293464,0.0,0.326927,0.239076,0.0,0.0,0.569068
all,0.605949,0.599613,0.646186,0.54118,0.555577,0.547982,0.439558,0.428324,0.565347,0.580678,0.591551,0.640852


# Incident Type Similarities

**Objective**: Compute a similarity score for each log in the dataset, based on how semantically similar it is to the centroid of its respective 'incident_type' category.

**Step 1: Calculate Embeddings**
  - Use SentenceTransformer model to transform the 'response' text of each log into a high-dimensional vector (embedding).

**Step 2: Calculate Centroids**
  - For each unique 'incident_type', calculate the centroid of the embeddings by averaging them. Store these centroids.

**Step 3: Calculate Similarity Scores**
  - For each log, calculate the cosine similarity between its embedding and the centroid of its 'incident_type' category. Cosine similarity ranges from -1 (completely dissimilar) to 1 (completely similar).

**Output**:
  - The 'similarity_score' column reflects how semantically similar each log is to the average (centroid) of its category, providing a metric of typicality or alignment with other logs of the same incident type.

This method provides a way to quantify how well each log aligns with the central tendency of its category, which can be useful for understanding the diversity and consistency of your synthetic data.

## Code:

In [10]:
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_scores(df):

    # Calculate the centroid of all embeddings for each 'incident_type'
    centroids = df.groupby('incident_type')['embeddings'].apply(
        lambda emb_list: np.mean(emb_list, axis=0)
    )

    # Calculate cosine similarity of each example to its 'incident_type' centroid
    def calc_similarity_by_row(row):
        centroid = centroids.loc[row.incident_type]
        return cosine_similarity([row.embeddings], [centroid])[0][0]

    df['similarity_score'] = df.apply(calc_similarity_by_row, axis=1)

    return df


def plot_similarity_interactive(df, incident_type):

    df = df[df.incident_type == incident_type].copy()

    df['response'] = df['response'].apply(lambda x: insert_line_breaks(x, 10))

    # Create scatter plot
    fig = px.scatter(df, x='x', y='y',
                     color = "similarity_score",
                     color_continuous_scale="Turbo",
                     hover_name="response",
                     symbol='model',
                     symbol_sequence=["circle", "x"])


    fig = update_scatter_format(fig)

    fig.show()


logs_w_similarities = (all_logs.
                       pipe(calculate_similarity_scores).
                       pipe(add_umap_embeddings))


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Output:

In [11]:
plot_similarity_interactive(logs_w_similarities, "Domestic Disturbance")

# Outlier Detection

**Objective**: Identify and label anomalies in the dataset based on the semantic content of logs and their incident type.

**Step 1: Embed Text Data**
  - Convert text responses in the dataset to numerical embeddings using a pre-trained transformer model (SentenceTransformer with 'paraphrase-MiniLM-L6-v2').
  
**Step 2: Apply Isolation Forest**
  - Train an Isolation Forest model on embeddings for each "incident_type" category to detect and label anomolies within each category

**Output**: The original dataset with an additional 'is_anomaly' column, where True indicates an anomalous log within its respective incident type category.

This method allows for the direct identification and labeling of anomalies in your dataset, helping you to easily spot logs that are atypical or unusual within their specific incident type categories.

## Code:

In [20]:
import plotly.express as px
from sklearn.ensemble import IsolationForest

def add_category_outliers(df):

    df['is_outlier'] = False  # Initialize the outlier column
    for incident_type in df['incident_type'].unique():
        incident_mask = df['incident_type'] == incident_type

        if sum(incident_mask) > 2:  # Isolation Forest requires at least 3 samples
            clf = IsolationForest(contamination='auto')  # 'auto' will set threshold as 1/num_samples
            embeds = np.vstack(df[incident_mask].embeddings.values)
            preds = clf.fit_predict(embeds)
            df.loc[incident_mask, 'is_outlier'] = preds == -1

    return df


def plot_outliers_interactive(df):

    df = df.copy()

    df['response'] = df['response'].apply(lambda x: insert_line_breaks(x, 10))

    # Create scatter plot
    fig = px.scatter(df, x='x', y='y',
                     color="incident_type",
                     hover_name="response",
                     symbol='is_outlier',
                     symbol_sequence=["circle", "x"])

    fig = update_scatter_format(fig)

    fig.show()


logs_w_outliers = (all_logs[all_logs.model != "falcon"].
                   pipe(add_category_outliers).
                   pipe(add_umap_embeddings))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Output:

In [21]:
plot_outliers_interactive(logs_w_outliers)

# Perplexity


### Objective:
Evaluate the fidelity of synthetic police incident reports generated by different language models.

### Method:
1. **Perplexity Calculation**: Apply the `calculate_perplexity` function to add a 'perplexity' column to `all_logs`, representing the predictive uncertainty of a GPT-2 model for each text log.
2. **Visualization**: Create a Plotly box plot to visualize the distribution of perplexity scores across different models.

### Output:
- visual summary of model performance based on text generation fidelity.


## Code:

In [25]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def calculate_perplexity(df):
    # Load pre-trained model and tokenizer
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Ensure model is in evaluation mode
    model.eval()

    # Calculate perplexity for each response
    def get_perplexity(response):
        encodings = tokenizer(response, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**encodings)

        logits = outputs.logits
        shift_logits = logits[:, :-1, :].view(-1, logits.size(-1))
        shift_labels = encodings['input_ids'][:, 1:].view(-1)

        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits, shift_labels)
        perplexity = torch.exp(loss).item()

        return perplexity

    # Apply the function to the 'response' column and create a new 'perplexity' column
    df['perplexity'] = df['response'].apply(get_perplexity)

    return df

# Apply the function to your DataFrame
logs_w_perplexity = calculate_perplexity(all_logs)


## Output:

In [26]:
px.box(logs_w_perplexity, x="model", y="perplexity", color="model")

In [27]:
gpt_mask = logs_w_perplexity.model == "gpt3.5"
px.box(logs_w_perplexity[gpt_mask],
       x="incident_type",
       y="perplexity",
       color="incident_type")