<a href="https://colab.research.google.com/github/rmaacario/spatial-semantics-translation/blob/main/Error_Analysis_graphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the cudf.pandas IPython extension into this kernel.
%load_ext cudf.pandas

In [None]:
# Reload the cudf.pandas extension (the previous cell already loads it once).
%reload_ext cudf.pandas

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_encoder_decoder_attention():
    """Draw a schematic encoder / attention / decoder diagram with matplotlib.

    The figure holds three rounded boxes (ENC, ATT, DEC), the example source
    and target tokens, five connecting arrows and three blue row captions.
    Axes are hidden and the figure is shown with ``plt.show()``; nothing is
    returned.
    """
    fig, ax = plt.subplots(figsize=(12, 4))

    centred = dict(horizontalalignment='center', verticalalignment='center', fontsize=12)

    # (box left edge, caption x, caption) for the three components.
    for box_left, caption_x, caption in (
        (0.1, 0.15, 'ENC'),
        (0.35, 0.4, 'ATT'),
        (0.6, 0.65, 'DEC'),
    ):
        ax.add_patch(
            patches.FancyBboxPatch(
                (box_left, 0.5), 0.1, 0.2,
                boxstyle="round,pad=0.1", edgecolor="black", facecolor="#add8e6",
            )
        )
        ax.text(caption_x, 0.6, caption, weight='bold', **centred)

    # Example tokens: source sentence first, then the target-side tokens.
    for token_x, token_y, token, colour in (
        (0.1, 0.3, r'$<it> \, the \, cat \, on \, the \, mat$', 'black'),
        (0.75, 0.7, r'$il$', 'black'),
        (0.9, 0.7, r'$gatto$', 'red'),
        (0.75, 0.3, r'$<it> \, il$', 'black'),
    ):
        ax.text(token_x, token_y, token, color=colour, **centred)

    # Each pair is (xy, xytext) exactly as handed to ax.annotate: the first
    # two connect the boxes, the remaining three are the state/embedding arrows.
    for xy, xytext in (
        ((0.2, 0.6), (0.35, 0.6)),
        ((0.45, 0.6), (0.6, 0.6)),
        ((0.45, 0.6), (0.45, 0.8)),
        ((0.55, 0.8), (0.6, 0.6)),
        ((0.75, 0.6), (0.9, 0.6)),
    ):
        ax.annotate('', xy=xy, xytext=xytext,
                    arrowprops=dict(facecolor='black', arrowstyle='->'))

    # Row captions on the right-hand side.
    for caption_y, caption in ((0.65, 'states'), (0.35, 'embeddings'), (0.1, 'words')):
        ax.text(0.95, caption_y, caption, horizontalalignment='left',
                verticalalignment='center', fontsize=12, color='blue')

    # Unit square canvas without visible axes.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    plt.show()

# Draw the first diagram
draw_encoder_decoder_attention()


In [None]:
import os
import glob
import numpy as np
import pandas as pd
from plotnine import annotate, xlab, ylab, ggplot, aes, geom_tile, scale_fill_gradient, geom_text, theme_gray, theme_minimal, theme, element_text, coord_fixed, scale_fill_identity

In [None]:
import pandas as pd
from plotnine import *
from datetime import datetime

# Data for the timeline
# NOTE(review): the 'date' list below has 27 entries but the 'model' list has
# 38, and pandas requires every column passed in a dict to have the same
# length, so this constructor cannot succeed as written. The intended
# date/model pairing is not recoverable from this notebook - TODO: restore the
# missing dates (or store the data as explicit (date, model) pairs).
data = pd.DataFrame({
    'date': [
        '2019-10', '2020-05', '2020-10', '2021-04', '2021-06', '2021-07', '2021-08', '2021-09', '2021-10', '2021-12',
        '2022-01', '2022-02', '2022-03', '2022-04', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12',
        '2023-02', '2023-03', '2023-04', '2023-05', '2023-06', '2023-07', '2023-08',
        '2024-01'
    ],
    'model': [
        'T5', 'GPT-3', 'mT5', 'PanGu-α', 'CPM-2', 'Codex', 'ERNIE 3.0', 'Jurassic-1', 'HyperCLOVA', 'Yuan 1.0', 'Gopher', 'ERNIE 3.0 Titan', 'GLAM', 'LaMDA', 'WebGPT',
        'MT-NLG', 'AlphaCode', 'Chinchilla', 'PaLM', 'AlexaTM', 'Sparrow', 'U-PaLM', 'Flan-U-PaLM', 'BLOOM', 'ChatGPT',
        'OPT-IML', 'mT0', 'Galactica', 'GLM', 'OPT', 'UL2', 'Tk-Instruct', 'GPT-NeoX-20B', 'CodeGen', 'PaLM 2', 'Claude', 'Bard', 'Gemini'
    ]
})

# Convert date to datetime ('YYYY-MM' strings are parsed by pd.to_datetime)
data['date'] = pd.to_datetime(data['date'])

# Create the plot: every model is a point on a horizontal line at y=0 with its
# name nudged above it.
# NOTE(review): geom_segment starts and ends at (date, 0), so it appears to
# draw zero-length segments - confirm whether a vertical stem was intended.
timeline_plot = (
    ggplot(data, aes(x='date', y=0, label='model')) +
    geom_point(color='black') +
    geom_text(aes(label='model'), ha='left', nudge_y=0.3, size=8) +
    geom_segment(aes(xend='date', yend=0), linetype='solid') +
    scale_x_datetime(date_labels='%b\n%Y', date_breaks='3 months') +
    theme_minimal() +
    theme(
        axis_title_y=element_blank(),
        axis_text_y=element_blank(),
        axis_ticks_y=element_blank(),
        panel_grid_major_y=element_blank(),
        panel_grid_minor_y=element_blank(),
        axis_text_x=element_text(size=10),
        axis_title_x=element_text(size=12),
        plot_margin={'top': 10, 'right': 50, 'bottom': 30, 'left': 50}
    )
)

# Display the plot
print(timeline_plot)

In [None]:
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import norm

# Generate data for the standard normal distribution on [-4, 4]
x = np.linspace(-4, 4, 1000)
y = norm.pdf(x)

# Create a DataFrame
data = pd.DataFrame({'x': x, 'y': y})

# Create the plot: the central region |x| <= 1.96 is shaded light blue (the
# 95% confidence level) and both tails beyond +/-1.96 are shaded light gray.
# Each tail of a two-sided 95% interval holds (100 - 95) / 2 = 2.5% of the
# probability mass, so the tail annotations read '2.5%' (they previously said
# '2%', which does not add up to the 5% significance level).
p = (
    ggplot(data, aes(x='x', y='y')) +
    geom_line(color='black') +
    geom_ribbon(aes(ymin=0, ymax='y'), data=data[(x >= -1.96) & (x <= 1.96)], fill='lightblue') +
    geom_ribbon(aes(ymin=0, ymax='y'), data=data[(x < -1.96)], fill='lightgray') +
    geom_ribbon(aes(ymin=0, ymax='y'), data=data[(x > 1.96)], fill='lightgray') +
    annotate('text', x=0, y=0.15, label='95% Confidence Level', ha='center', size=10) +
    annotate('text', x=-3, y=0.06, label='Significance Level', ha='center', size=10) +
    annotate('text', x=3, y=0.06, label='Significance Level', ha='center', size=10) +
    annotate('text', x=-3, y=0.04, label='2.5%', ha='center', size=10) +
    annotate('text', x=3, y=0.04, label='2.5%', ha='center', size=10) +
    labs(x='Confidence Interval', y='') +
    theme_void() +
    theme(axis_text_y=element_blank(), axis_ticks_major_y=element_blank())
)

# Display the plot
print(p)

In [None]:
import pandas as pd
from plotnine import ggplot, aes, geom_rect, geom_text, theme_void, theme

# One row per cell of the 2x2 grid: rectangle corners plus the text shown in
# the middle of the cell (top row: framing type, bottom row: language examples).
rects = pd.DataFrame(
    [
        (0, 1, 1, 2, 'moldura de satélite\n↓\nMANEIRA no verbo'),
        (1, 2, 1, 2, 'moldura de verbo\n↓\nTRAJETO no verbo'),
        (0, 1, 0, 1, 'INGLÊS\nlimp, tip-toe, crawl'),
        (1, 2, 0, 1, 'PORTUGUÊS\nsubir, descer, entrar, sair'),
    ],
    columns=['xmin', 'xmax', 'ymin', 'ymax', 'label'],
)

# Assemble the figure layer by layer.
p = ggplot(rects)
# Unfilled cells with a black outline.
p = p + geom_rect(
    aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax'),
    fill=None, color='black', size=1,
)
# Italic label at the centre of every cell.
p = p + geom_text(
    aes(x='(xmin + xmax) / 2', y='(ymin + ymax) / 2', label='label'),
    ha='center', va='center', size=15, fontstyle='italic',
)
# No axes, no background, no outer margin.
p = p + theme_void()
p = p + theme(plot_margin=0)

# Display the plot
print(p)

In [None]:
# All annotation codes handled by the analysis; 'cc' marks a correct translation.
errors = ['cc', 'un', 'om', 'ad', 'ag', 'co',
         'wl', 'wt', 'an', 'gr', 'in', 'ie']

# Codes counted as spatial errors.
s_errors = ['sp', 'po', 'ws']

ie_errors = ['ie']

# Codes counted as non-spatial errors (everything in `errors` except 'cc').
ns_errors = ['un', 'om', 'ad', 'ag', 'co',
              'wl', 'wt', 'an', 'gr', 'in', 'ie']

# Every error code except 'cc', spatial codes first. dict.fromkeys removes
# duplicates while keeping first-seen order, so the list (and the column order
# of anything selected with it) is identical on every run; the previous
# set-based construction produced an arbitrary order.
error_types_to_include = [code for code in dict.fromkeys(s_errors + errors) if code != 'cc']

In [None]:
# Display the list of non-spatial error codes defined in the previous cell.
ns_errors

In [None]:
# Display name for each model identifier, in presentation order.
model_labels = dict([
    ('gemma:7b', 'Gemma-7B'),
    ('llama2:7b', 'LLaMA-2-7B'),
    ('llama2:13b', 'LLaMA-2-13B'),
    ('llama3', 'LLaMA-3-8B'),
    ('deepl', 'DeepL'),
    ('mistral', 'Mistral-7B'),
    ('amazon-stock', 'Amazon (Stock)'),
    ('amazon-custom', 'Amazon (Custom)'),
    ('googletrans', 'Google'),
    ('mixtral', 'Mixtral-8x7B'),
])

In [None]:
import pandas as pd

#folder_path = "/content/Untitled Folder"

# Load the merged Mixtral annotations into the notebook-wide `df`.
# NOTE(review): `df` is reassigned by several later cells, so cells that read
# `df` depend on which of them ran last.
df = pd.read_csv('/content/Mixtral-8x7b_merged.csv')
df

In [None]:
# Select the rows whose inner_id is 2797 and print the reference text of the first one.
dff = df[df['inner_id'] == 2797]
# The bare `df` below is not the last expression of the cell, so it displays nothing.
df

# .iloc[0] assumes at least one row matched the filter above.
print(dff['reference'].iloc[0])

In [None]:
# Reload `df` from the analysed TED Talks file (this replaces the frame loaded above),
# keep the rows whose spatial_sense contains 'Onto' and whose error_type is exactly 'ie',
# and show the inner_id of the first such row.
# NOTE(review): presumably spatial_sense has no missing values here; a NaN in the
# str.contains mask would make the boolean indexing fail - confirm against the CSV.
df = pd.read_csv('/content/TEDTalks.en_pt-br.Mixtral.ANALYZED.csv')
df_filtered = df[df['spatial_sense'].str.contains('Onto') & (df['error_type'] == 'ie')]
df_filtered['inner_id'].iloc[0]

In [None]:
# Folder scanned for CSV files by process_folder.
# NOTE(review): this is a relative path, while later cells use the absolute
# '/content/' and '/content' - confirm which one is intended.
folder_path = 'content/'

In [None]:
def extract_spatial_sense_label(spatial_sense):
    """Return the part of a ``spatial_sense`` value that precedes the first '('.

    Examples: ``'Across(3)'`` -> ``'Across'``; ``'Onto'`` -> ``'Onto'``.

    Parameters
    ----------
    spatial_sense : str or any
        A sense annotation such as ``'Into(3)'``.

    Returns
    -------
    The text before the first opening parenthesis. Non-string input (for
    example the float NaN pandas uses for an empty CSV cell) is returned
    unchanged instead of raising ``AttributeError``, so the function can be
    applied to a column with missing values.
    """
    if not isinstance(spatial_sense, str):
        return spatial_sense
    return spatial_sense.split('(')[0]

In [None]:
import glob
import pandas as pd

def process_folder(folder_path, column, model_labels):
    """Read every CSV in ``folder_path`` and count annotations per model file.

    For each file the comma-separated ``error_type`` cell is split into one row
    per code, legacy codes are folded into the categories used by the analysis,
    rows with unknown codes are dropped, and the remaining rows are counted per
    (``column``, ``error_type``, ``spatial_sense_label``).

    Parameters
    ----------
    folder_path : str
        Folder that is searched (non-recursively) for ``*.csv`` files.
    column : str
        Name of the column used as the first grouping key.
    model_labels : dict
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    grouped_dfs : dict
        Maps the model name of each file to its grouped count DataFrame
        (columns: ``column``, ``error_type``, ``spatial_sense_label``, ``count``).
    model_names : list
        Model names in processing order.
    unique_labels : numpy.ndarray
        Distinct non-missing ``spatial_sense_label`` values over ALL files
        (previously only the last file was inspected, and an empty folder
        raised ``NameError``).

    Notes
    -----
    Relies on the notebook-level lists ``errors`` and ``s_errors`` and on
    ``extract_spatial_sense_label``.
    """
    def _clean_codes(parts):
        # Keep only purely alphabetic codes; anything that is not a list of
        # strings (e.g. NaN from an empty cell) becomes an empty list.
        if not isinstance(parts, list) or not all(isinstance(part, str) for part in parts):
            return []
        return [part.strip() for part in parts if part.strip().isalpha()]

    # Sorted so the processing order (and therefore model_names) is reproducible.
    file_paths = sorted(glob.glob(folder_path + "/*.csv"))

    grouped_dfs = {}
    model_names = []
    label_chunks = []

    # Legacy codes merged into the categories used in the analysis.
    merged_codes = {'ha': 'in', 'wo': 'gr', 're': 'gr', 'lt': 'an'}
    known_codes = set(errors) | set(s_errors)

    for file_path in file_paths:
        print("Processing file:", file_path)  # Debugging: Print the file being processed
        df = pd.read_csv(file_path)

        # The model name is taken from the header of the fourth column.
        model_name = df.columns[3] if len(df.columns) > 3 else None
        print("Model name:", model_name)  # Debugging: Print the extracted model name
        if model_name:
            model_names.append(model_name)

        # One row per error code.
        df['error_type'] = df['error_type'].str.split(',').apply(_clean_codes)
        df = df.explode('error_type')
        df['error_type'] = df['error_type'].replace(merged_codes)
        df = df[df['error_type'].isin(known_codes)]

        # Drop spatial error codes attached to senses ending in (5) or (3).
        # The earlier expression compared the *spatial_sense* column with the
        # error codes, which never matches, so nothing was ever filtered out.
        sense_is_3_or_5 = (
            df['spatial_sense'].str.endswith('(5)', na=False)
            | df['spatial_sense'].str.endswith('(3)', na=False)
        )
        # .copy() so the column assignments below act on an independent frame.
        df = df[~(sense_is_3_or_5 & df['error_type'].isin(s_errors))].copy()

        # Tag the rows with the model name unless the file already has a 'Model' column.
        if 'Model' not in df.columns:
            df['Model'] = model_name

        # na_action='ignore' leaves missing senses as NaN instead of calling the helper on them.
        df['spatial_sense_label'] = df['spatial_sense'].map(extract_spatial_sense_label, na_action='ignore')
        label_chunks.append(df['spatial_sense_label'])

        # Count rows per (column, error code, sense label).
        grouped_df = df.groupby([column, 'error_type', 'spatial_sense_label']).size().reset_index(name='count')

        grouped_dfs[model_name] = grouped_df

    # Labels seen in any file, in first-seen order.
    if label_chunks:
        all_labels = pd.concat(label_chunks, ignore_index=True)
    else:
        all_labels = pd.Series(dtype=object)
    unique_labels = all_labels.dropna().unique()

    return grouped_dfs, model_names, unique_labels

In [None]:
import pandas as pd

# Sample DataFrame
data = {
    'spatial_sense': ['Across(3)', 'Across(5)', 'Across(1)', '3', '2'],
    'error_type': ['sp', 'po', 'ws', 'other', 'sp']
}
df = pd.DataFrame(data)

# pandas has no top-level `pd.groupby`; grouping is a DataFrame method. The
# sample has no numeric column to sum, so count the rows of each
# (spatial_sense, error_type) pair instead; the result has a 'size' column.
combined_df = df.groupby(['spatial_sense', 'error_type'], as_index=False).size()


print(df)

In [None]:
import os
import pandas as pd

# Directory containing the CSV files
folder_path = '/content/'

# Spatial error codes that are relabelled/aggregated below.
error_types = ['sp', 'ws', 'po']

# Read and concatenate all CSV files in the folder
# NOTE(review): every *.csv directly under folder_path is read, whatever its
# layout - presumably only annotation files live there; verify.
all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in all_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Ensure the 'count' column exists
if 'count' not in combined_df.columns:
    combined_df['count'] = 1  # Assign a default count of 1 if not present

# Update 'error_type' based on 'spatial_sense'
# Step 1: for these four senses, a spatial code ('sp', 'ws', 'po') is relabelled 'ie'.
# The comparison is on the whole cell, so only rows whose error_type is exactly
# one of the codes are affected.
combined_df.loc[combined_df['spatial_sense'].isin(['Into(3)', 'Onto(3)', 'Across(5)', 'Through(5)']) &
                combined_df['error_type'].isin(error_types), 'error_type'] = 'ie'

# Step 2: for senses ending in (1), (2) or (4), and for Across(3)/Through(3),
# an 'ie' label is relabelled 'sp'. This runs after step 1, but the sense sets
# of the two steps do not overlap, so step 2 never undoes step 1.
combined_df.loc[(combined_df['spatial_sense'].str.endswith('(1)') |
                 combined_df['spatial_sense'].str.endswith('(2)') |
                 combined_df['spatial_sense'].str.endswith('(4)') |
                 combined_df['spatial_sense'].isin(['Across(3)', 'Through(3)'])) &
                (combined_df['error_type'] == 'ie'), 'error_type'] = 'sp'

# Aggregate counts based on the updated 'error_type'
aggregated_df = combined_df.groupby(['spatial_sense', 'error_type'], as_index=False)['count'].sum()

# Separate the aggregated data into different DataFrames for each error type
df_sp = aggregated_df[aggregated_df['error_type'] == 'sp']
df_ws = aggregated_df[aggregated_df['error_type'] == 'ws']
df_po = aggregated_df[aggregated_df['error_type'] == 'po']

# Group by 'spatial_sense' and sum the counts for each error type
grouped_sp = df_sp.groupby('spatial_sense')['count'].sum().reset_index(name='count_sp')
grouped_ws = df_ws.groupby('spatial_sense')['count'].sum().reset_index(name='count_ws')
grouped_po = df_po.groupby('spatial_sense')['count'].sum().reset_index(name='count_po')

# Merge the grouped DataFrames to get the total counts by 'spatial_sense'
# (outer joins keep senses that occur with only some of the three codes)
total_counts = pd.merge(grouped_sp, grouped_ws, on='spatial_sense', how='outer')
total_counts = pd.merge(total_counts, grouped_po, on='spatial_sense', how='outer')

# Fill NaN values with 0 and calculate the total count
total_counts = total_counts.fillna(0)
total_counts['total_count'] = total_counts['count_sp'] + total_counts['count_ws'] + total_counts['count_po']

# Output the results
print("Grouped DataFrame for SP:")
print(grouped_sp)
print("\nGrouped DataFrame for WS:")
print(grouped_ws)
print("\nGrouped DataFrame for PO:")
print(grouped_po)
print("\nTotal Counts by Spatial Sense:")
print(total_counts)

In [None]:
def export_grouped_dfs(grouped_dfs, unique_labels, output_dir):
    """Write one CSV per spatial-sense label, combining the rows of all models.

    Parameters
    ----------
    grouped_dfs : dict
        Maps a model name to a DataFrame that has a ``spatial_sense_label`` column.
    unique_labels : iterable
        Labels to export; each becomes ``<label>_errors_corrects.csv``.
    output_dir : str
        Target folder, created if it does not exist.

    A label with no matching rows is reported on stdout and no file is written.
    When ``grouped_dfs`` is empty or holds only empty frames, every label is
    reported as having no data (previously ``pd.concat`` raised ``ValueError``).
    """
    os.makedirs(output_dir, exist_ok=True)

    # The combined frame does not depend on the label, so build it once
    # instead of once per loop iteration.
    non_empty = [df for df in grouped_dfs.values() if not df.empty]
    if non_empty:
        combined_df = pd.concat(non_empty, ignore_index=True)
    else:
        combined_df = pd.DataFrame(columns=['spatial_sense_label'])

    for label in unique_labels:
        df_filtered = combined_df[combined_df['spatial_sense_label'] == label]
        if not df_filtered.empty:
            file_path = os.path.join(output_dir, f"{label}_errors_corrects.csv")
            df_filtered.to_csv(file_path)
            print(f"Exported {label}_errors_corrects.csv to {output_dir}")
        else:
            print(f"No data found for label: {label}")

In [None]:
# Process the CSV folder and export one CSV per spatial-sense label.
# Despite its name, `grouped_df` is the dict (model name -> DataFrame) returned by process_folder.
# NOTE(review): the export folder here is the relative 'content/output', while
# main() below writes to the absolute '/content/output' - confirm which is intended.
grouped_df, model_names, unique_labels = process_folder(folder_path, 'spatial_sense', model_labels)
export_grouped_dfs(grouped_df, unique_labels, 'content/output')
grouped_df

In [None]:
# Display the dict of per-model grouped DataFrames produced by the previous cell.
grouped_df

In [None]:
def filter_and_export_total_error_corrects(df, column, model_labels, model_names, error_types_to_include):
    """Build a per-``column`` table of counts per error code plus totals and percentages.

    Parameters
    ----------
    df : pandas.DataFrame or dict of DataFrames
        Long-format counts with the columns ``column``, ``error_type`` and
        ``count``. A dict such as the one returned by ``process_folder`` is
        also accepted; its non-empty frames are concatenated first.
    column : str
        Column whose values become the index of the result.
    model_labels : dict, callable or other
        Mapping from index value to display label. Index values without an
        entry keep their original value (previously they became NaN). A
        callable is applied to the index; any other object leaves the index
        unchanged.
    model_names : list
        Accepted for interface compatibility; not used.
    error_types_to_include : iterable of str
        Error codes that count towards ``'Total of Errors'``.

    Returns
    -------
    pandas.DataFrame
        One column per error code (``'cc'`` and every requested code are always
        present, filled with 0 when absent from the data), followed by
        ``'Total of Corrects'``, ``'Total of Errors'``, ``'% of Corrects'`` and
        ``'% of Errors'``. Both percentages are computed from the same
        ``'Total of Errors'`` so that they always describe the reported totals.

    Raises
    ------
    ValueError
        If ``df`` is a dict that contains no non-empty DataFrame.
    """
    if isinstance(df, dict):
        frames = [frame for frame in df.values() if not frame.empty]
        if not frames:
            raise ValueError("no non-empty DataFrame to summarise")
        df = pd.concat(frames, ignore_index=True)

    error_types_to_include = list(error_types_to_include)

    # One pivot over all rows: splitting into 'cc' / non-'cc' pivots and adding
    # them back together left NaN cells for index values present in only one half.
    model_counts = pd.pivot_table(df, index=column, columns='error_type', values='count', aggfunc='sum', fill_value=0)

    # Guarantee that 'cc' and every requested error code exist as columns.
    code_columns = sorted(set(model_counts.columns) | set(error_types_to_include) | {'cc'})
    model_counts = model_counts.reindex(columns=code_columns, fill_value=0)

    model_counts['Total of Corrects'] = model_counts['cc']
    model_counts['Total of Errors'] = model_counts[error_types_to_include].sum(axis=1)

    # Rows without any correct or erroneous item yield NaN percentages (0 / 0).
    evaluated = model_counts['Total of Corrects'] + model_counts['Total of Errors']
    model_counts['% of Corrects'] = model_counts['Total of Corrects'] / evaluated * 100
    model_counts['% of Errors'] = 100 - model_counts['% of Corrects']

    # Substitute display labels into the index, keeping unknown keys as they are.
    if isinstance(model_labels, dict):
        model_counts.index = pd.Index(
            [model_labels.get(key, key) for key in model_counts.index],
            name=model_counts.index.name,
        )
    elif callable(model_labels):
        model_counts.index = model_counts.index.map(model_labels)

    return model_counts

In [None]:
def export_pivot_tables(model_counts, column, model_labels, model_names, errors, s_errors, ns_errors):
    """Format the totals table and derive the spatial / non-spatial breakdown.

    Parameters
    ----------
    model_counts : pandas.DataFrame
        Table with one column per error code plus ``'Total of Corrects'``,
        ``'Total of Errors'``, ``'% of Corrects'`` and ``'% of Errors'``.
    column, model_labels, model_names, errors
        Accepted for interface compatibility; not used.
    s_errors, ns_errors : list of str
        Codes counted as spatial / non-spatial errors. Codes without a column
        in ``model_counts`` count as 0 (previously a ``KeyError``).

    Returns
    -------
    (pivot_total, pivot_spatial_nonspatial) : tuple of DataFrames
        ``pivot_total`` is a copy of ``model_counts`` whose totals are whole
        number strings and whose percentages are ``'<value>%'`` strings rounded
        to two decimals. ``pivot_spatial_nonspatial`` holds the spatial and
        non-spatial totals and their share of ``'Total of Errors'``.
    """
    def _whole_number_text(value):
        # '100.0' -> '100'. str.rstrip('.0') must not be used for this: it
        # removes every trailing '0' and '.', turning '100.0' into '1'.
        if pd.isna(value):
            return str(value)
        try:
            number = float(value)
        except (TypeError, ValueError):
            return str(value)
        return str(int(number)) if number.is_integer() else str(value)

    # Pivot table with total of corrects, total of errors, percentage of corrects, and percentage of total errors
    pivot_total = model_counts.copy()

    # Columns for the spatial and the non-spatial error codes.
    spatial_errors = model_counts.reindex(columns=list(s_errors), fill_value=0)
    non_spatial_errors = model_counts.reindex(columns=list(ns_errors), fill_value=0)

    pivot_spatial_nonspatial = pd.DataFrame(index=model_counts.index)
    pivot_spatial_nonspatial['Total of Spatial Errors'] = spatial_errors.sum(axis=1)
    pivot_spatial_nonspatial['Total of Non-Spatial Errors'] = non_spatial_errors.sum(axis=1)

    # Shares are relative to the total taken from model_counts (still numeric at this point).
    pivot_spatial_nonspatial['Total of Errors'] = pivot_total['Total of Errors']
    pivot_spatial_nonspatial['% of Spatial Errors'] = ((pivot_spatial_nonspatial['Total of Spatial Errors'] / pivot_spatial_nonspatial['Total of Errors']) * 100).round(2).astype(str) + '%'
    pivot_spatial_nonspatial['% of Non-Spatial Errors'] = ((pivot_spatial_nonspatial['Total of Non-Spatial Errors'] / pivot_spatial_nonspatial['Total of Errors']) * 100).round(2).astype(str) + '%'

    pivot_total['Total of Corrects'] = pivot_total['Total of Corrects'].map(_whole_number_text)
    pivot_total['Total of Errors'] = pivot_total['Total of Errors'].map(_whole_number_text)
    pivot_total['% of Corrects'] = pivot_total['% of Corrects'].round(2).astype(str) + '%'
    pivot_total['% of Errors'] = pivot_total['% of Errors'].round(2).astype(str) + '%'

    return pivot_total, pivot_spatial_nonspatial

In [None]:
def filter_and_export_total_error_corrects2(df, column, model_labels, model_names, error_types_to_include):
    """Build one totals table per distinct value of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format counts with the columns ``column``, ``spatial_sense``,
        ``error_type`` and ``count``.
    column : str
        Column whose distinct non-missing values select the rows of each table.
    model_labels : iterable of str
        If one of these strings occurs inside an index value of a table, the
        index value is replaced by the first such string; other index values
        are kept.
    model_names : list
        Accepted for interface compatibility; not used.
    error_types_to_include : iterable of str
        Codes summed into ``'Total of Errors'``; codes absent from the data
        count as 0 (previously a ``KeyError``).

    Returns
    -------
    dict
        Maps each value of ``column`` to a pivot table indexed by
        ``spatial_sense`` with the per-code counts plus ``'Total of Errors'``,
        ``'Total of Corrects'``, ``'% of Corrects'`` and ``'% of Errors'``.
    """
    grouped_dfs = {}
    unique_labels = df[column].dropna().unique()
    print("Unique Labels:", unique_labels)

    wanted_codes = list(error_types_to_include)

    for label in unique_labels:
        # Select the rows of THIS label. The earlier code passed the whole
        # `unique_labels` array to str.contains, which is not a valid pattern;
        # exact comparison also avoids regex surprises with values like 'Across(3)'.
        filtered_df = df[df[column] == label]
        print(f"Filtered DataFrame for {label}:", filtered_df)

        if filtered_df.empty:
            continue

        pivot_table = pd.pivot_table(filtered_df, index='spatial_sense', columns='error_type', values='count', aggfunc='sum', fill_value=0)
        pivot_table['Total of Errors'] = pivot_table.reindex(columns=wanted_codes, fill_value=0).sum(axis=1)
        pivot_table['Total of Corrects'] = pivot_table['cc'] if 'cc' in pivot_table.columns else 0
        pivot_table['% of Corrects'] = (pivot_table['Total of Corrects'] / (pivot_table['Total of Corrects'] + pivot_table['Total of Errors']) * 100).round(2)
        pivot_table['% of Errors'] = 100 - pivot_table['% of Corrects']

        # Replace an index value by the first model label it contains, if any.
        pivot_table.index = pivot_table.index.map(
            lambda sense: next((key for key in model_labels if isinstance(sense, str) and key in sense), sense)
        )

        grouped_dfs[label] = pivot_table

    return grouped_dfs

In [None]:
def export_pivot_tables2(model_counts, column, model_labels, model_names, errors, s_errors, ns_errors):
    """Produce the formatted totals and spatial / non-spatial tables per index label.

    Parameters
    ----------
    model_counts : pandas.DataFrame
        Table with one column per error code plus ``'Total of Corrects'``,
        ``'Total of Errors'``, ``'% of Corrects'`` and ``'% of Errors'``.
    column, model_labels, model_names, errors
        Accepted for interface compatibility; not used.
    s_errors, ns_errors : list of str
        Codes counted as spatial / non-spatial errors; codes without a column
        count as 0 (previously a ``KeyError``).

    Returns
    -------
    dict
        Maps every distinct index label of ``model_counts``, plus NaN (for
        rows whose index is missing; possibly empty tables), to a tuple
        ``(pivot_total, pivot_spatial_nonspatial)``. Totals are whole-number
        strings and percentages ``'<value>%'`` strings rounded to two decimals.
    """
    def _whole_number_text(value):
        # '100.0' -> '100'. str.rstrip('.0') must not be used for this: it
        # removes every trailing '0' and '.', turning '100.0' into '1'.
        if pd.isna(value):
            return str(value)
        try:
            number = float(value)
        except (TypeError, ValueError):
            return str(value)
        return str(int(number)) if number.is_integer() else str(value)

    pivot_tables = {}

    # Distinct non-missing index labels, then NaN as one extra "label".
    unique_labels = model_counts.index.dropna().unique()
    unique_labels = unique_labels.append(pd.Index([np.nan]))

    for spatial_label in unique_labels:
        # NaN never compares equal to itself, so missing labels need isna().
        if pd.isna(spatial_label):
            filtered_model_counts = model_counts[model_counts.index.isna()]
        else:
            filtered_model_counts = model_counts[model_counts.index == spatial_label]

        # Pivot table with total of corrects, total of errors, percentage of corrects, and percentage of total errors
        pivot_total = filtered_model_counts.copy()

        # Columns for the spatial and the non-spatial error codes.
        spatial_errors = filtered_model_counts.reindex(columns=list(s_errors), fill_value=0)
        non_spatial_errors = filtered_model_counts.reindex(columns=list(ns_errors), fill_value=0)

        pivot_spatial_nonspatial = pd.DataFrame(index=filtered_model_counts.index)
        pivot_spatial_nonspatial['Total of Spatial Errors'] = spatial_errors.sum(axis=1)
        pivot_spatial_nonspatial['Total of Non-Spatial Errors'] = non_spatial_errors.sum(axis=1)

        pivot_spatial_nonspatial['Total of Errors'] = pivot_total['Total of Errors']
        pivot_spatial_nonspatial['% of Spatial Errors'] = ((pivot_spatial_nonspatial['Total of Spatial Errors'] / pivot_spatial_nonspatial['Total of Errors']) * 100).round(2).astype(str) + '%'
        pivot_spatial_nonspatial['% of Non-Spatial Errors'] = ((pivot_spatial_nonspatial['Total of Non-Spatial Errors'] / pivot_spatial_nonspatial['Total of Errors']) * 100).round(2).astype(str) + '%'

        pivot_total['Total of Corrects'] = pivot_total['Total of Corrects'].map(_whole_number_text)
        pivot_total['Total of Errors'] = pivot_total['Total of Errors'].map(_whole_number_text)
        pivot_total['% of Corrects'] = pivot_total['% of Corrects'].round(2).astype(str) + '%'
        pivot_total['% of Errors'] = pivot_total['% of Errors'].round(2).astype(str) + '%'

        # Save pivot tables for each spatial_sense_label
        pivot_tables[spatial_label] = (pivot_total, pivot_spatial_nonspatial)

    return pivot_tables

In [None]:
def main_with_spatial_sense(folder_path):
    """Run the per-spatial-sense pipeline on a folder and print the pivot tables.

    Steps: ``process_folder`` -> combine the per-model counts into one frame ->
    ``filter_and_export_total_error_corrects`` -> ``export_pivot_tables``.
    Uses the notebook-level ``model_labels``, ``errors``, ``s_errors``,
    ``ns_errors`` and ``error_types_to_include``.

    Parameters
    ----------
    folder_path : str
        Folder containing the annotated CSV files.
    """
    # process_folder returns THREE values (per-model counts, model names,
    # sense labels); unpacking two raised "too many values to unpack".
    grouped_errors, model_names, spatial_sense_labels = process_folder(folder_path, 'spatial_sense', model_labels)

    # The totals function works on one long DataFrame, not on the per-model dict.
    frames = [frame for frame in grouped_errors.values() if not frame.empty]
    if not frames:
        print("No annotated rows found in", folder_path)
        return
    combined_errors = pd.concat(frames, ignore_index=True)

    # The tables are indexed by spatial_sense, so the label mapping is the
    # identity: every sense keeps its own name as display label.
    sense_names = {sense: sense for sense in combined_errors['spatial_sense'].unique()}

    # Filter and export error types
    filtered_errors = filter_and_export_total_error_corrects(combined_errors, 'spatial_sense', sense_names, list(spatial_sense_labels), error_types_to_include)

    pivot_tables = export_pivot_tables(filtered_errors, 'spatial_sense', sense_names, list(spatial_sense_labels), errors, s_errors, ns_errors)
    print(pivot_tables)


if __name__ == "__main__":
    folder_path = "/content"  # Change this to your folder path
    main_with_spatial_sense(folder_path)



In [None]:
def main(folder_path):
    """Group the annotations found in ``folder_path`` and export one CSV per sense label.

    The per-model counts and the sense labels are printed, then written to
    ``/content/output`` by ``export_grouped_dfs``. The totals / pivot-table
    step is currently disabled.
    """
    output_dir = "/content/output"

    # Per-model grouped counts, the model names and the distinct sense labels.
    counts_by_model, model_names, sense_labels = process_folder(folder_path, 'spatial_sense', model_labels)
    print(counts_by_model, sense_labels)

    export_grouped_dfs(counts_by_model, sense_labels, output_dir)

if __name__ == "__main__":
    folder_path = "/content"  # Change this to your folder path
    main(folder_path)

In [None]:
#df = pd.read_csv("/content/pivot_total.csv")

#df = df[['Model', 'ad', 'ag', 'an', 'co', 'gr', 'ie', 'in', 'om', 'po', 'sp', 'un', 'wl', 'ws', 'wt']]
#df

In [None]:
from plotnine import *

def plot_heatmap(df, label):
    """Print a heatmap of ``count`` by error type and by ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format counts with at least ``error_type`` and ``count`` columns,
        plus the column shown on the y axis. ``df`` itself is NOT modified:
        codes are replaced by display names on a copy, so the caller can keep
        filtering on the raw codes afterwards.
    label : str
        Column shown on the y axis. ``"spatial_sense"`` titles the axis
        "Meaning", anything else titles it "Model". If ``label`` is not a
        column of ``df`` the ``spatial_sense`` column is used.

    Returns
    -------
    None. The plot is rendered through ``print``.
    """
    error_substitutions = {
        "cc": "Correct",
        "un": "Untranslated",
        "om": "Omission",
        "re": "Repetition",
        "ad": "Addition",
        "ag": "Agreement",
        "co": "Collocation",
        "wl": "Wrong Lexis",
        "wt": "Wrong Mood/Tense",
        "an": "Anglicism",
        "gr": "Wrong Grammar",
        "in": "Interlanguage",
        "sp": "Syntactic Proj.",
        "po": "Polysemy",
        "ws": "Wrong Sense",
        "ie": "Idiomatic Exp."
    }

    # Local table of display names (shadows the notebook-level model_labels).
    model_labels = {
        'gemma:7b': 'Gemma-7B',
        'llama2:7b': 'LLaMA-2-7B',
        'llama2:13b': 'LLaMA-2-13B',
        'llama3': 'LLaMA-3-8B',
        'deepl': 'DeepL',
        'mistral': 'Mistral-7B',
        'amazon-stock': 'Amazon (Stock)',
        'amazon-custom': 'Amazon (Custom)',
        'googletrans': 'Google',
        'mixtral': 'Mixtral-8x7B'
    }

    label_name = "Meaning" if label == "spatial_sense" else "Model"

    # Plot the column the axis title announces; the y aesthetic used to be
    # fixed to 'spatial_sense' even when the axis was titled "Model".
    y_column = label if label in df.columns else 'spatial_sense'

    # Work on a copy so the caller's frame keeps its raw codes.
    df = df.copy()

    # Replace identifiers by display names where the columns exist.
    if 'Model' in df.columns:
        df['Model'] = df['Model'].replace(model_labels)

    if 'error_type' in df.columns:
        df['error_type'] = df['error_type'].replace(error_substitutions)

    p = (
        ggplot(df, aes(x='error_type', y=y_column))
        + geom_tile(aes(fill='count'), color="black")
        + geom_text(aes(label='count'), color="white", size=8)  # Add labels to tiles
        + scale_fill_gradient(low="lightblue", high="darkblue")
        + coord_fixed()  # Fix aspect ratio
        + theme_gray()
        + theme(
            figure_size=(10, 6),  # Adjust figure size
            axis_text_x=element_text(angle=45, vjust=1, hjust=-5, size=8),  # Rotate x-axis labels
            legend_position='right',  # Position of legend
            legend_direction='vertical',  # Direction of legend
            legend_title=element_text(size=10),  # Legend title size
            legend_text=element_text(size=8),  # Legend text size
        )
        + xlab("Error Type") + ylab(label_name)  # Set the axis labels
    )

    print(p)
    return None

In [None]:
!pip install pyexcel_ods

In [None]:
import pandas as pd
import pyexcel_ods

# Read data from ODS file
data = pyexcel_ods.get_data('/content/bbb.ods')

# Extract data from the first sheet
# NOTE(review): the sheet is looked up by the name 'Sheet1' - confirm that the file uses that name.
df = pd.DataFrame(data['Sheet1'])

# Transpose the DataFrame
df = df.T

# Set the first row as the header
df.columns = df.iloc[0]

# Drop the first row after using it as the header
df = df[1:].reset_index(drop=True)

#df['spatial_sense'] = df['spatial_sense'].astype(str)

# `re` is only referenced by the commented-out line below.
import re

#df['spatial_sense_label'] = df['spatial_sense'].apply(lambda x: re.split(r'\(|\)', x)[0].strip())

#df.to_csv('/content/bb.csv')
df

In [None]:
# Optional clean-up steps, currently disabled:
#df.drop(df.columns[0], axis=1, inplace=True)  # Drop the first column
# Remove the 'TOTAL' and 'None' rows of the 'preposition' column
#df = df[~df['preposition'].isin(['TOTAL', 'None'])]
df

In [None]:
from plotnine import *
import pandas as pd

def plot_stacked_bar(df):
    """Print a stacked error/correct percentage bar chart for the 'Total' rows.

    ``df`` needs the columns ``preposition``, ``error_type``, ``error_count``
    and ``correct_count``. The counts are reshaped to long format, turned into
    integer percentages per (preposition, error_type), restricted to the rows
    whose preposition is 'Total', printed, and plotted per error type.

    Note: a later cell of this notebook defines another function with the same
    name, which replaces this one once that cell has run.
    """
    # Long format: one row per (preposition, error_type, count kind).
    long_df = df.melt(
        id_vars=['preposition', 'error_type'],
        value_vars=['error_count', 'correct_count'],
        var_name='type',
        value_name='count',
    )

    # Errors + corrects of each (preposition, error_type) pair.
    pair_keys = ['preposition', 'error_type']
    long_df['total_count'] = long_df.groupby(pair_keys)['count'].transform('sum')

    # Share of each count kind, rounded to a whole percent.
    share = long_df['count'] / long_df['total_count'] * 100
    long_df['percentage'] = share.apply(lambda value: round(value)).astype(int)

    # Only the aggregate rows are plotted.
    totals_only = long_df[long_df['preposition'] == 'Total']

    print(totals_only)

    # Layers of the chart, combined in the same order as they are listed.
    bars = geom_bar(position="stack", stat="identity")
    bar_labels = geom_text(
        aes(label='percentage.astype(str) + "%"'),
        position=position_stack(vjust=0.5),  # centre the text inside each segment
        size=12,
        color="white",
    )
    fill_colours = scale_fill_manual(values=["lightblue", "darkblue"], labels=["Correct", "Error"])
    styling = theme(
        figure_size=(10, 6),
        legend_position='right',
        legend_direction='vertical',
        legend_title=element_text(text='Type', size=12),
        legend_text=element_text(size=10),
        axis_text_x=element_text(size=10, weight = 'bold'),
        axis_text_y=element_text(size=10),
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
    )
    axis_titles = labs(x="Error Type", y="Percentage", )
    panels = facet_wrap('~preposition', scales='free_y', ncol=2)  # one panel per preposition

    preposition_plot = (
        ggplot(totals_only, aes(x='error_type', y='percentage', fill='type'))
        + bars
        + bar_labels
        + fill_colours
        + theme_gray()
        + styling
        + axis_titles
        + panels
    )

    print(preposition_plot)

# Call the function with the example DataFrame
plot_stacked_bar(df)


In [None]:
from plotnine import *
import pandas as pd

def plot_stacked_bar(df):
    """Print stacked error/correct percentage bars per spatial sense, one panel per preposition.

    ``df`` needs the columns ``preposition``, ``spatial_sense``, ``error_type``,
    ``error_count`` and ``correct_count``. Rows whose preposition is 'Total'
    are left out of the chart.

    Note: this definition replaces the function of the same name defined in an
    earlier cell of this notebook.
    """
    # Long format: one row per (preposition, spatial_sense, error_type, count kind).
    group_keys = ['preposition', 'spatial_sense', 'error_type']
    long_df = df.melt(
        id_vars=group_keys,
        value_vars=['error_count', 'correct_count'],
        var_name='type',
        value_name='count',
    )

    # Errors + corrects of each key combination.
    long_df['total_count'] = long_df.groupby(group_keys)['count'].transform('sum')

    # Share of each count kind, rounded to a whole percent.
    share = long_df['count'] / long_df['total_count'] * 100
    long_df['percentage'] = share.apply(lambda value: round(value)).astype(int)

    # The aggregate 'Total' rows are not part of this chart.
    per_preposition = long_df[long_df['preposition'] != 'Total']

    # Layers of the chart, combined in the same order as they are listed.
    bars = geom_bar(position="stack", stat="identity")
    bar_labels = geom_text(
        aes(label='percentage.astype(str) + "%"'),
        position=position_stack(vjust=0.5),  # centre the text inside each segment
        size=12,
        color="white",
    )
    fill_colours = scale_fill_manual(values=["lightblue", "darkblue"], labels=["Correct", "Error"])
    styling = theme(
        figure_size=(10, 6),
        legend_position='right',
        legend_direction='vertical',
        legend_title=element_text(text='Type', size=12),
        legend_text=element_text(size=10),
        axis_text_x=element_text(size=10, angle=45, vjust=1, hjust=-1),
        axis_text_y=element_text(size=10),
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
    )
    axis_titles = labs(x="Meaning", y="Percentage", )
    panels = facet_grid('~preposition', scales='free')  # one panel per preposition

    plot = (
        ggplot(per_preposition, aes(x='spatial_sense', y='percentage', fill='type'))
        + bars
        + bar_labels
        + fill_colours
        + theme_gray()
        + styling
        + axis_titles
        + panels
    )
    print(plot)

# Call the function with the example DataFrame
plot_stacked_bar(df)

In [None]:
# Draw the chart again with whichever plot_stacked_bar definition ran last.
plot_stacked_bar(df)

In [None]:
from plotnine import *

def plot_stacked_bar_dois(error_counts, label):
    """
    Print a stacked percentage bar chart of spatial senses per preposition label.

    Parameters
    ----------
    error_counts : DataFrame
        Must contain the columns 'count', 'error_counts', 'spatial_sense'
        and 'spatial_sense_label'. The frame is copied; the caller's object
        is left untouched.
    label : object
        Unused. Kept so existing calls with two arguments keep working.

    Returns
    -------
    None
        The plot is rendered through ``print``.
    """
    # Private copy with a fresh RangeIndex: avoids adding a 'percentage'
    # column to the caller's frame and guarantees that the idxmax() label
    # below identifies exactly one row even if the input index had duplicates.
    error_counts = error_counts.copy().reset_index(drop=True)

    # NOTE(review): the denominator column is literally named 'error_counts'.
    # An earlier draft of this function merged in a per-label 'total_count'
    # column instead -- confirm the caller really provides 'error_counts'.
    error_counts['percentage'] = (error_counts['count'] / error_counts['error_counts']) * 100

    # Three mid-blue shades, cycled over the senses in order of appearance.
    blue_shades = ["#4682b4", "#3C5291", "#4B68B8"]
    color_mapping = {
        sense: blue_shades[position % len(blue_shades)]
        for position, sense in enumerate(error_counts['spatial_sense'].unique())
    }

    # These four senses are always drawn in light blue.
    lightblue_conditions = error_counts['spatial_sense'].isin(['Across(5)', 'Through(5)', 'Into(3)', 'Onto(3)'])
    color_mapping.update(dict.fromkeys(error_counts[lightblue_conditions]['spatial_sense'], "lightblue"))

    # The sense of the single row with the largest raw count is dark blue;
    # this deliberately overrides any colour assigned above.
    max_spatial_sense = error_counts.loc[error_counts['count'].idxmax(), 'spatial_sense']
    color_mapping[max_spatial_sense] = 'darkblue'

    p = (
        ggplot(error_counts, aes(x='spatial_sense_label', y='percentage', fill='spatial_sense'))
        + geom_bar(position="stack", stat="identity")
        + geom_text(
            aes(label='round(percentage, 1).astype(str) + "%"'),
            position=position_stack(vjust=0.5),  # centre each label inside its segment
            size=12,
            color="white"
        )
        + scale_fill_manual(values=color_mapping)
        + theme_gray()
        + theme(
            figure_size=(10, 6),
            legend_position='right',
            legend_direction='vertical',
            legend_title=element_text(text='Sense', size=10),
            legend_text=element_text(size=10),
            axis_text_x=element_text(size=10),
            axis_text_y=element_text(size=10),
        )
        + labs(x="Preposition", y="Percentage", title="Specific Spatial vs. Non-spatial Error Types by Preposition Sense (%)")
    )

    print(p)

In [None]:
import pandas as pd
from plotnine import ggplot, aes, geom_bar, geom_text, scale_fill_manual, theme_gray, theme, element_text, labs, position_stack

def _color_top_sense_per_label(frame, color_mapping, color, eligible=None):
    """Assign `color` to the highest-percentage sense of every 'spatial_sense_label'.

    When `eligible` (a boolean mask over `frame`) is None, the candidates are
    the senses not yet present in `color_mapping`, re-evaluated for each label
    so that assignments made for one label affect the next.
    """
    for sense_label in frame['spatial_sense_label'].unique():
        mask = eligible
        if mask is None:
            mask = ~frame['spatial_sense'].isin(list(color_mapping))
        candidates = frame[(frame['spatial_sense_label'] == sense_label) & mask]
        if not candidates.empty:
            top_sense = candidates.loc[candidates['percentage'].idxmax(), 'spatial_sense']
            color_mapping[top_sense] = color


def plot_stacked_bar(error_counts, label):
    """
    Print a stacked bar chart of per-sense percentages for each preposition label.

    Parameters
    ----------
    error_counts : DataFrame
        Must contain the columns 'count', 'total_ocurrences', 'spatial_sense'
        and 'spatial_sense_label'. The frame is copied; the caller's object
        is left untouched.
    label : object
        Unused. Kept so existing two-argument calls keep working.

    Returns
    -------
    None
        The plot is rendered through ``print``.
    """
    # Private copy with a fresh RangeIndex: the caller's frame is not mutated
    # and every idxmax() label below points at exactly one row.
    error_counts = error_counts.copy().reset_index(drop=True)

    # Percentage of each sense, rounded to the nearest integer. A bare
    # astype(int) would truncate (66.7 -> 66), so round first.
    percentage = (error_counts['count'] / error_counts['total_ocurrences']) * 100
    error_counts['percentage'] = percentage.fillna(0).round().astype(int)
    error_counts['count'] = error_counts['count'].fillna(0).astype(int)

    # Spare shade handed to the largest still-uncoloured sense after the
    # per-label passes below.
    blue_shades = ["#3C5291"]

    color_mapping = {}

    # These four senses are always drawn in light blue.
    lightblue_conditions = error_counts['spatial_sense'].isin(['Across(5)', 'Through(5)', 'Into(3)', 'Onto(3)'])
    color_mapping.update(dict.fromkeys(error_counts[lightblue_conditions]['spatial_sense'], "lightblue"))

    # Pass 1: per label, the top non-light-blue sense becomes dark blue.
    _color_top_sense_per_label(error_counts, color_mapping, 'darkblue', eligible=~lightblue_conditions)
    # Passes 2 and 3: per label, the top sense that has no colour yet.
    _color_top_sense_per_label(error_counts, color_mapping, '#4B68B8')
    _color_top_sense_per_label(error_counts, color_mapping, '#3C5291')

    # Hand out the spare shades to the largest remaining senses.
    remaining_values = error_counts[~error_counts['spatial_sense'].isin(list(color_mapping))]
    for _, row in remaining_values.nlargest(len(blue_shades), 'percentage').iterrows():
        if row['spatial_sense'] not in color_mapping and blue_shades:
            color_mapping[row['spatial_sense']] = blue_shades.pop(0)

    # Anything still uncoloured falls back to light steel blue.
    default_color = "#B0C4DE"
    for sense in error_counts['spatial_sense'].unique():
        color_mapping.setdefault(sense, default_color)

    p = (
        ggplot(error_counts, aes(x='spatial_sense_label', y='percentage', fill='spatial_sense'))
        + geom_bar(position="stack", stat="identity")
        + geom_text(
            aes(label='round(percentage, 1).astype(str) + "%"'),
            position=position_stack(vjust=0.5),  # centre each label inside its segment
            size=12,
            color="white"
        )
        + scale_fill_manual(values=color_mapping)
        + theme_gray()
        + theme(
            figure_size=(10, 6),
            legend_position='right',
            legend_direction='vertical',
            legend_title=element_text(text='Sense', size=10),
            legend_text=element_text(size=10),
            axis_text_x=element_text(size=10),
            axis_text_y=element_text(size=10),
        )
        # The y aesthetic is 'percentage', so label the axis accordingly.
        + labs(x="Preposition", y="Percentage")
    )

    print(p)

In [None]:
plot_stacked_bar(df, 'error_type')

In [None]:
import pandas as pd
import glob

# Error-type codes kept for this analysis.
err = ['sp', 'po', 'ws', 'ie']
erros = ['sp', 'po', 'ws']  # codes relabelled as 'Spatial' below
erros2 = ['ie']             # code relabelled as 'Non-spatial' below

# Every CSV file in the working directory. Sorted because glob returns files
# in arbitrary order, which would make the row order of combined_df vary
# between runs.
file_paths = sorted(glob.glob('*.csv'))

# One filtered DataFrame per input file.
dfs = []

for file_path in file_paths:
    df = pd.read_csv(file_path)
    # Drop the index column left over from a previous to_csv(), if present.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Keep only the error types of interest. The explicit copy makes the
    # .loc assignments below write to an independent frame instead of a
    # slice of df (which triggers SettingWithCopyWarning and may not stick).
    df_filtered = df[df['error_type'].isin(err)].copy()

    # First pass: relabel by error code.
    df_filtered.loc[df_filtered['error_type'].isin(erros), 'error_type'] = 'Spatial'
    df_filtered.loc[df_filtered['error_type'] == 'ie', 'error_type'] = 'Non-spatial'

    # Second pass: the sense overrides the code-based label.
    sense = df_filtered['spatial_sense']
    df_filtered.loc[sense.isin(['Into(3)', 'Onto(3)', 'Across(5)', 'Through(5)']), 'error_type'] = 'Non-spatial'
    df_filtered.loc[sense.str.endswith('(1)') |
                    sense.str.endswith('(2)') |
                    sense.str.endswith('(4)') |
                    sense.isin(['Across(3)', 'Through(3)']), 'error_type'] = 'Spatial'

    dfs.append(df_filtered)

# Stack all per-file frames into one DataFrame with a fresh index.
combined_df = pd.concat(dfs, ignore_index=True)
combined_df

In [None]:
import pandas as pd
import os

# Directory containing CSV files
folder_path = '/content/data/'

# Read all CSV files and compile them into a single DataFrame. The listing is
# sorted because os.listdir returns entries in arbitrary order.
all_files = sorted(os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv'))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Number of rows per distinct 'spatial_sense' value, most frequent first.
result = df['spatial_sense'].value_counts().reset_index()
result.columns = ['spatial_sense', 'count']
print(result)

# Column-title row shared by the first-page and continuation headers.
header_row = "\\textbf{Spatial Sense} & \\textbf{Count} \\\\\n"

# longtable preamble: caption/label, first head, continuation head, the two
# feet. The caption line is followed directly by \toprule; a \midrule placed
# before it would draw an extra rule between caption and table.
table_lines = [
    "\\begin{longtable}{c c}\n",
    "\\caption{Occurrences of Spatial Senses.}\n",
    "\\label{tab:spatial_senses} \\\\\n",
    "\\toprule\n",
    header_row,
    "\\midrule\n",
    "\\endfirsthead\n",
    "\\toprule\n",
    header_row,
    "\\midrule\n",
    "\\endhead\n",
    "\\bottomrule\n",
    "\\endfoot\n",
    "\\bottomrule\n",
    "\\endlastfoot\n",
]

# One table row per spatial sense.
table_lines.extend(
    f"{sense} & {count} \\\\\n"
    for sense, count in zip(result['spatial_sense'], result['count'])
)
table_lines.append("\\end{longtable}\n")

# Export the result to a LaTeX file
with open('output.tex', 'w') as f:
    f.writelines(table_lines)

print("LaTeX table exported successfully.")


In [None]:
import pandas as pd

# Read the CSV file without a header row, so the single column is named 0.
df = pd.read_csv('/content/Phrasal_Verbs_Wikipedia.csv', header=None)

# Prepositions to look for
prepositions = ['across', 'through', 'into', 'onto']

# Preposition -> list of phrasal verbs containing it
preposition_phrasal_verbs = {prep: [] for prep in prepositions}

# Empty cells are read as NaN (a float), which has no .lower(); skip them.
for value in df[0].dropna():
    lowered = str(value).lower()
    # Case-insensitive substring match; a verb may be filed under several
    # prepositions.
    for prep in prepositions:
        if prep in lowered:
            preposition_phrasal_verbs[prep].append(value)

# Closing lines shared by every table. There is deliberately no blank line
# before \bottomrule: inside tabular a blank line becomes \par, which starts
# a new cell and makes the following rule fail with "Misplaced \noalign".
latex_table_footer = r"""\bottomrule
\end{tabular}
\end{table}
"""

# Summary table: one row per preposition with its number of matches.
latex_summary_table = r"""
\begin{table}[ht]
\centering
\caption{Prevalence of Prepositions in Phrasal Verbs}
\label{tab:prevalence}
\begin{tabular}{@{}lc@{}}
\toprule
\textbf{Preposition} & \textbf{Occurrences} \\
\midrule
"""

for prep, verbs in preposition_phrasal_verbs.items():
    latex_summary_table += f"{prep.capitalize()} & {len(verbs)} \\\\\n"

latex_summary_table += latex_table_footer

# Save the summary LaTeX table to a file
with open('preposition_summary_table.tex', 'w') as file:
    file.write(latex_summary_table)

# One single-column table per preposition listing the matching verbs.
for prep, verbs in preposition_phrasal_verbs.items():
    latex_list_table = r"""
\begin{table}[ht]
\centering
\caption{Phrasal Verbs Containing '""" + prep.capitalize() + r"""'}
\label{tab:""" + prep + r"""}
\begin{tabular}{@{}l@{}}
\toprule
\textbf{Phrasal Verb} \\
\midrule
"""
    for verb in verbs:
        latex_list_table += f"{verb} \\\\\n"

    latex_list_table += latex_table_footer

    # Save each preposition list LaTeX table to a file
    with open(f'{prep}_phrasal_verbs_table.tex', 'w') as file:
        file.write(latex_list_table)

In [None]:
from plotnine import *

# Two-letter error codes -> display labels. The embedded line breaks wrap
# long labels on the x axis.
error_substitutions = dict([
    ("cc", "Correct"),
    ("un", "Untranslated"),
    ("om", "Omission"),
    ("re", "Repetition"),
    ("ad", "Addition"),
    ("ag", "Agreement"),
    ("co", "Collocation"),
    ("wl", "Wrong Lexis"),
    ("wt", "Wrong Mood/\nTense"),
    ("an", "Anglicism"),
    ("gr", "Grammar/\northography"),
    ("in", "Interlanguage/\ncode-switching"),
    ("sp", "Syntactic\nProjection"),
    ("po", "Polysemy"),
    ("ws", "Wrong Sense"),
    ("ie", "Idiomatic\nExpression"),
])

# Long format: one row per (Model, Error Type) pair, with the codes expanded
# to their display labels and the counts cast to integers.
df_melted = (
    df.melt(id_vars=['Model'], var_name='Error Type', value_name='Count')
    .assign(**{
        'Error Type': lambda long_df: long_df['Error Type'].replace(error_substitutions),
        'Count': lambda long_df: long_df['Count'].astype(int),
    })
)

# Heatmap: one tile per (error type, model), shaded and annotated by count.
p = ggplot(df_melted, aes(x='Error Type', y='Model', fill='Count'))
p = p + geom_tile()
p = p + geom_text(aes(label='Count'), color="white", size=10)
p = p + scale_fill_gradient(low="lightblue", high="darkblue")
p = p + coord_fixed()
p = p + theme_gray()
p = p + theme(
    figure_size=(10, 6),
    axis_text_x=element_text(angle=45, vjust=1, hjust=-5, size=9),
    legend_position='right',
    legend_direction='vertical',
    legend_title=element_text(size=10),
    legend_text=element_text(size=10),
)
p = p + xlab("Error Type") + ylab("Model")

# Render the plot
print(p)

In [None]:
df

In [None]:
result_by_spatial_sense['error_type'].unique()

In [None]:
result_by_model = group_by_model_and_error(folder_path)
result_by_model

In [None]:
plot_heatmap(df, "spatial_sense")

In [None]:
plot_heatmap(result_by_model, "Model")

In [None]:
import pandas as pd
from plotnine import ggplot, aes, geom_tile, geom_text, theme_minimal

# Toy 2x2 contingency table in long form: one row per cell.
data = dict(
    Category1=["A", "A", "B", "B"],
    Category2=["X", "Y", "X", "Y"],
    Count=[10, 15, 20, 25],
)

df = pd.DataFrame(data)

# One black-bordered tile per cell, filled and annotated by its count.
p = ggplot(df, aes(x='Category1', y='Category2', fill='Count'))
p = p + geom_tile(color='black')
p = p + geom_text(aes(label='Count'), size=12)
p = p + theme_minimal()

# Render the plot
print(p)

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# 4x2 contingency table of error counts.
# NOTE(review): row/column meaning is not stated in this cell -- confirm
# against the per-preposition cells.
data = np.array([[7, 25],
                 [112, 55],
                 [8, 3],
                 [73, 32]])

# Chi-square test of independence on the table.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
import numpy as np
from scipy.stats import chi2_contingency

# ACROSS

# 2x2 contingency table of error counts.
data = np.array([[25, 7],
                 [45, 14]])

# Chi-square test of independence. For a 2x2 table (dof == 1) SciPy applies
# Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
# INTO

# 2x2 contingency table of error counts.
data = np.array([[55, 112],
                 [134, 497]])

# Chi-square test of independence. For a 2x2 table (dof == 1) SciPy applies
# Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
# ONTO

# 2x2 contingency table of error counts.
data = np.array([[3, 8],
                 [18, 20]])

# Chi-square test of independence. For a 2x2 table (dof == 1) SciPy applies
# Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
# THROUGH

# 2x2 contingency table of error counts.
data = np.array([[32, 73],
                 [101, 263]])

# Chi-square test of independence. For a 2x2 table (dof == 1) SciPy applies
# Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
# TOTAL

# 2x2 contingency table of error counts.
data = np.array([[787, 298],
                 [200, 115]])

# Chi-square test of independence. For a 2x2 table (dof == 1) SciPy applies
# Yates' continuity correction by default.
chi2, p, dof, expected = chi2_contingency(data)

# Same five output lines as separate print calls would give.
print(
    "Chi-square statistic: " + str(chi2),
    "p-value: " + str(p),
    "Degrees of freedom: " + str(dof),
    "Expected frequencies:",
    expected,
    sep="\n",
)

In [None]:
!pip install ggplot

In [None]:
import numpy as np
import pandas as pd

# Observed counts, laid out as a 2x2 table.
actual_values = np.array([[115, 200],
                          [298, 787]])

# Expected frequencies for the same four cells.
expected_frequencies = np.array([[92.925, 222.075],
                                 [320.075, 764.925]])

# Row-major 1-D copies, so position i in each lines up with label i below.
actual_values_flat = actual_values.flatten()
expected_frequencies_flat = expected_frequencies.flatten()

# Long-form frame: the four observed cells followed by the four expected ones.
df = pd.DataFrame({
    'Categories': (['Errors'] * 2 + ['Corrects'] * 2) * 2,
    'Subcategory': ['Spatial', 'Non-Spatial'] * 4,
    'Frequency': np.round(np.concatenate((actual_values_flat, expected_frequencies_flat)), 6),
    'Type': ['Actual'] * 4 + ['Expected'] * 4,
})

# Show the DataFrame
print(df)


In [None]:
from plotnine import *
import pandas as pd

# Cell labels in row-major order: one (category, subcategory) pair for every
# entry of the flattened 2x3 tables below.
categories = ['Corrects', 'Corrects', 'Corrects', 'Errors', 'Errors', 'Errors']
subcategories = ['Onto(i)', 'Onto(ii)', 'Onto(iii)', 'Onto(i)', 'Onto(ii)', 'Onto(iii)']

# Observed counts: one row per category, one column per sense.
actual_values = np.array([
    [12, 6, 20],
    [14, 7, 28]
])

# Expected frequencies for the same cells.
expected_frequencies = np.array([
    [11.35632184, 5.67816092, 20.96551724],
    [14.64367816, 7.32183908, 27.03448276]
])

# The labels must line up one-to-one with the flattened cells.
assert len(categories) == len(subcategories) == actual_values.size == expected_frequencies.size

# One 'Actual' and one 'Expected' record per table cell. The arrays are
# flattened before zipping: zipping the 2-D arrays directly would iterate
# over their two rows only and attach the first two labels to whole rows.
data = []
for cat, subcat, actual, expected in zip(
    categories, subcategories, actual_values.flatten(), expected_frequencies.flatten()
):
    data.append({'Categories': cat, 'Subcategory': subcat, 'Frequency': actual, 'Type': 'Actual'})
    data.append({'Categories': cat, 'Subcategory': subcat, 'Frequency': expected, 'Type': 'Expected'})

# Create DataFrame
df = pd.DataFrame(data)

# Round the Frequency values to two decimal places
df['Frequency'] = df['Frequency'].round(2)

print(df)


# Dodged bars (observed next to expected), one facet per sense.
gg = (
    ggplot(df, aes(x='Categories', y='Frequency', fill='Type')) +
    geom_bar(stat='identity', position='dodge') +
    facet_wrap('~Subcategory', scales='free') +
    labs(x='Categories', y='Frequencies') +
    # 'Actual' sorts before 'Expected', so the first colour/label pair
    # applies to the observed bars.
    scale_fill_manual(values=["darkblue", "lightblue"], labels=["Observed", "Expected"])
)

print(gg)

In [None]:
# Observed counts: rows are Corrects / Errors, columns are
# Non-Spatial / Spatial (matching the label lists used for the frame below).
actual_values = np.array([
    [787, 298],
    [200, 115]
])

# Expected frequencies for the same cells.
expected_frequencies = np.array([
    [764.925, 320.075],
    [222.075, 92.925]
])

# Row-major 1-D copies, so position i lines up with label i below.
actual_values_flat = actual_values.flatten()
expected_frequencies_flat = expected_frequencies.flatten()

# Long-form frame: the four observed cells followed by the four expected ones.
df = pd.DataFrame({
    'Categories': ['Corrects', 'Corrects', 'Errors', 'Errors', 'Corrects', 'Corrects', 'Errors', 'Errors'],
    'Subcategory': ['Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial'],
    'Frequency': np.concatenate([actual_values_flat, expected_frequencies_flat]).round(6),
    'Type': ['Observed', 'Observed', 'Observed', 'Observed', 'Expected', 'Expected', 'Expected', 'Expected']
})

# Round the Frequency values to two decimal places (used as bar labels).
df['Frequency'] = df['Frequency'].round(2)

print(df)


# Dodged bars (observed next to expected), one facet per subcategory.
gg = (
    ggplot(df, aes(x='Categories', y='Frequency', fill='Type')) +
    geom_bar(stat='identity', position='dodge') +
    geom_text(aes(label='Frequency'), color='white', position=position_dodge(width=0.9), size=10, va='top') +
    facet_wrap('~Subcategory', scales='free') +
    labs(x='Categories', y='Frequency') +
    # Colours are keyed by level. Positional values/labels are matched to
    # the sorted levels ('Expected', 'Observed'), which labelled the
    # expected bars "Observed" and vice versa.
    scale_fill_manual(values={"Observed": "darkblue", "Expected": "lightblue"})
)

print(gg)


In [None]:
import numpy as np
import pandas as pd
from plotnine import scale_fill_manual, theme_gray, element_text, theme, ggplot, position_dodge, aes, geom_bar, geom_tile, geom_text, theme_minimal, scale_fill_gradient, labs, facet_wrap

# Observed counts: rows are Corrects / Errors, columns are
# Non-Spatial / Spatial (matching the label lists used for the frame below).
actual_values = np.array([
    [787, 298],
    [200, 115]
])

# Expected frequencies for the same cells.
expected_frequencies = np.array([
    [764.925, 320.075],
    [222.075, 92.925]
])

# Row-major 1-D copies, so position i lines up with label i below.
actual_values_flat = actual_values.flatten()
expected_frequencies_flat = expected_frequencies.flatten()

# Long-form frame: the four observed cells followed by the four expected ones.
df = pd.DataFrame({
    'Categories': ['Corrects', 'Corrects', 'Errors', 'Errors', 'Corrects', 'Corrects', 'Errors', 'Errors'],
    'Subcategory': ['Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial', 'Non-Spatial', 'Spatial'],
    'Frequency': np.concatenate([actual_values_flat, expected_frequencies_flat]).round(6),
    'Type': ['Observed', 'Observed', 'Observed', 'Observed', 'Expected', 'Expected', 'Expected', 'Expected']
})

print(df)

# Bar plot to show observed vs expected, one facet per subcategory.
bar_plot = (
    ggplot(df, aes(x='Categories', y='Frequency', fill='Type')) +
    geom_bar(stat='identity', position=position_dodge(width=0.9)) +
    geom_text(aes(label='Frequency'), position=position_dodge(width=0.9), size=8, va='bottom') +
    facet_wrap('~Subcategory', scales='free') +
    labs(y='Count', fill='Type') +
    theme_gray() +
    # Colours are keyed by level. Positional values/labels are matched to
    # the sorted levels ('Expected', 'Observed'), which labelled the
    # expected bars "Observed" and vice versa. The bar colours themselves
    # stay as previously rendered (observed dark, expected light).
    scale_fill_manual(values={"Observed": "darkblue", "Expected": "lightblue"})
)

print(bar_plot)


In [None]:
# @title Default title text
import pandas as pd
import numpy as np
import scipy.stats as stats
from plotnine import ggplot, aes, geom_bar, geom_text, labs, theme_minimal, geom_line, geom_vline, annotate, theme, element_text, position_dodge, geom_area

# Data for observed and expected absences
data = {
    'Day': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
    'Observed': [23, 16, 14, 19, 28],
    'Expected': [20, 20, 20, 20, 20]
}

df = pd.DataFrame(data)

# Bar plot to show observed vs expected
bar_plot = (
    ggplot(df.melt(id_vars='Day'), aes(x='Day', y='value', fill='variable')) +
    geom_bar(stat='identity', position=position_dodge(width=0.9)) +
    geom_text(aes(label='value'), position=position_dodge(width=0.9), size=8, va='bottom') +
    labs(title='Observed vs Expected Absences', y='Count', fill='Type') +
    theme_minimal() +
    theme(axis_text_x=element_text(rotation=45, hjust=1))
)

# Goodness-of-fit over the five weekday categories: degrees of freedom are
# categories - 1. One value drives the density curve, the critical value and
# the annotation so that they cannot disagree.
chi_square_dof = len(df) - 1

# Data for chi-square distribution plot
x = np.linspace(0, 20, 400)
y = stats.chi2.pdf(x, df=chi_square_dof)

# Critical value at alpha = 0.05 for the same degrees of freedom
critical_value = stats.chi2.ppf(0.95, df=chi_square_dof)

# DataFrame for the chi-square distribution
chi_square_df = pd.DataFrame({'x': x, 'y': y})

# Density curve with the rejection region (right of the critical value)
# shaded red and the rest shaded light blue.
chi_square_plot = (
    ggplot(chi_square_df, aes(x='x', y='y')) +
    geom_line(color='black') +
    geom_vline(xintercept=critical_value, color='red', linetype='dashed') +
    geom_area(aes(x='x', y='y'), data=chi_square_df[chi_square_df['x'] >= critical_value], fill='red', alpha=0.3) +
    geom_area(aes(x='x', y='y'), data=chi_square_df[chi_square_df['x'] < critical_value], fill='lightblue', alpha=0.3) +
    annotate('text', x=critical_value + 2.2, y=max(y)/2, label=f'Critical value\n{round(critical_value, 2)}', color='red') +
    annotate('text', x=critical_value - 5, y=max(y)/2, label='Do Not Reject', color='black') +
    annotate('text', x=critical_value + 3, y=0.025, label='Reject', color='red') +
    annotate('text', x=17, y=max(y) * 0.9, label=r'$\alpha = 0.05$', color='black') +
    annotate('text', x=17, y=max(y) * 0.7, label=f'df = {chi_square_dof}', color='black', size=10) +
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    labs(x=r'$\chi^2$', y='Density') +
    theme_minimal()
)

# Display the plots
print(bar_plot)
print(chi_square_plot)

In [None]:
pd