In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean, cosine
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from statsmodels.graphics.mosaicplot import mosaic
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
import scipy.linalg as la

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three prepared frames (RACE train, RACE test, Dolly CQA) from parquet.
race_train_df = pd.read_parquet('./data/race_train_prepared.parquet')
race_test_df = pd.read_parquet('./data/race_test_prepared.parquet')
dolly_cqa_df = pd.read_parquet('./data/dolly_cqa_prepared.parquet')

In [None]:
from readability import Readability

def compute_readability(text):
    """Return the Gunning Fog score of ``text``.

    Builds a ``Readability`` object for the text, evaluates its
    ``gunning_fog()`` metric and returns that result's ``score`` attribute.
    """
    return Readability(text).gunning_fog().score

# Add a 'gf_score' column (Gunning Fog score of 'text') to each of the three frames.
race_train_df['gf_score'] = race_train_df['text'].apply(compute_readability)
race_test_df['gf_score'] = race_test_df['text'].apply(compute_readability)
dolly_cqa_df['gf_score'] = dolly_cqa_df['text'].apply(compute_readability)

In [None]:
# Write the frames, which now carry the 'gf_score' column, to parquet without the index.
race_train_df.to_parquet('./data/race_train_df_readability.parquet', index=False)
race_test_df.to_parquet('./data/race_test_df_readability.parquet', index=False)
dolly_cqa_df.to_parquet('./data/dolly_cqa_df_readability.parquet', index=False)

In [None]:
# Reload the frames written by the previous cell.
# NOTE(review): presumably lets a session start here without re-scoring every text - confirm.
race_train_df = pd.read_parquet('./data/race_train_df_readability.parquet')
race_test_df = pd.read_parquet('./data/race_test_df_readability.parquet')
dolly_cqa_df = pd.read_parquet('./data/dolly_cqa_df_readability.parquet')

In [None]:
# Histogram (100 bins) of the Gunning Fog scores in the RACE training frame.
race_train_df['gf_score'].hist(bins=100)

In [None]:
# Histogram (100 bins) of the Gunning Fog scores in the Dolly frame.
dolly_cqa_df['gf_score'].hist(bins=100)

In [None]:
# Keep only RACE rows whose Gunning Fog score is at most 35 (the lower bound of 5
# is left disabled). `.copy()` makes each result an independent frame, so the
# columns assigned to it in later cells do not trigger SettingWithCopyWarning.
# race_train_df = race_train_df[race_train_df['gf_score'] >= 5]
race_train_df = race_train_df[race_train_df['gf_score'] <= 35].copy()

# race_test_df = race_test_df[race_test_df['gf_score'] >= 5]
race_test_df = race_test_df[race_test_df['gf_score'] <= 35].copy()

In [None]:
# Same histogram as before, now on the RACE training frame after the gf_score <= 35 filter.
race_train_df['gf_score'].hist(bins=100)

In [None]:
# def equal_frequency_binning(data, num_bins):
#     """
#     Bucketize the data into bins with approximately equal number of data points.
    
#     Parameters:
#     - data (list or np.array): The data to be bucketized.
#     - num_bins (int): Number of bins desired.
    
#     Returns:
#     - bins (list of tuples): List of intervals representing the bins.
#     """
    
#     if not isinstance(data, np.ndarray):
#         data = np.array(data)
    
#     # Sort the data
#     sorted_data = np.sort(data)
    
#     # Calculate bin edges using quantiles
#     bin_edges = [np.percentile(sorted_data, i) for i in np.linspace(0, 100, num_bins+1)]
    
#     # Create bins as tuples of (start, end)
#     bins = [(bin_edges[i], bin_edges[i+1]) for i in range(len(bin_edges)-1)]
    
#     return bins

# def bin_data(data, bins, bin_names):
#     """
#     Convert continuous data into categorical data using specified bins and bin names.
    
#     Parameters:
#     - data (list or np.array): The data to be bucketized.
#     - bins (list of tuples): List of intervals representing the bins.
#     - bin_names (list of str): Names for each bin.
    
#     Returns:
#     - categorical_data (list of str): Categorical representation of the data.
#     """
    
#     if not isinstance(data, np.ndarray):
#         data = np.array(data)
    
#     if len(bins) != len(bin_names):
#         raise ValueError("Number of bins and bin names should be the same.")
    
#     # Initialize an empty list to store the categorical data
#     categorical_data = []
    
#     # Loop over each data point to assign it to a bin
#     for value in data:
#         assigned = False
#         for i, (start, end) in enumerate(bins):
#             if start <= value < end or (i == len(bins) - 1 and value == end):
#                 categorical_data.append(bin_names[i])
#                 assigned = True
#                 break
#         if not assigned:
#             categorical_data.append('Unknown')
    
#     return categorical_data

def bucketize_by_quantiles(df, column_name, n_buckets):
    """
    Split a continuous column into equal-frequency buckets labelled "L1" .. "L<n_buckets>".

    Parameters:
    - df: pandas DataFrame
    - column_name: Name of the column holding the continuous values
    - n_buckets: Number of quantile buckets to create

    Returns:
    - The categorical Series produced by pd.qcut; "L1" is the lowest-valued bucket
    """
    values = df[column_name]
    level_names = ["L" + str(level) for level in range(1, n_buckets + 1)]
    return pd.qcut(values, q=n_buckets, labels=level_names)

# def bucketize(df, column_name):
#     # Create an ordered index based on the values in the column
#     ordered_index = df[column_name].sort_values().index
    
#     # Round the continuous values to nearest integer
#     df['rounded'] = df[column_name].round().astype(int)
    
#     # Apply rolling window on ordered values to compute the middle value for each window
#     bucketized = df.loc[ordered_index, 'rounded'].rolling(window=3, center=True).apply(lambda x: x[1], raw=True)
    
#     # Fill NaN values for beginning edge with the first valid bucket value
#     first_valid_bucket = bucketized.dropna().iloc[0]
#     bucketized.iloc[:bucketized.first_valid_index()] = first_valid_bucket
    
#     # Fill NaN values for ending edge with the last valid bucket value
#     last_valid_bucket = bucketized.dropna().iloc[-1]
#     bucketized.iloc[bucketized.last_valid_index()+1:] = last_valid_bucket
    
#     # Convert the bucketized values to integers
#     bucketized = bucketized.astype(int)
    
#     # Convert the integer values to strings by prepending "L" to them
#     bucketized = 'L' + bucketized.astype(str)
    
#     # Restore the original order
#     bucketized = bucketized.reindex(df.index)
    
#     # Drop the temporary 'rounded' column
#     df.drop('rounded', axis=1, inplace=True)
    
#     return bucketized

In [None]:
# bins_race_train = equal_frequency_binning(race_train_df['gf_score'], num_bins=15)
# race_train_df['gf_score_categ'] = bin_data(race_train_df['gf_score'], bins_race_train, ['L' + str(i) for i in range(1,16)])

# bins_race_test = equal_frequency_binning(race_test_df['gf_score'], num_bins=15)
# race_test_df['gf_score_categ'] = bin_data(race_test_df['gf_score'], bins_race_test, ['L' + str(i) for i in range(1,16)])

# bins_dolly = equal_frequency_binning(dolly_cqa_df['gf_score'], num_bins=15)
# dolly_cqa_df['gf_score_categ'] = bin_data(dolly_cqa_df['gf_score'], bins_dolly, ['L' + str(i) for i in range(1,16)])

# race_train_df['gf_score_categ'] = race_train_df['gf_score'].apply(lambda x: str(round(x)))
# race_test_df['gf_score_categ'] = race_test_df['gf_score'].apply(lambda x: str(round(x)))
# dolly_cqa_df['gf_score_categ'] = dolly_cqa_df['gf_score'].apply(lambda x: str(round(x)))

# Bucket each frame's Gunning Fog scores into 10 quantile levels (L1..L10).
# The quantile edges are computed separately for every frame, so the same label
# can cover different score ranges in different frames.
race_train_df['gf_score_categ'] = bucketize_by_quantiles(race_train_df, 'gf_score', 10)
race_test_df['gf_score_categ'] = bucketize_by_quantiles(race_test_df, 'gf_score', 10)
dolly_cqa_df['gf_score_categ'] = bucketize_by_quantiles(dolly_cqa_df, 'gf_score', 10)

In [None]:
# Readability levels present in the RACE training frame, ordered by numeric suffix.
labels = sorted(race_train_df['gf_score_categ'].unique(), key=lambda level: int(level[1:]))
labels

In [None]:
# Readability levels present in the RACE test frame, ordered by numeric suffix.
labels = sorted(race_test_df['gf_score_categ'].unique(), key=lambda level: int(level[1:]))
labels

In [None]:
# Readability levels present in the Dolly frame, ordered by numeric suffix.
labels = sorted(dolly_cqa_df['gf_score_categ'].unique(), key=lambda level: int(level[1:]))
labels

In [None]:
def train_lr_classifier(df, embeddings_col, y_col):
    """Fit a multinomial logistic-regression classifier on embedding vectors.

    Parameters:
    - df: pandas DataFrame holding the features and the target
    - embeddings_col: Name of the column whose cells are the per-row embedding vectors
    - y_col: Name of the column with the class labels

    Returns:
    - le: LabelEncoder fitted on df[y_col]
    - clf: LogisticRegression fitted on the stacked embeddings and the encoded labels
    """
    # Stack the per-row vectors into one 2-D feature matrix
    features = np.array(df[embeddings_col].tolist())
    targets = df[y_col]

    le = LabelEncoder()
    encoded_targets = le.fit_transform(targets)

    # clf = LogisticRegressionCV(cv=5, scoring='f1_macro', max_iter=1000, n_jobs=-1)
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=20000)
    clf.fit(features, encoded_targets)

    return le, clf

def get_predictions(df, le, clf, embeddings_col, y_col):
    """Predict labels for ``df``, print a classification report and plot the confusion matrix.

    Parameters:
    - df: pandas DataFrame holding the features and the true labels
    - le: LabelEncoder already fitted on the training labels
    - clf: fitted classifier exposing predict() and predict_proba()
    - embeddings_col: Name of the column whose cells are the per-row embedding vectors
    - y_col: Name of the column with the true class labels

    Returns:
    - y_pred: predicted labels, decoded back to their original values
    - y_pred_proba: the array returned by clf.predict_proba

    Raises:
    - ValueError (from le.transform) if df[y_col] contains a label the encoder was not fitted on
    """
    # Stack the per-row vectors into one 2-D feature matrix
    X = np.array(df[embeddings_col].tolist())

    # Round-trip the true labels through the encoder: validates them against the
    # fitted classes and yields them in the same form as the decoded predictions.
    y_test = le.inverse_transform(le.transform(df[y_col]))

    y_pred = le.inverse_transform(clf.predict(X))
    y_pred_proba = clf.predict_proba(X)

    print(classification_report(y_test, y_pred))

    # Use every class the encoder knows, in its fixed order. Taking the labels
    # from df[y_col].unique() dropped classes that were predicted but never
    # occur among the true labels, and made the axis order depend on row order.
    labels = list(le.classes_)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    plt.figure(figsize=(7,5))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    return y_pred, y_pred_proba

def create_clusters(df, n_clusters, embeddings_col='embeddings_bge_large'):
    """Cluster the rows of ``df`` with KMeans on their embedding vectors.

    Parameters:
    - df: pandas DataFrame holding the embeddings
    - n_clusters: Number of KMeans clusters
    - embeddings_col: Name of the column whose cells are the per-row embedding
      vectors (defaults to the column that was previously hard-coded)

    Returns:
    - Array with one cluster id per row of df, in row order
    """
    # Stack the per-row vectors into one 2-D matrix, as the classifier helpers do
    X = np.array(df[embeddings_col].tolist())
    # Fixed random_state keeps the cluster assignment reproducible between runs
    clusters = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(X)
    return clusters.labels_

In [None]:
# Fit the difficulty classifier on the RACE training embeddings; `le` is the label
# encoder whose integer codes `clf` was trained on.
le, clf = train_lr_classifier(df=race_train_df, embeddings_col='embeddings_bge_large', y_col='difficulty')

In [None]:
# Evaluate on the RACE test frame and store the predictions next to the data.
y_pred, y_pred_proba = get_predictions(df=race_test_df, le=le, clf=clf, embeddings_col='embeddings_bge_large', y_col='difficulty')

race_test_df['predicted_difficulty'] = y_pred
# predict_proba columns follow clf.classes_ (the encoded labels). Decode them to
# name each probability column instead of assuming positions 0/1/2 are C/H/M.
for proba_idx, class_label in enumerate(le.inverse_transform(clf.classes_)):
    race_test_df[f'difficulty_proba_{class_label}'] = y_pred_proba[:, proba_idx]

race_test_df['cluster'] = create_clusters(df=race_test_df, n_clusters=10)

race_test_df

In [None]:
# Apply the RACE-trained classifier to the Dolly embeddings (no true labels here,
# so no report is printed) and store the predictions next to the data.
X = np.array(dolly_cqa_df['embeddings_bge_large'].tolist())
y_pred_encoded = clf.predict(X)
y_pred_proba = clf.predict_proba(X)
y_pred = le.inverse_transform(y_pred_encoded)

dolly_cqa_df['predicted_difficulty'] = y_pred
# predict_proba columns follow clf.classes_ (the encoded labels). Decode them to
# name each probability column instead of assuming positions 0/1/2 are C/H/M.
for proba_idx, class_label in enumerate(le.inverse_transform(clf.classes_)):
    dolly_cqa_df[f'difficulty_proba_{class_label}'] = y_pred_proba[:, proba_idx]

dolly_cqa_df['cluster'] = create_clusters(df=dolly_cqa_df, n_clusters=10)

dolly_cqa_df

In [None]:
def plot_boxplots(df, title):
    """Draw one box of 'gf_score' per 'gf_score_categ' level, ordered L1, L2, ...

    Parameters:
    - df: pandas DataFrame with 'gf_score' and 'gf_score_categ' columns
    - title: Title of the figure
    """
    plt.figure(figsize=(10, 6))

    # Order the levels by their numeric suffix; a plain string sort would put 'L10' before 'L2'.
    level_order = sorted(df['gf_score_categ'].unique(), key=lambda level: int(level[1:]))
    sns.boxplot(x='gf_score_categ', y='gf_score', data=df, order=level_order)

    plt.title(title)
    plt.show()

In [None]:
# Spread of gf_score inside each readability level of the RACE test frame.
plot_boxplots(df=race_test_df, title='Boxplots of gf_score by gf_score_categ - RACE Dataset')

In [None]:
# Spread of gf_score inside each readability level of the Dolly frame.
plot_boxplots(df=dolly_cqa_df, title='Boxplots of gf_score by gf_score_categ - Dolly Dataset')

In [None]:
# def plot_mosaic(df, title):
#     # Set the aesthetic style of the plots
#     sns.set_style("whitegrid")

#     # Plotting Mosaic Plot
#     plt.figure(figsize=(12, 6))
#     mosaic(df.sort_values('predicted_difficulty'), ['predicted_difficulty', 'gf_score_categ'], title=title)
#     plt.show()

def plot_stacked(df, title):
    """Plot the readability-level mix within each predicted difficulty as a 100% stacked bar chart.

    Parameters:
    - df: pandas DataFrame with 'predicted_difficulty' and 'gf_score_categ' columns
    - title: Title of the figure

    Returns:
    - contingency_table: raw counts (not percentages), rows ordered M, H, C and
      columns ordered L1, L2, ...
    """
    contingency_table = pd.crosstab(df['predicted_difficulty'], df['gf_score_categ'])
    # A difficulty that was never predicted has no crosstab row. Add it as zero
    # counts rather than NaN so the returned table remains a table of counts.
    contingency_table = contingency_table.reindex(['M', 'H', 'C'], fill_value=0)

    # Order the readability levels by their numeric suffix ('L2' before 'L10')
    labels = sorted(df['gf_score_categ'].dropna().unique(), key=lambda x: int(x[1:]))
    contingency_table = contingency_table.reindex(columns=labels, fill_value=0)

    # Normalize each row to percentages so the bars are comparable across difficulties
    contingency_table_percentage = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100

    contingency_table_percentage.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
    plt.title(title)
    plt.ylabel('Percentage (%)')
    plt.show()
    return contingency_table


In [None]:
# plot_mosaic(df=race_test_df, title='Mosaic Plot of difficulty vs gf_score - RACE Dataset')

In [None]:
# plot_mosaic(df=dolly_cqa_df, title='Mosaic Plot of difficulty vs gf_score - Dolly Dataset')

In [None]:
# Stacked bar plot for the RACE test frame; keeps the count table for the statistics cells below.
contingency_table_race = plot_stacked(df=race_test_df, title='Stacked Bar Plot of difficulty vs gf_score - RACE Dataset')

In [None]:
# Stacked bar plot for the Dolly frame; keeps the count table for the statistics cells below.
contingency_table_dolly = plot_stacked(df=dolly_cqa_df, title='Stacked Bar Plot of difficulty vs gf_score - Dolly Dataset')

In [None]:
def compute_correlation(contingency_table):
    """Chi-squared test of independence and Cramér's V for a contingency table.

    Parameters:
    - contingency_table: 2-D table of observed counts (DataFrame or array)

    Returns:
    - chi2: the chi-squared statistic
    - p: the p-value of the test
    - cramers_v: sqrt(chi2 / (n * min(rows - 1, cols - 1))), or NaN when the
      table has a single row or a single column (the measure is undefined there)
    """
    # One call is enough: the statistic is reused for Cramér's V below
    chi2, p, _, _ = chi2_contingency(contingency_table)

    n = contingency_table.sum().sum()
    n_rows, n_cols = contingency_table.shape
    min_dim = min(n_rows - 1, n_cols - 1)
    if min_dim == 0 or n == 0:
        # Avoid dividing by zero for degenerate tables
        cramers_v = float('nan')
    else:
        cramers_v = (chi2 / (n * min_dim))**0.5

    return chi2, p, cramers_v

In [None]:
# Association between predicted difficulty and readability level on the RACE test frame.
chi2, p, cramers_v = compute_correlation(contingency_table_race)
print('Chi-squared: ', chi2, ' | p-value: ', p)
print("Cramer's V: ", cramers_v)

In [None]:
# Association between predicted difficulty and readability level on the Dolly frame.
chi2, p, cramers_v = compute_correlation(contingency_table_dolly)
print('Chi-squared: ', chi2, ' | p-value: ', p)
print("Cramer's V: ", cramers_v)

In [None]:
def correspondence_analysis(contingency_table):
    """SVD-based correspondence analysis of a contingency table.

    The standardized residuals (observed - expected) / sqrt(expected), with the
    expected counts taken under row/column independence, are decomposed by SVD.

    Parameters:
    - contingency_table: 2-D table of observed counts

    Returns:
    - row_coords: left singular vectors scaled by the singular values (one row per table row)
    - col_coords: right singular vectors scaled by the singular values (one row per table column)
    - explained_variance: share of the total squared singular values carried by each dimension

    NOTE(review): the coordinates are U*D and V*D of the count-based residuals, with no
    further rescaling by row/column masses - confirm this is the intended scaling.
    """
    grand_total = contingency_table.sum().sum()

    # Counts expected if rows and columns were independent
    expected = np.outer(contingency_table.sum(axis=1), contingency_table.sum(axis=0)) / grand_total

    residuals = (contingency_table - expected) / np.sqrt(expected)

    left_vecs, singular_values, right_vecs_t = la.svd(residuals, full_matrices=False)

    # Broadcasting scales column k of each matrix by the k-th singular value
    row_coords = left_vecs * singular_values
    col_coords = right_vecs_t.T * singular_values

    inertia = singular_values**2
    explained_variance = inertia / sum(inertia)

    return row_coords, col_coords, explained_variance


def plot_correspondance_analysis(contingency_table, title, row_label, column_label):
    """Draw the correspondence-analysis bi-plot (first two dimensions) of a contingency table.

    Parameters:
    - contingency_table: DataFrame of counts; its index and columns supply the point labels
    - title: Title of the figure
    - row_label: Legend entry for the row points (drawn as red circles)
    - column_label: Legend entry for the column points (drawn as blue squares)
    """
    row_coords, col_coords, explained_variance = correspondence_analysis(contingency_table)

    plt.figure(figsize=(8, 4))
    plt.scatter(row_coords[:, 0], row_coords[:, 1], color='red', marker='o', label=row_label, alpha=0.6)
    plt.scatter(col_coords[:, 0], col_coords[:, 1], color='blue', marker='s', label=column_label, alpha=0.6)

    # Label every point, nudged by 0.05 so the text does not sit on the marker
    for idx, label in enumerate(contingency_table.index):
        plt.annotate(label, (row_coords[idx, 0] + 0.05, row_coords[idx, 1] + 0.05), color='red')
    for idx, label in enumerate(contingency_table.columns):
        plt.annotate(label, (col_coords[idx, 0] + 0.05, col_coords[idx, 1] + 0.05), color='blue')

    # Dashed lines through the origin
    plt.axhline(0, color='gray', linestyle='--')
    plt.axvline(0, color='gray', linestyle='--')
    plt.title(title)

    plt.xlabel('Dimension 1: {:.2f}% Explained Variance'.format(explained_variance[0]*100))
    plt.ylabel('Dimension 2: {:.2f}% Explained Variance'.format(explained_variance[1]*100))
    plt.legend()
    plt.grid(True)
    plt.show()

# def plot_correspondance_analysis(contingency_table, title):
#     # Perform correspondence analysis
#     row_coords, col_coords, explained_variance = correspondence_analysis(contingency_table)

#     # Define row colors
#     row_colors = ['red', 'green', 'blue']

#     # Plotting the bi-plot
#     plt.figure(figsize=(12, 8))

#     # Plot and annotate rows
#     for idx, label in enumerate(contingency_table.index):
#         plt.scatter(row_coords[idx, 0], row_coords[idx, 1], color=row_colors[idx], marker='o', alpha=0.6)
#         plt.annotate(label, (row_coords[idx, 0] + 0.05, row_coords[idx, 1] + 0.05), color=row_colors[idx])
    
#     # Plot and annotate columns based on the closest row point
#     for idx_col, label_col in enumerate(contingency_table.columns):
#         distances = np.linalg.norm(row_coords - col_coords[idx_col], axis=1)
#         closest_row_idx = np.argmin(distances)
#         plt.scatter(col_coords[idx_col, 0], col_coords[idx_col, 1], color=row_colors[closest_row_idx], marker='s', alpha=0.6)
#         plt.annotate(label_col, (col_coords[idx_col, 0] + 0.05, col_coords[idx_col, 1] + 0.05), color=row_colors[closest_row_idx])

#     plt.axhline(0, color='gray', linestyle='--')
#     plt.axvline(0, color='gray', linestyle='--')
#     plt.title(title)
   
#     plt.xlabel('Dimension 1: {:.2f}%'.format(explained_variance[0]*100))
#     plt.ylabel('Dimension 2: {:.2f}%'.format(explained_variance[1]*100))
    
#     # Create legend for row colors
#     legend_elements1 = [plt.Line2D([0], [0], color=row_colors[idx], marker='o', linestyle='', markersize=10, label=label) 
#                         for idx, label in enumerate(contingency_table.index)]
    
#     # Create legend for row and column labels
#     legend_elements2 = [plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=10, label='Rows (difficulty)'),
#                         plt.Line2D([0], [0], color='black', marker='s', linestyle='', markersize=10, label='Columns (gf_score_categ)')]
    
#     # Place both legends outside the plot, top-right
#     legend1 = plt.legend(handles=legend_elements1, loc="upper left", bbox_to_anchor=(1, 1), title="Rows (difficulty)")
#     legend2 = plt.legend(handles=legend_elements2, loc="upper left", bbox_to_anchor=(1, 0.75))
    
#     # Add the first legend manually to the current Axes
#     plt.gca().add_artist(legend1)
    
#     plt.grid(True)
#     plt.tight_layout()
#     plt.show()


In [None]:
# Bi-plot of predicted difficulty (rows) against readability level (columns) for the RACE test frame.
plot_correspondance_analysis(contingency_table=contingency_table_race, title='Correspondence Analysis Bi-plot - RACE Dataset',
                             row_label='Rows (predicted reading comprehension)', column_label='Columns (computed readability score)')

In [None]:
# Bi-plot of predicted difficulty (rows) against readability level (columns) for the Dolly frame.
plot_correspondance_analysis(contingency_table=contingency_table_dolly, title='Correspondence Analysis Bi-plot - Dolly Dataset',
                             row_label='Rows (predicted reading comprehension)', column_label='Columns (computed readability score)')

In [None]:
# Print every Dolly text predicted as difficulty 'C' whose readability level is 'L8'.
df = dolly_cqa_df[dolly_cqa_df['predicted_difficulty'].isin(['C'])]
df = df[df['gf_score_categ']=='L8']

for text, categ in zip(df['text'], df['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
def generate_percentage_table(*series_list, series_names=None, order=None):
    """
    Generates a table with the percentage of instances for each category level, for each series.

    Parameters:
    *series_list: One or more pandas Series.
    series_names: List of names for the series. Defaults to ['Series 1', 'Series 2', ...]
    order: List specifying the desired order of category levels in the table.

    Returns:
    DataFrame: One row per category level and one column per series; each cell is the
    rounded share of that level in that series, formatted like '42%' ('0%' when absent).

    Raises:
    ValueError: If the number of series and the number of series names differ.
    """

    if not series_names:
        series_names = [f"Series {i+1}" for i in range(len(series_list))]

    if len(series_list) != len(series_names):
        raise ValueError("The number of series and series names must match.")

    # Percentage distribution of the categories, one formatted Series per input
    percent_columns = {
        name: s.value_counts(normalize=True).mul(100).round().astype(int).astype(str) + '%'
        for s, name in zip(series_list, series_names)
    }

    # Union of the category levels over ALL series, in first-seen order. Assigning
    # the columns one by one into an empty frame aligned everything to the first
    # series' levels and silently dropped levels that only occur in later series.
    all_levels = list(dict.fromkeys(
        level for column in percent_columns.values() for level in column.index
    ))

    df = pd.DataFrame(
        {name: column.reindex(all_levels) for name, column in percent_columns.items()},
        index=all_levels,
    )

    # A missing value means the level did not occur in that series
    df = df.fillna('0%')

    # Explicit None/length check so array-like `order` values work as well as lists
    if order is not None and len(order) > 0:
        df = df.reindex(order).fillna('0%')

    return df


In [None]:
# Share of each difficulty level: true labels for RACE train/test, predicted labels for Dolly.
generate_percentage_table(race_train_df['difficulty'], race_test_df['difficulty'], dolly_cqa_df['predicted_difficulty'],
                          series_names=['RACE train', 'RACE test', 'Dolly'], order=['M', 'H', 'C'])

In [None]:
def plot_cluster_scores(df, categ, title, legend_title):
    """Grouped count plot: one group per cluster, one bar per level of ``categ``.

    Parameters:
    - df: pandas DataFrame with a 'cluster' column and the ``categ`` column
    - categ: Name of the column used for the hue; its values must look like 'L<number>'
    - title: Title of the figure
    - legend_title: Title shown above the legend
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))

    # Hue levels ordered by their numeric suffix ('L2' before 'L10')
    hue_levels = sorted(df[categ].unique(), key=lambda level: int(level[1:]))

    sns.countplot(data=df, x='cluster', hue=categ, hue_order=hue_levels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')

    # Legend goes outside the axes, to the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)).set_title(legend_title)

    plt.tight_layout()
    plt.show()

In [None]:
# Readability-level counts per cluster over the whole Dolly frame.
plot_cluster_scores(df=dolly_cqa_df, categ='gf_score_categ', title='Readability Level Counts by Cluster',
                    legend_title='Gunning Fog Score Levels')

In [None]:
# Print every Dolly text in cluster 2 whose readability level is 'L1'.
df = dolly_cqa_df[dolly_cqa_df['cluster']==2]
df10 = df[df['gf_score_categ']=='L1']

for text, categ in zip(df10['text'], df10['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# Print every Dolly text in cluster 8 whose readability level is 'L10'.
df = dolly_cqa_df[dolly_cqa_df['cluster']==8]
df1 = df[df['gf_score_categ']=='L10']

for text, categ in zip(df1['text'], df1['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# Readability-level counts per cluster for Dolly texts predicted as difficulty 'M'.
# `df` is reused by the inspection cells that follow. Title typo "CLuster" corrected.
df = dolly_cqa_df[dolly_cqa_df['predicted_difficulty'].isin(['M'])]
plot_cluster_scores(df=df, categ='gf_score_categ', title='Readability Level Counts by Cluster for Reading Comprehension Level "M"',
                    legend_title='Gunning Fog Score Levels')

In [None]:
# From the `df` built in the previous cell: print the texts in cluster 0 with readability level 'L8'.
df0 = df[df['cluster']==0]
df0 = df0[df0['gf_score_categ']=='L8']

for text, categ in zip(df0['text'], df0['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# From the same `df`: print the texts in cluster 8 with readability level 'L6' or 'L7'.
df8 = df[df['cluster']==8]
df8 = df8[df8['gf_score_categ'].isin(['L6', 'L7'])]

for text, categ in zip(df8['text'], df8['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# Readability-level counts per cluster for Dolly texts predicted as difficulty 'H'.
# `df` is reused by the inspection cells that follow.
df = dolly_cqa_df[dolly_cqa_df['predicted_difficulty'].isin(['H'])]
plot_cluster_scores(df=df, categ='gf_score_categ', title='Readability Level Counts by Cluster for Reading Comprehension Level "H"',
                    legend_title='Gunning Fog Score Levels')

In [None]:
# From the `df` built in the previous cell: print the texts in cluster 8 with readability level 'L1'.
df8 = df[df['cluster']==8]
df8 = df8[df8['gf_score_categ']=='L1']

for text, categ in zip(df8['text'], df8['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# From the same `df`: print the texts in cluster 1 with readability level 'L10'.
df1 = df[df['cluster']==1]
df1 = df1[df1['gf_score_categ']=='L10']

for text, categ in zip(df1['text'], df1['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# Readability-level counts per cluster for Dolly texts predicted as difficulty 'C'.
# `df` is reused by the inspection cells that follow. Title typo "CLuster" corrected.
df = dolly_cqa_df[dolly_cqa_df['predicted_difficulty'].isin(['C'])]
plot_cluster_scores(df=df, categ='gf_score_categ', title='Readability Level Counts by Cluster for Reading Comprehension Level "C"',
                    legend_title='Gunning Fog Score Levels')

In [None]:
# From the `df` built in the previous cell: print the texts in cluster 1 with readability level 'L10'.
df1 = df[df['cluster']==1]
df1 = df1[df1['gf_score_categ']=='L10']

for text, categ in zip(df1['text'], df1['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
# From the same `df`: print the texts in cluster 2 with readability level 'L1'.
df2 = df[df['cluster']==2]
df2 = df2[df2['gf_score_categ']=='L1']

for text, categ in zip(df2['text'], df2['gf_score_categ']):
    print('gf_score_categ:', categ, '\n')
    print(text, '\n=============================================================')

In [None]:
def plot_cluster_scores(df, categ, title, legend_title, hue_order=None):
    """Grouped count plot: one group per cluster, one bar per level of ``categ``.

    NOTE: this cell rebinds ``plot_cluster_scores``, replacing the earlier
    definition in this notebook that derived the hue order from 'L<number>'
    levels. Re-running an earlier cell after this one therefore uses this version.

    Parameters:
    - df: pandas DataFrame with a 'cluster' column and the ``categ`` column
    - categ: Name of the column used for the hue
    - title: Title of the figure
    - legend_title: Title shown above the legend
    - hue_order: Order of the hue levels; defaults to ['M', 'H', 'C'], the
      order that was previously hard-coded
    """
    if hue_order is None:
        hue_order = ['M', 'H', 'C']

    fig, ax = plt.subplots(1, 1, figsize=(12, 6))

    sns.countplot(data=df, x='cluster', hue=categ, hue_order=hue_order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')

    # Place the legend outside the plot, to the right
    legend = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    legend.set_title(legend_title)

    plt.tight_layout()
    plt.show()

In [None]:
# Predicted-difficulty counts per cluster for Dolly texts in readability level 'L10'.
# Title typo "CLuster" corrected.
df = dolly_cqa_df[dolly_cqa_df['gf_score_categ'].isin(['L10'])]
plot_cluster_scores(df=df, categ='predicted_difficulty', title='Counts by Cluster for Readability Level "L10"',
                    legend_title='Predicted Reading Comprehension Levels')

In [None]:
# Predicted-difficulty counts per cluster for Dolly texts in readability level 'L1'.
# `df` is reused by the inspection cells that follow. Title typo "CLuster" corrected.
df = dolly_cqa_df[dolly_cqa_df['gf_score_categ'].isin(['L1'])]
plot_cluster_scores(df=df, categ='predicted_difficulty', title='Counts by Cluster for Readability Level "L1"',
                    legend_title='Predicted Reading Comprehension Levels')

In [None]:
# From the 'L1' subset `df` built in the previous cell: print the texts in cluster 2 predicted as 'C'.
df2 = df[df['cluster']==2]
df2 = df2[df2['predicted_difficulty']=='C']

for text, level in zip(df2['text'], df2['predicted_difficulty']):
    print('predicted difficulty:', level, '\n')
    print(text, '\n=============================================================')

In [None]:
# From the same 'L1' subset `df`: print the texts in cluster 8 predicted as 'H'.
df8 = df[df['cluster']==8]
df8 = df8[df8['predicted_difficulty']=='H']

for text, level in zip(df8['text'], df8['predicted_difficulty']):
    print('predicted difficulty:', level, '\n')
    print(text, '\n=============================================================')

In [None]:
def plot_boxplots(df, categ):
    """Box plots of the three predicted-difficulty probabilities, grouped by ``categ``.

    Draws three stacked panels, one each for 'difficulty_proba_M',
    'difficulty_proba_C' and 'difficulty_proba_H'.

    NOTE: this cell rebinds ``plot_boxplots``, replacing the earlier
    ``plot_boxplots(df, title)`` defined in this notebook. Re-running an earlier
    cell that calls it with ``title=`` after this one will fail.

    Parameters:
    - df: pandas DataFrame with the three probability columns and the ``categ`` column
    - categ: Name of the grouping column; its values must look like 'L<number>'
    """
    # List of continuous columns
    columns = ['difficulty_proba_M', 'difficulty_proba_C', 'difficulty_proba_H']

    # Every level present in the data, ordered by numeric suffix. The former
    # hard-coded range(1, 10) stopped at 'L9', so the 'L10' bucket was never drawn.
    level_order = sorted(df[categ].dropna().unique(), key=lambda level: int(level[1:]))

    # One panel per probability column
    fig, axs = plt.subplots(nrows=len(columns), figsize=(10, 15))

    for ax, column in zip(axs, columns):
        sns.boxplot(x=categ, y=column, data=df, ax=ax, order=level_order)
        ax.set_title(f"Boxplot of {column} by {categ}")

    # Displaying the plots
    plt.tight_layout()
    plt.show()


In [None]:
# Uses the (df, categ) version of plot_boxplots defined in the preceding cell, on the RACE test frame.
plot_boxplots(race_test_df, 'gf_score_categ')

In [None]:
# Same probability box plots for the Dolly frame.
plot_boxplots(dolly_cqa_df, 'gf_score_categ')