In [134]:
import pandas as pd

# Load the SNLI test data (including true labels)
snli_test_path = "/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv"
snli_test_df = pd.read_csv(snli_test_path)

# Per-model SNLI prediction files; each must provide 'Entailment', 'Neutral'
# and 'Contradiction' columns
snli_predictions_paths = {
    "deberta": "/kaggle/input/deberta-nli/deberta_snli_predictions.csv",
    "roberta": "/kaggle/input/roberta/roberta_snli_predictions.csv",
    "albert": "/kaggle/input/albert/albert_snli_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_snli = "/kaggle/working/combined_snli_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_snli_df = pd.DataFrame(columns=columns)

# Integer encoding of the SNLI gold labels
label_mapping = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

# Load and merge the predictions
for model, path in snli_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the test set would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(snli_test_df):
        raise ValueError(
            f"{model} SNLI predictions have {len(predictions_df)} rows, "
            f"but the SNLI test set has {len(snli_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_snli_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Gold labels that are not keys of label_mapping become missing values here;
# they are written to the CSV below as-is.
combined_snli_df['True_Label'] = snli_test_df['gold_label'].map(label_mapping)

# Nullable integer dtype keeps the labels integral even with missing entries
combined_snli_df['True_Label'] = combined_snli_df['True_Label'].astype('Int64')

# Save the combined DataFrame to CSV
combined_snli_df.to_csv(output_csv_path_snli, index=False)

print(f"Combined SNLI predictions with true labels saved to {output_csv_path_snli}")


Combined SNLI predictions with true labels saved to /kaggle/working/combined_snli_df


In [135]:
# Preview the first five rows of the combined SNLI frame
combined_snli_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.034767,0.962592,0.002641,0.012451,0.927093,0.060457,0.008653,0.947434,0.043913,1
1,0.001921,0.319032,0.679047,0.752766,0.242251,0.004983,0.740332,0.256434,0.003235,0
2,0.998783,0.000764,0.000453,0.000254,0.004494,0.995253,0.004677,0.060481,0.934843,2
3,0.001001,0.997708,0.001291,0.005844,0.990736,0.003419,0.034056,0.956687,0.009257,1
4,0.00108,0.301363,0.697557,0.278348,0.718575,0.003076,0.498761,0.49927,0.001969,0


In [136]:
# Load the MNLI matched validation data (including true labels)
mnli_matched_test_path = "/kaggle/input/nli-dataset-for-sentence-understanding/mnli_validation_matched.csv"
mnli_matched_test_df = pd.read_csv(mnli_matched_test_path)

# Per-model MNLI matched prediction files; each must provide 'Entailment',
# 'Neutral' and 'Contradiction' columns
mnli_matched_predictions_paths = {
    "deberta": "/kaggle/input/validation/deberta_mnli_matched_val_predictions.csv",
    "roberta": "/kaggle/input/validation/roberta_mnli_matched_val_predictions.csv",
    "albert": "/kaggle/input/validation/albert_mnli_matched_val_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_mnli_matched = "/kaggle/working/combined_mnli_matched_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_mnli_matched_df = pd.DataFrame(columns=columns)

# Load and merge the predictions
for model, path in mnli_matched_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the label file would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(mnli_matched_test_df):
        raise ValueError(
            f"{model} MNLI-matched predictions have {len(predictions_df)} rows, "
            f"but the label file has {len(mnli_matched_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_mnli_matched_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Assign the true labels to the True_Label column, positioned at the end
combined_mnli_matched_df['True_Label'] = mnli_matched_test_df['label']

# Save the combined DataFrame to CSV
combined_mnli_matched_df.to_csv(output_csv_path_mnli_matched, index=False)

print(f"Combined MNLI-matched predictions with true labels saved to {output_csv_path_mnli_matched}")


Combined MNLI-matched predictions with true labels saved to /kaggle/working/combined_mnli_matched_df


In [137]:
# Preview the first five rows of the combined MNLI matched frame
combined_mnli_matched_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.005164,0.993364,0.001472,0.017844,0.950246,0.031909,0.010844,0.983012,0.006144,1
1,0.999153,0.000526,0.000321,0.001413,0.00203,0.996557,0.005388,0.007536,0.987076,2
2,0.000989,0.044792,0.954219,0.954781,0.042249,0.00297,0.853862,0.143483,0.002655,0
3,0.994965,0.004808,0.000228,0.000343,0.003511,0.996146,0.004128,0.070757,0.925115,2
4,0.999657,0.00022,0.000123,7.9e-05,0.000496,0.999425,0.003864,0.029262,0.966875,2


In [138]:
# Load the MNLI mismatched validation data (including true labels)
mnli_mismatched_test_path = "/kaggle/input/nli-dataset-for-sentence-understanding/mnli_validation_mismatched.csv"
mnli_mismatched_test_df = pd.read_csv(mnli_mismatched_test_path)

# Per-model MNLI mismatched prediction files; each must provide 'Entailment',
# 'Neutral' and 'Contradiction' columns
mnli_mismatched_predictions_paths = {
    "deberta": "/kaggle/input/validation/deberta_mnli_mismatched_val_predictions.csv",
    "roberta": "/kaggle/input/validation/roberta_mnli_mismatched_val_predictions.csv",
    "albert": "/kaggle/input/validation/albert_mnli_mismatched_val_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_mnli_mismatched = "/kaggle/working/combined_mnli_mismatched_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_mnli_mismatched_df = pd.DataFrame(columns=columns)

# Load and merge the predictions
for model, path in mnli_mismatched_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the label file would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(mnli_mismatched_test_df):
        raise ValueError(
            f"{model} MNLI-mismatched predictions have {len(predictions_df)} rows, "
            f"but the label file has {len(mnli_mismatched_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_mnli_mismatched_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Assign the true labels to the True_Label column, positioned at the end
combined_mnli_mismatched_df['True_Label'] = mnli_mismatched_test_df['label']

# Save the combined DataFrame to CSV
combined_mnli_mismatched_df.to_csv(output_csv_path_mnli_mismatched, index=False)

print(f"Combined MNLI-mismatched predictions with true labels saved to {output_csv_path_mnli_mismatched}")


Combined MNLI-mismatched predictions with true labels saved to /kaggle/working/combined_mnli_mismatched_df


In [139]:
# Preview the first five rows of the combined MNLI mismatched frame
combined_mnli_mismatched_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.999667,0.00016,0.000173,6.8e-05,0.000402,0.999529,0.000894,0.003787,0.995318,2
1,0.998119,0.000962,0.000919,0.000183,0.001511,0.998306,0.006421,0.010224,0.983355,2
2,0.000552,0.004809,0.994639,0.986062,0.01202,0.001918,0.975041,0.023354,0.001605,0
3,0.827653,0.171961,0.000386,0.000478,0.270953,0.728569,0.001722,0.796122,0.202156,2
4,0.000292,0.002875,0.996833,0.975167,0.021904,0.002929,0.965952,0.032748,0.0013,0


In [140]:
# Load the ANLI Round 1 test data (including true labels)
anli_r1_test_path = "/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r1.csv"
anli_r1_test_df = pd.read_csv(anli_r1_test_path)

# Per-model ANLI Round 1 prediction files; each must provide 'Entailment',
# 'Neutral' and 'Contradiction' columns
anli_r1_predictions_paths = {
    "deberta": "/kaggle/input/deberta-nli/deberta_anli_r1_predictions.csv",
    "roberta": "/kaggle/input/roberta/roberta_anli_r1_predictions.csv",
    "albert": "/kaggle/input/albert/albert_anli_r1_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_anli_r1 = "/kaggle/working/combined_anli_r1_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_anli_r1_df = pd.DataFrame(columns=columns)

# Load and merge the predictions
for model, path in anli_r1_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the test set would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(anli_r1_test_df):
        raise ValueError(
            f"{model} ANLI Round 1 predictions have {len(predictions_df)} rows, "
            f"but the test set has {len(anli_r1_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_anli_r1_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Assign the true labels to the True_Label column, positioned at the end
combined_anli_r1_df['True_Label'] = anli_r1_test_df['label']

# Save the combined DataFrame to CSV
combined_anli_r1_df.to_csv(output_csv_path_anli_r1, index=False)

print(f"Combined ANLI Round 1 predictions with true labels saved to {output_csv_path_anli_r1}")


Combined ANLI Round 1 predictions with true labels saved to /kaggle/working/combined_anli_r1_df


In [141]:
# Preview the first five rows of the combined ANLI Round 1 frame
combined_anli_r1_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.015388,0.976305,0.008307,0.996714,0.000376,0.00291,0.322974,0.667628,0.009398,0
1,0.224603,0.501549,0.273848,0.87572,0.000724,0.123556,0.998526,0.000604,0.000869,0
2,0.006642,0.97669,0.016669,0.999484,0.00033,0.000186,0.783352,0.212241,0.004407,0
3,0.966494,0.032235,0.001272,0.000686,0.998181,0.001133,0.002134,0.989523,0.008343,1
4,0.880736,0.028293,0.090971,0.000378,0.000197,0.999425,0.023283,0.013253,0.963464,2


In [142]:
# Load the ANLI Round 2 test data (including true labels)
anli_r2_test_path = "/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r2.csv"
anli_r2_test_df = pd.read_csv(anli_r2_test_path)

# Per-model ANLI Round 2 prediction files; each must provide 'Entailment',
# 'Neutral' and 'Contradiction' columns
anli_r2_predictions_paths = {
    "deberta": "/kaggle/input/deberta-nli/deberta_anli_r2_predictions.csv",
    "roberta": "/kaggle/input/roberta/roberta_anli_r2_predictions.csv",
    "albert": "/kaggle/input/albert/albert_anli_r2_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_anli_r2 = "/kaggle/working/combined_anli_r2_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_anli_r2_df = pd.DataFrame(columns=columns)

# Load and merge the predictions
for model, path in anli_r2_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the test set would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(anli_r2_test_df):
        raise ValueError(
            f"{model} ANLI Round 2 predictions have {len(predictions_df)} rows, "
            f"but the test set has {len(anli_r2_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_anli_r2_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Assign the true labels to the True_Label column, positioned at the end
combined_anli_r2_df['True_Label'] = anli_r2_test_df['label']

# Save the combined DataFrame to CSV
combined_anli_r2_df.to_csv(output_csv_path_anli_r2, index=False)

print(f"Combined ANLI Round 2 predictions with true labels saved to {output_csv_path_anli_r2}")


Combined ANLI Round 2 predictions with true labels saved to /kaggle/working/combined_anli_r2_df


In [143]:
# Preview the first five rows of the combined ANLI Round 2 frame
combined_anli_r2_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.001309,0.029617,0.969075,0.999506,0.000264,0.00023,0.863365,0.133388,0.003246,0
1,0.724144,0.273676,0.00218,0.026951,0.05423,0.918819,0.0729,0.904344,0.022756,1
2,0.071604,0.917894,0.010503,0.001282,0.998108,0.00061,0.027402,0.972218,0.00038,0
3,0.066162,0.929179,0.004659,0.007091,0.992694,0.000215,0.632171,0.365194,0.002635,1
4,0.906199,0.089873,0.003928,0.006259,0.989432,0.004309,0.064109,0.234642,0.701249,2


In [144]:
# Load the ANLI Round 3 test data (including true labels)
anli_r3_test_path = "/kaggle/input/anli-a-large-scale-nli-benchmark-dataset/test_r3.csv"
anli_r3_test_df = pd.read_csv(anli_r3_test_path)

# Per-model ANLI Round 3 prediction files; each must provide 'Entailment',
# 'Neutral' and 'Contradiction' columns
anli_r3_predictions_paths = {
    "deberta": "/kaggle/input/deberta-nli/deberta_anli_r3_predictions.csv",
    "roberta": "/kaggle/input/roberta/roberta_anli_r3_predictions.csv",
    "albert": "/kaggle/input/albert/albert_anli_r3_predictions.csv",
}

# Destination of the combined predictions CSV (note: the path has no .csv suffix)
output_csv_path_anli_r3 = "/kaggle/working/combined_anli_r3_df"

# Column layout of the combined frame; True_Label is deliberately last
columns = [
    'Deberta_Entailment', 'Deberta_Neutral', 'Deberta_Contradiction',
    'Roberta_Entailment', 'Roberta_Neutral', 'Roberta_Contradiction',
    'Albert_Entailment', 'Albert_Neutral', 'Albert_Contradiction',
    'True_Label'  # Ensuring True_Label is the last column
]

# Start from an empty frame so the column order above is preserved
combined_anli_r3_df = pd.DataFrame(columns=columns)

# Load and merge the predictions
for model, path in anli_r3_predictions_paths.items():
    predictions_df = pd.read_csv(path)
    # Column assignment aligns on the index, so a prediction file whose row
    # count differs from the test set would be silently truncated or padded
    # with NaN instead of failing. Fail loudly here.
    if len(predictions_df) != len(anli_r3_test_df):
        raise ValueError(
            f"{model} ANLI Round 3 predictions have {len(predictions_df)} rows, "
            f"but the test set has {len(anli_r3_test_df)}"
        )
    for label in ['Entailment', 'Neutral', 'Contradiction']:
        combined_anli_r3_df[f"{model.capitalize()}_{label}"] = predictions_df[label]

# Assign the true labels to the True_Label column, positioned at the end
combined_anli_r3_df['True_Label'] = anli_r3_test_df['label']

# Save the combined DataFrame to CSV
combined_anli_r3_df.to_csv(output_csv_path_anli_r3, index=False)

print(f"Combined ANLI Round 3 predictions with true labels saved to {output_csv_path_anli_r3}")


Combined ANLI Round 3 predictions with true labels saved to /kaggle/working/combined_anli_r3_df


In [145]:
# Preview the first five rows of the combined ANLI Round 3 frame
combined_anli_r3_df.head(n=5)

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.005921,0.960529,0.033551,0.022959,0.976533,0.000509,0.001848,0.998084,6.7e-05,0
1,0.009586,0.934714,0.0557,0.999611,0.000205,0.000185,0.951772,0.048075,0.000153,0
2,0.003428,0.976393,0.020179,0.00202,0.997897,8.3e-05,0.001014,0.998984,2e-06,0
3,0.004633,0.023985,0.971382,0.974441,0.024459,0.0011,0.996749,0.000989,0.002262,0
4,0.017428,0.633695,0.348877,0.984416,0.011166,0.004419,0.000518,0.128416,0.871066,0


In [146]:
# Per-column count of missing values in each combined frame
missing_values_anli1, missing_values_anli2, missing_values_anli3 = (
    frame.isna().sum()
    for frame in (combined_anli_r1_df, combined_anli_r2_df, combined_anli_r3_df)
)

missing_values_snli, missing_values_mnli_matched, missing_values_mnli_mismatched = (
    frame.isna().sum()
    for frame in (combined_snli_df, combined_mnli_matched_df, combined_mnli_mismatched_df)
)

In [147]:
# Per-column missing-value counts for the combined ANLI Round 1 frame
missing_values_anli1

Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64

In [148]:
# Per-column missing-value counts for the combined ANLI Round 2 frame
missing_values_anli2

Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64

In [149]:
# Per-column missing-value counts for the combined ANLI Round 3 frame
missing_values_anli3

Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64

In [150]:
# Per-column missing-value counts for the combined SNLI frame
missing_values_snli

Deberta_Entailment         0
Deberta_Neutral            0
Deberta_Contradiction      0
Roberta_Entailment         0
Roberta_Neutral            0
Roberta_Contradiction      0
Albert_Entailment          0
Albert_Neutral             0
Albert_Contradiction       0
True_Label               176
dtype: int64

In [151]:
# Per-column missing-value counts for the combined MNLI matched frame
missing_values_mnli_matched

Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64

In [152]:
# Per-column missing-value counts for the combined MNLI mismatched frame
missing_values_mnli_mismatched

Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64

In [153]:
# Keep only rows that have a gold label. Reassign the filtered copy rather
# than mutating the frame with inplace=True; the resulting rows are the same.
combined_snli_df = combined_snli_df.dropna(subset=['True_Label'])


In [154]:
# Re-count missing values per column after the unlabeled rows were removed
missing_values_snli_after_removal = combined_snli_df.isna().sum()
print(missing_values_snli_after_removal)


Deberta_Entailment       0
Deberta_Neutral          0
Deberta_Contradiction    0
Roberta_Entailment       0
Roberta_Neutral          0
Roberta_Contradiction    0
Albert_Entailment        0
Albert_Neutral           0
Albert_Contradiction     0
True_Label               0
dtype: int64


In [155]:
# Summarize index range, dtypes and non-null counts of the SNLI frame
combined_snli_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9824 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Deberta_Entailment     9824 non-null   float64
 1   Deberta_Neutral        9824 non-null   float64
 2   Deberta_Contradiction  9824 non-null   float64
 3   Roberta_Entailment     9824 non-null   float64
 4   Roberta_Neutral        9824 non-null   float64
 5   Roberta_Contradiction  9824 non-null   float64
 6   Albert_Entailment      9824 non-null   float64
 7   Albert_Neutral         9824 non-null   float64
 8   Albert_Contradiction   9824 non-null   float64
 9   True_Label             9824 non-null   Int64  
dtypes: Int64(1), float64(9)
memory usage: 853.8 KB


In [156]:
# Display the combined SNLI frame
combined_snli_df

Unnamed: 0,Deberta_Entailment,Deberta_Neutral,Deberta_Contradiction,Roberta_Entailment,Roberta_Neutral,Roberta_Contradiction,Albert_Entailment,Albert_Neutral,Albert_Contradiction,True_Label
0,0.034767,0.962592,0.002641,0.012451,0.927093,0.060457,0.008653,0.947434,0.043913,1
1,0.001921,0.319032,0.679047,0.752766,0.242251,0.004983,0.740332,0.256434,0.003235,0
2,0.998783,0.000764,0.000453,0.000254,0.004494,0.995253,0.004677,0.060481,0.934843,2
3,0.001001,0.997708,0.001291,0.005844,0.990736,0.003419,0.034056,0.956687,0.009257,1
4,0.001080,0.301363,0.697557,0.278348,0.718575,0.003076,0.498761,0.499270,0.001969,0
...,...,...,...,...,...,...,...,...,...,...
9995,0.998825,0.001033,0.000142,0.001264,0.028942,0.969794,0.006420,0.057240,0.936340,2
9996,0.000704,0.009793,0.989503,0.780946,0.217053,0.002001,0.894637,0.104095,0.001267,0
9997,0.999171,0.000493,0.000336,0.000054,0.000765,0.999181,0.000838,0.002670,0.996493,2
9998,0.000267,0.002178,0.997556,0.983402,0.015884,0.000714,0.984347,0.015223,0.000430,0


In [157]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

def _features_and_labels(frame):
    """Split a combined frame into ``(features, labels)``.

    Features are every column except 'True_Label'; labels are the
    'True_Label' column. Both are taken via ``.values``.
    """
    return frame.drop(columns='True_Label').values, frame['True_Label'].values


# Relies on the combined_*_df frames built in the cells above
X_snli, y_snli = _features_and_labels(combined_snli_df)

X_mnli_matched, y_mnli_matched = _features_and_labels(combined_mnli_matched_df)

X_mnli_mismatched, y_mnli_mismatched = _features_and_labels(combined_mnli_mismatched_df)

X_anli_r1, y_anli_r1 = _features_and_labels(combined_anli_r1_df)

X_anli_r2, y_anli_r2 = _features_and_labels(combined_anli_r2_df)

X_anli_r3, y_anli_r3 = _features_and_labels(combined_anli_r3_df)


In [158]:
# Number of NLI classes (entailment / neutral / contradiction)
NUM_CLASSES = 3

# One-hot encode the labels with a fixed width. Without num_classes,
# to_categorical infers the width from the largest label present, so a dataset
# that happens to lack class 2 would produce fewer columns and no longer match
# the other datasets or the 3-unit output layer.
y_encoded_snli = tf.keras.utils.to_categorical(y_snli, num_classes=NUM_CLASSES)
y_encoded_mnli_matched = tf.keras.utils.to_categorical(y_mnli_matched, num_classes=NUM_CLASSES)
y_encoded_mnli_mismatched = tf.keras.utils.to_categorical(y_mnli_mismatched, num_classes=NUM_CLASSES)
y_encoded_anli_r1 = tf.keras.utils.to_categorical(y_anli_r1, num_classes=NUM_CLASSES)
y_encoded_anli_r2 = tf.keras.utils.to_categorical(y_anli_r2, num_classes=NUM_CLASSES)
y_encoded_anli_r3 = tf.keras.utils.to_categorical(y_anli_r3, num_classes=NUM_CLASSES)


In [159]:
# Hold out 20% of every dataset, using the same fixed seed for each split
split_options = dict(test_size=0.2, random_state=42)

# SNLI
X_train_snli, X_test_snli, y_train_snli, y_test_snli = train_test_split(
    X_snli, y_encoded_snli, **split_options
)

# MNLI matched
X_train_mnli_matched, X_test_mnli_matched, y_train_mnli_matched, y_test_mnli_matched = train_test_split(
    X_mnli_matched, y_encoded_mnli_matched, **split_options
)

# MNLI mismatched
X_train_mnli_mismatched, X_test_mnli_mismatched, y_train_mnli_mismatched, y_test_mnli_mismatched = train_test_split(
    X_mnli_mismatched, y_encoded_mnli_mismatched, **split_options
)

# ANLI round 1
X_train_anli_r1, X_test_anli_r1, y_train_anli_r1, y_test_anli_r1 = train_test_split(
    X_anli_r1, y_encoded_anli_r1, **split_options
)

# ANLI round 2
X_train_anli_r2, X_test_anli_r2, y_train_anli_r2, y_test_anli_r2 = train_test_split(
    X_anli_r2, y_encoded_anli_r2, **split_options
)

# ANLI round 3
X_train_anli_r3, X_test_anli_r3, y_train_anli_r3, y_test_anli_r3 = train_test_split(
    X_anli_r3, y_encoded_anli_r3, **split_options
)


In [164]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import tensorflow as tf

# Custom F1-score metric
def f1_m(y_true, y_pred):
    """Return the unweighted mean of the per-class F1 scores (macro F1).

    Both inputs are reduced to hard class decisions with an argmax over the
    last axis and re-encoded as one-hot rows of width 3. True positives,
    false positives and false negatives are summed per class over axis 0,
    and the resulting per-class F1 values are averaged.

    NOTE(review): assumes y_true / y_pred are (batch, 3) one-hot / score
    tensors — confirm against the caller. When used via ``metrics=``, Keras
    presumably averages this per-batch value, so it may differ from a
    dataset-level macro F1 — confirm.
    """
    hard_pred = tf.one_hot(K.argmax(y_pred, axis=-1), depth=3)
    hard_true = tf.one_hot(K.argmax(y_true, axis=-1), depth=3)

    # Per-class confusion counts
    true_pos = K.sum(K.cast(hard_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1-hard_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(hard_true * (1-hard_pred), 'float'), axis=0)

    # Epsilon keeps the ratios finite for classes with no samples
    per_class_precision = true_pos / (true_pos + false_pos + K.epsilon())
    per_class_recall = true_pos / (true_pos + false_neg + K.epsilon())

    per_class_f1 = (
        2 * per_class_precision * per_class_recall
        / (per_class_precision + per_class_recall + K.epsilon())
    )
    return K.mean(per_class_f1)

# Custom recall metric
def recall_m(y_true, y_pred):
    """Return the unweighted mean of the per-class recall values.

    Inputs are reduced to hard class decisions with an argmax over the last
    axis and re-encoded as one-hot rows of width 3; true positives and false
    negatives are then summed per class over axis 0.

    NOTE(review): assumes y_true / y_pred are (batch, 3) one-hot / score
    tensors — confirm against the caller.
    """
    hard_pred = tf.one_hot(K.argmax(y_pred, axis=-1), depth=3)
    hard_true = tf.one_hot(K.argmax(y_true, axis=-1), depth=3)

    # Per-class hits and misses among the true members of each class
    true_pos = K.sum(K.cast(hard_true * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(hard_true * (1-hard_pred), 'float'), axis=0)

    per_class_recall = true_pos / (true_pos + false_neg + K.epsilon())
    return K.mean(per_class_recall)

def train_and_evaluate(X_train, y_train, X_test, y_test, epochs=20, batch_size=64,
                       validation_split=0.2, learning_rate=0.001, verbose=1):
    """Train a two-layer SimpleRNN classifier and evaluate it on held-out data.

    Every sample is presented to the network as a sequence of length 1 whose
    single time step is the sample's feature vector.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of shape (n_samples, n_features)
        Feature matrices; reshaped internally to (n_samples, 1, n_features).
    y_train, y_test : arrays of one-hot labels
        Must have 3 columns to match the 3-unit softmax output.
    epochs, batch_size, validation_split, verbose :
        Forwarded to ``model.fit``; the defaults reproduce the previously
        hard-coded values.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    tuple
        ``(model, history, evaluation)``: the fitted model, the object
        returned by ``model.fit``, and the result of ``model.evaluate`` on
        the test data (callers index it as loss, F1, recall).
    """
    num_features = X_train.shape[1]

    # Reshape input data to (batch_size, 1, num_features)
    X_train_reshaped = X_train.reshape((X_train.shape[0], 1, num_features))
    X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    # Declare the input shape with an explicit Input object instead of the
    # `input_shape=` layer argument.
    model = Sequential([
        tf.keras.Input(shape=(1, num_features)),
        SimpleRNN(50, return_sequences=True),
        Dropout(0.5),
        SimpleRNN(50),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy',
                  metrics=[f1_m, recall_m])

    # Train the model; validation_split carves the validation data out of the
    # training arrays, so the test arrays stay unseen until evaluation.
    history = model.fit(X_train_reshaped, y_train, epochs=epochs, batch_size=batch_size,
                        validation_split=validation_split, verbose=verbose)

    # Evaluate the model
    evaluation = model.evaluate(X_test_reshaped, y_test, verbose=0)

    return model, history, evaluation



In [165]:
# SNLI Dataset
model_snli, history_snli, evaluation_snli = train_and_evaluate(
    X_train=X_train_snli,
    y_train=y_train_snli,
    X_test=X_test_snli,
    y_test=y_test_snli,
)
print(f"Test Loss: {evaluation_snli[0]}, Test F1-Score: {evaluation_snli[1]}, Test Recall: {evaluation_snli[2]}")


Epoch 1/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 53ms/step - f1_m: 0.6854 - loss: 0.7256 - recall_m: 0.6881 - val_f1_m: 0.9177 - val_loss: 0.2399 - val_recall_m: 0.9189
Epoch 2/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9197 - loss: 0.2670 - recall_m: 0.9213 - val_f1_m: 0.9185 - val_loss: 0.2444 - val_recall_m: 0.9195
Epoch 3/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9261 - loss: 0.2536 - recall_m: 0.9275 - val_f1_m: 0.9184 - val_loss: 0.2424 - val_recall_m: 0.9195
Epoch 4/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9261 - loss: 0.2445 - recall_m: 0.9274 - val_f1_m: 0.9171 - val_loss: 0.2417 - val_recall_m: 0.9184
Epoch 5/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9254 - loss: 0.2556 - recall_m: 0.9272 - val_f1_m: 0.9175 - val_loss: 0.2420 - val_recall_m: 0.9186
Epoch 6/20
[1m99/99[0m [3

In [166]:

# Train and evaluate on the MNLI matched split
model_mnli_matched, history_mnli_matched, evaluation_mnli_matched = train_and_evaluate(
    X_train=X_train_mnli_matched,
    y_train=y_train_mnli_matched,
    X_test=X_test_mnli_matched,
    y_test=y_test_mnli_matched,
)
print(f"MNLI Matched - Test Loss: {evaluation_mnli_matched[0]}, Test F1-Score: {evaluation_mnli_matched[1]}, Test Recall: {evaluation_mnli_matched[2]}")


Epoch 1/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 50ms/step - f1_m: 0.7459 - loss: 0.6322 - recall_m: 0.7537 - val_f1_m: 0.9070 - val_loss: 0.2919 - val_recall_m: 0.9073
Epoch 2/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9137 - loss: 0.2794 - recall_m: 0.9174 - val_f1_m: 0.9050 - val_loss: 0.2960 - val_recall_m: 0.9054
Epoch 3/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9211 - loss: 0.2695 - recall_m: 0.9234 - val_f1_m: 0.9063 - val_loss: 0.2963 - val_recall_m: 0.9068
Epoch 4/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9141 - loss: 0.2819 - recall_m: 0.9159 - val_f1_m: 0.9082 - val_loss: 0.2966 - val_recall_m: 0.9086
Epoch 5/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9187 - loss: 0.2750 - recall_m: 0.9207 - val_f1_m: 0.9057 - val_loss: 0.2945 - val_recall_m: 0.9062
Epoch 6/20
[1m99/99[0m [3

In [167]:
# Train and evaluate on the MNLI mismatched split
model_mnli_mismatched, history_mnli_mismatched, evaluation_mnli_mismatched = train_and_evaluate(
    X_train=X_train_mnli_mismatched,
    y_train=y_train_mnli_mismatched,
    X_test=X_test_mnli_mismatched,
    y_test=y_test_mnli_mismatched,
)
print(f"MNLI Mismatched - Test Loss: {evaluation_mnli_mismatched[0]}, Test F1-Score: {evaluation_mnli_mismatched[1]}, Test Recall: {evaluation_mnli_mismatched[2]}")


Epoch 1/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - f1_m: 0.6365 - loss: 0.7916 - recall_m: 0.6500 - val_f1_m: 0.9060 - val_loss: 0.2776 - val_recall_m: 0.9082
Epoch 2/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9183 - loss: 0.2720 - recall_m: 0.9201 - val_f1_m: 0.9036 - val_loss: 0.2811 - val_recall_m: 0.9059
Epoch 3/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9170 - loss: 0.2732 - recall_m: 0.9194 - val_f1_m: 0.9035 - val_loss: 0.2817 - val_recall_m: 0.9057
Epoch 4/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9050 - loss: 0.2816 - recall_m: 0.9073 - val_f1_m: 0.9029 - val_loss: 0.2821 - val_recall_m: 0.9051
Epoch 5/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - f1_m: 0.9132 - loss: 0.2687 - recall_m: 0.9164 - val_f1_m: 0.9041 - val_loss: 0.2781 - val_recall_m: 0.9063
Epoch 6/20
[1m99/99[0m [3

In [168]:
# Train and evaluate on the ANLI Round 1 split
model_anli_r1, history_anli_r1, evaluation_anli_r1 = train_and_evaluate(
    X_train=X_train_anli_r1,
    y_train=y_train_anli_r1,
    X_test=X_test_anli_r1,
    y_test=y_test_anli_r1,
)
print(f"ANLI Round 1 - Test Loss: {evaluation_anli_r1[0]}, Test F1-Score: {evaluation_anli_r1[1]}, Test Recall: {evaluation_anli_r1[2]}")


Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 116ms/step - f1_m: 0.3786 - loss: 1.1475 - recall_m: 0.3898 - val_f1_m: 0.6604 - val_loss: 0.9417 - val_recall_m: 0.6749
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.5260 - loss: 0.9356 - recall_m: 0.5317 - val_f1_m: 0.6789 - val_loss: 0.8254 - val_recall_m: 0.6849
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.6435 - loss: 0.8540 - recall_m: 0.6481 - val_f1_m: 0.7018 - val_loss: 0.7542 - val_recall_m: 0.7067
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.6852 - loss: 0.7979 - recall_m: 0.6980 - val_f1_m: 0.7119 - val_loss: 0.7085 - val_recall_m: 0.7163
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.7276 - loss: 0.7144 - recall_m: 0.7293 - val_f1_m: 0.7119 - val_loss: 0.6895 - val_recall_m: 0.7163
Epoch 6/20
[1m10/10[0m [3

In [170]:
# Train and evaluate on the ANLI Round 2 split
model_anli_r2, history_anli_r2, evaluation_anli_r2 = train_and_evaluate(
    X_train=X_train_anli_r2,
    y_train=y_train_anli_r2,
    X_test=X_test_anli_r2,
    y_test=y_test_anli_r2,
)
print(f"ANLI Round 2 - Test Loss: {evaluation_anli_r2[0]}, Test F1-Score: {evaluation_anli_r2[1]}, Test Recall: {evaluation_anli_r2[2]}")


Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 117ms/step - f1_m: 0.3898 - loss: 1.1595 - recall_m: 0.3981 - val_f1_m: 0.5430 - val_loss: 0.9907 - val_recall_m: 0.5513
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.4488 - loss: 1.0508 - recall_m: 0.4497 - val_f1_m: 0.5660 - val_loss: 0.9262 - val_recall_m: 0.5773
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.5558 - loss: 0.9532 - recall_m: 0.5600 - val_f1_m: 0.5715 - val_loss: 0.8925 - val_recall_m: 0.5820
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.5622 - loss: 0.9074 - recall_m: 0.5683 - val_f1_m: 0.5805 - val_loss: 0.8773 - val_recall_m: 0.5900
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.5992 - loss: 0.8994 - recall_m: 0.6072 - val_f1_m: 0.5776 - val_loss: 0.8665 - val_recall_m: 0.5876
Epoch 6/20
[1m10/10[0m [3

In [171]:
# Train and evaluate on the ANLI Round 3 split
model_anli_r3, history_anli_r3, evaluation_anli_r3 = train_and_evaluate(
    X_train=X_train_anli_r3,
    y_train=y_train_anli_r3,
    X_test=X_test_anli_r3,
    y_test=y_test_anli_r3,
)
print(f"ANLI Round 3 - Test Loss: {evaluation_anli_r3[0]}, Test F1-Score: {evaluation_anli_r3[1]}, Test Recall: {evaluation_anli_r3[2]}")


Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 65ms/step - f1_m: 0.3276 - loss: 1.1999 - recall_m: 0.3321 - val_f1_m: 0.5577 - val_loss: 0.9514 - val_recall_m: 0.6057
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.4509 - loss: 1.0408 - recall_m: 0.4627 - val_f1_m: 0.6375 - val_loss: 0.8512 - val_recall_m: 0.6457
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - f1_m: 0.5459 - loss: 0.9547 - recall_m: 0.5517 - val_f1_m: 0.6556 - val_loss: 0.8004 - val_recall_m: 0.6611
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - f1_m: 0.5798 - loss: 0.9022 - recall_m: 0.5917 - val_f1_m: 0.6686 - val_loss: 0.7749 - val_recall_m: 0.6706
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_m: 0.5871 - loss: 0.8715 - recall_m: 0.5959 - val_f1_m: 0.6686 - val_loss: 0.7598 - val_recall_m: 0.6706
Epoch 6/20
[1m12/12[0m [32

In [172]:
# Stack the features and one-hot labels of all six datasets row-wise,
# listing the datasets in the same order for both so rows stay paired
feature_blocks = [X_snli, X_mnli_matched, X_mnli_mismatched, X_anli_r1, X_anli_r2, X_anli_r3]
label_blocks = [
    y_encoded_snli, y_encoded_mnli_matched, y_encoded_mnli_mismatched,
    y_encoded_anli_r1, y_encoded_anli_r2, y_encoded_anli_r3,
]
X_combined = np.concatenate(feature_blocks, axis=0)
y_combined = np.concatenate(label_blocks, axis=0)

# Hold out 20% of the pooled data for testing
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    random_state=42,
)

# Train and evaluate on the pooled data
model_combined, history_combined, evaluation_combined = train_and_evaluate(
    X_train=X_train_combined,
    y_train=y_train_combined,
    X_test=X_test_combined,
    y_test=y_test_combined,
)
print(f"Combined - Test Loss: {evaluation_combined[0]}, Test F1-Score: {evaluation_combined[1]}, Test Recall: {evaluation_combined[2]}")

Epoch 1/20
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - f1_m: 0.8013 - loss: 0.5204 - recall_m: 0.8054 - val_f1_m: 0.8878 - val_loss: 0.3171 - val_recall_m: 0.8896
Epoch 2/20
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - f1_m: 0.8945 - loss: 0.3264 - recall_m: 0.8974 - val_f1_m: 0.8874 - val_loss: 0.3190 - val_recall_m: 0.8892
Epoch 3/20
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - f1_m: 0.8912 - loss: 0.3295 - recall_m: 0.8930 - val_f1_m: 0.8883 - val_loss: 0.3190 - val_recall_m: 0.8902
Epoch 4/20
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - f1_m: 0.8904 - loss: 0.3288 - recall_m: 0.8925 - val_f1_m: 0.8873 - val_loss: 0.3164 - val_recall_m: 0.8890
Epoch 5/20
[1m327/327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - f1_m: 0.8958 - loss: 0.3165 - recall_m: 0.8979 - val_f1_m: 0.8892 - val_loss: 0.3189 - val_recall_m: 0.8910
Epoch 6/20
[1m327