In [None]:
import pandas as pd

# CSV files whose headers are inspected before merging.
file_paths = [
    "/content/GNN_results.csv", "/content/logistic_results.csv",
    "/content/predicted_submit.csv", "/content/test.csv",
]

# Show the column names of every file, followed by a blank separator.
for file_path in file_paths:
    df = pd.read_csv(file_path)
    print(f"Column names for {file_path}:", df.columns.tolist(), "\n", sep="\n")


In [None]:
import pandas as pd

# Read the per-model result files (LSTM results live in predicted_submit.csv).
df_predicted_submit = pd.read_csv("/content/predicted_submit.csv")
df_logistic = pd.read_csv("/content/logistic_results.csv")
df_gnn = pd.read_csv("/content/GNN_results.csv")

# Join LSTM and Logistic predictions on "id"; the clashing 'predicted_label'
# columns are disambiguated by the suffixes.
df_merge_1 = df_predicted_submit[['id', 'predicted_label', 'label']].merge(
    df_logistic[['id', 'predicted_label']],
    on='id',
    suffixes=('_LSTM', '_Logistic'),
)

# Add the GNN predictions (its 'predicted_label' no longer clashes) and give
# every column a model-specific name.
df_final = df_merge_1.merge(df_gnn[['id', 'predicted_label']], on='id').rename(
    columns={
        'predicted_label_LSTM': 'LSTM_prediction',
        'predicted_label_Logistic': 'Logistic_prediction',
        'predicted_label': 'GNN_prediction',
        'label': 'real_label',
    }
)

# Persist the combined table without the index column.
df_final.to_csv("/content/prediction_summary.csv", index=False)


In [None]:
import pandas as pd

# Reload the merged summary and list its column names.
df_summary = pd.read_csv("/content/prediction_summary.csv")
print(list(df_summary.columns))


In [None]:
import pandas as pd

# Reload the merged summary table.
df_summary = pd.read_csv("/content/prediction_summary.csv")

# Accuracy = fraction of rows where the prediction equals the real label.
# LSTM
lstm_accuracy = df_summary['LSTM_prediction'].eq(df_summary['real_label']).mean()
print(f"LSTM Model Accuracy: {lstm_accuracy * 100:.2f}%")

# Logistic regression
logistic_accuracy = df_summary['Logistic_prediction'].eq(df_summary['real_label']).mean()
print(f"Logistic Regression Model Accuracy: {logistic_accuracy * 100:.2f}%")

# GNN
gnn_accuracy = df_summary['GNN_prediction'].eq(df_summary['real_label']).mean()
print(f"GNN Model Accuracy: {gnn_accuracy * 100:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# Data: take the accuracies computed in the previous cell (fractions in
# [0, 1]) instead of hand-copied numbers, so the chart always matches the data.
models = ['LSTM', 'Logistic Regression', 'GNN']
accuracies = [lstm_accuracy * 100, logistic_accuracy * 100, gnn_accuracy * 100]

# Create the bar plot
plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'red'])

# Title and labels
plt.title('Model Accuracies Comparison')
plt.ylabel('Accuracy (%)')
plt.xlabel('Models')

# Display the plot
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground-truth labels every model is scored against.
real_labels = df_summary['real_label']

# Columns holding each model's predicted class.
prediction_columns = ['LSTM_prediction', 'Logistic_prediction', 'GNN_prediction']

# Report accuracy (as a percentage) and macro-averaged precision/recall/F1
# for each model; zero_division=0 scores undefined ratios as 0.
for column in prediction_columns:
    predictions = df_summary[column]

    accuracy = 100 * accuracy_score(real_labels, predictions)
    precision, recall, f1 = (
        scorer(real_labels, predictions, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(
        f"Metrics for {column.replace('_prediction', '')}:",
        f"Accuracy: {accuracy:.2f}%",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}\n",
        sep="\n",
    )


In [None]:
# Report only the prediction columns that contain missing values.
for column in prediction_columns:
    nan_count = df_summary[column].isnull().sum()
    if not nan_count > 0:
        continue
    print(f"{column} has {nan_count} NaN values.")


In [None]:
# Keep only rows where every prediction column and the real label are present.
df_summary_clean = df_summary.dropna(
    axis=0,
    how='any',
    subset=[*prediction_columns, 'real_label'],
)


In [None]:
# Ground-truth labels of the NaN-free rows.
real_labels_clean = df_summary_clean['real_label']

# Same report as before, now restricted to the cleaned frame.
for column in prediction_columns:
    predictions = df_summary_clean[column]

    accuracy = 100 * accuracy_score(real_labels_clean, predictions)
    precision, recall, f1 = (
        scorer(real_labels_clean, predictions, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(
        f"Metrics for {column.replace('_prediction', '')}:",
        f"Accuracy: {accuracy:.2f}%",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}\n",
        sep="\n",
    )


In [None]:
# Preview the first ten cleaned rows.
print(df_summary_clean.iloc[:10])


In [None]:
# List the distinct values present in each prediction column and in the label.
for column in [*prediction_columns, 'real_label']:
    print(f"Unique values in {column}: {df_summary_clean[column].unique()}")


In [None]:
# Compare row counts before and after dropping NaN rows.
print(
    f"Original dataframe length: {len(df_summary)}",
    f"Cleaned dataframe length: {len(df_summary_clean)}",
    sep="\n",
)


In [None]:
# Distinct ground-truth classes, passed explicitly to the metric functions below.
labels = df_summary_clean.loc[:, 'real_label'].unique()


In [None]:
# Macro metrics restricted to the classes found in the real labels.
for column in prediction_columns:
    predictions = df_summary_clean[column]

    accuracy = 100 * accuracy_score(real_labels_clean, predictions)
    precision, recall, f1 = (
        scorer(real_labels_clean, predictions, average='macro', labels=labels, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(
        f"Metrics for {column.replace('_prediction', '')}:",
        f"Accuracy: {accuracy:.2f}%",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}\n",
        sep="\n",
    )


In [None]:
# Hand-entered ten-row sample (ids 20800-20809) used as a sanity check.
data_subset = {
    'id': list(range(20800, 20810)),
    'LSTM_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
    'real_label': [0, 1, 0, 1, 1, 1, 1, 1, 0, 1],
    'Logistic_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
    'GNN_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
}

df_subset = pd.DataFrame(data_subset)

# Ground truth for the sample.
real_labels_subset = df_subset['real_label']

# Score every model on the sample with macro-averaged metrics.
for column in prediction_columns:
    predictions = df_subset[column]

    accuracy = 100 * accuracy_score(real_labels_subset, predictions)
    precision, recall, f1 = (
        scorer(real_labels_subset, predictions, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(
        f"Metrics for {column.replace('_prediction', '')}:",
        f"Accuracy: {accuracy:.2f}%",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}\n",
        sep="\n",
    )


In [None]:
# Pairwise correlation between the three models' predictions.
print(
    df_summary_clean.loc[
        :, ['LSTM_prediction', 'Logistic_prediction', 'GNN_prediction']
    ].corr()
)


In [None]:
# Count rows on which each pair of models produced the same prediction.
lstm_logistic_matches = df_summary_clean['LSTM_prediction'].eq(df_summary_clean['Logistic_prediction']).sum()
lstm_gnn_matches = df_summary_clean['LSTM_prediction'].eq(df_summary_clean['GNN_prediction']).sum()
logistic_gnn_matches = df_summary_clean['Logistic_prediction'].eq(df_summary_clean['GNN_prediction']).sum()

print(
    f"Number of LSTM and Logistic matches: {lstm_logistic_matches}",
    f"Number of LSTM and GNN matches: {lstm_gnn_matches}",
    f"Number of Logistic and GNN matches: {logistic_gnn_matches}",
    sep="\n",
)


In [None]:
# Per-model correctness flags, evaluated on the NaN-free frame.
# On the unfiltered frame a missing prediction compares as "!= real_label",
# so it would be counted as that model being wrong and inflate the counts
# below; df_summary_clean keeps this consistent with the other comparisons.
lstm_ok = df_summary_clean['LSTM_prediction'] == df_summary_clean['real_label']
logistic_ok = df_summary_clean['Logistic_prediction'] == df_summary_clean['real_label']
gnn_ok = df_summary_clean['GNN_prediction'] == df_summary_clean['real_label']

# LSTM gets it right, but Logistic and GNN get it wrong
lstm_right = lstm_ok & ~logistic_ok & ~gnn_ok

# Logistic gets it right, but LSTM and GNN get it wrong
logistic_right = logistic_ok & ~lstm_ok & ~gnn_ok

# GNN gets it right, but LSTM and Logistic get it wrong
gnn_right = gnn_ok & ~lstm_ok & ~logistic_ok

# Print the counts
print(f"LSTM got right while others got wrong: {lstm_right.sum()}")
print(f"Logistic got right while others got wrong: {logistic_right.sum()}")
print(f"GNN got right while others got wrong: {gnn_right.sum()}")


In [None]:
import matplotlib.pyplot as plt
# Data: use the boolean masks computed in the previous cell rather than
# hand-copied counts, so the chart cannot drift from the data.
models = ['LSTM', 'Logistic Regression', 'GNN']
correct_predictions = [
    int(lstm_right.sum()),
    int(logistic_right.sum()),
    int(gnn_right.sum()),
]

# Plot
plt.figure(figsize=(10, 7))
plt.bar(models, correct_predictions, color=['blue', 'green', 'red'])

# Add labels and title
plt.ylabel('Number of Correct Predictions')
plt.title('Number of Correct Predictions Unique to Each Model')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded example metrics (accuracy in percent, the others as fractions).
models = ['LSTM', 'Logistic', 'GNN']
accuracy_values = [60, 60, 60]
precision_values = [0.52, 0.52, 0.52]
recall_values = [0.52, 0.52, 0.52]
f1_values = [0.52, 0.52, 0.52]

# x positions of the four bars in each model's group.
barWidth = 0.2
r1 = np.arange(len(accuracy_values))
r2 = r1 + barWidth
r3 = r2 + barWidth
r4 = r3 + barWidth

plt.figure(figsize=(12, 7))

# Create bars (fractions are scaled by 100 to share the percent axis)
plt.bar(r1, accuracy_values, width=barWidth, color='b', edgecolor='grey', label='Accuracy (%)')
plt.bar(r2, np.array(precision_values) * 100, width=barWidth, color='c', edgecolor='grey', label='Precision x100')
plt.bar(r3, np.array(recall_values) * 100, width=barWidth, color='m', edgecolor='grey', label='Recall x100')
plt.bar(r4, np.array(f1_values) * 100, width=barWidth, color='r', edgecolor='grey', label='F1 Score x100')

# Title & axis labels
plt.title('Model Performance Metrics', fontweight='bold')
plt.xlabel('Model', fontweight='bold')
plt.ylabel('Score (%)', fontweight='bold')
# Bars are centred at r, r+w, r+2w, r+3w, so the group centre is r + 1.5w
# (r + w would put the tick under the second bar).
plt.xticks(r1 + 1.5 * barWidth, models)

# Create legend & Show graphic
plt.legend()
plt.show()


In [None]:
# Rebuild the ten-row sample frame (ids 20800-20809) and its ground truth.
data_subset = {
    'id': list(range(20800, 20810)),
    'LSTM_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
    'real_label': [0, 1, 0, 1, 1, 1, 1, 1, 0, 1],
    'Logistic_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
    'GNN_prediction': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1],
}
df_subset = pd.DataFrame(data=data_subset)
real_labels_subset = df_subset.loc[:, 'real_label']


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Function to extract TP, TN, FP, FN
def get_confusion_metrics(real, pred, labels=(0, 1)):
    """Return ``(tp, tn, fp, fn)`` for a binary classification result.

    Args:
        real: Ground-truth labels.
        pred: Predicted labels, aligned with ``real``.
        labels: ``(negative, positive)`` class values. Passing them explicitly
            keeps the confusion matrix 2x2 even when one class is absent from
            the data; without it the four-way unpack below would fail.

    Returns:
        Tuple ``(tp, tn, fp, fn)`` of counts.
    """
    # For a 2x2 matrix ordered [negative, positive], ravel() yields tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(real, pred, labels=list(labels)).ravel()
    return tp, tn, fp, fn

models = ['LSTM', 'Logistic', 'GNN']
prediction_columns = ['LSTM_prediction', 'Logistic_prediction', 'GNN_prediction']

tp_values = []
tn_values = []
fp_values = []
fn_values = []

# Calculate TP, TN, FP, FN for each model on the ten-row subset
for column in prediction_columns:
    predictions = df_subset[column]
    tp, tn, fp, fn = get_confusion_metrics(real_labels_subset, predictions)

    tp_values.append(tp)
    tn_values.append(tn)
    fp_values.append(fp)
    fn_values.append(fn)

# x positions of the four bars in each model's group.
barWidth = 0.2
r1 = np.arange(len(tp_values))
r2 = r1 + barWidth
r3 = r2 + barWidth
r4 = r3 + barWidth

plt.figure(figsize=(12, 7))

# Create bars
plt.bar(r1, tp_values, width=barWidth, color='b', edgecolor='grey', label='True Positives')
plt.bar(r2, tn_values, width=barWidth, color='c', edgecolor='grey', label='True Negatives')
plt.bar(r3, fp_values, width=barWidth, color='m', edgecolor='grey', label='False Positives')
plt.bar(r4, fn_values, width=barWidth, color='r', edgecolor='grey', label='False Negatives')

# Title & axis labels
plt.title('Model Confusion Metrics', fontweight='bold')
plt.xlabel('Model', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
# Bars are centred at r, r+w, r+2w, r+3w, so the group centre is r + 1.5w
# (r + w would put the tick under the second bar).
plt.xticks(r1 + 1.5 * barWidth, models)

# Add text labels on top of the bars
for positions, counts in ((r1, tp_values), (r2, tn_values), (r3, fp_values), (r4, fn_values)):
    for x_pos, count in zip(positions, counts):
        plt.text(x_pos, count + 0.1, str(count), ha='center')

# Create legend & Show graphic
plt.legend()
plt.show()


In [None]:
# Report how many rows the hand-built df_subset sample contains
# (shape[0] is the row count).
print(f"We are working on {df_subset.shape[0]} rows/samples.")


In [None]:
# Report how many rows df_summary (the merged frame, before any NaN
# filtering) contributes to the accuracy calculations.
print(f"We are working on {df_summary.shape[0]} rows/samples to calculate accuracies.")

# The accuracy calculations themselves are in the cells that follow.


In [None]:
# Show the column index of the merged summary frame.
print(df_summary.keys())


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each model against the real labels, via scikit-learn.
# LSTM
lstm_accuracy = accuracy_score(
    y_true=df_summary['real_label'], y_pred=df_summary['LSTM_prediction']
)
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")

# Logistic Regression
logistic_accuracy = accuracy_score(
    y_true=df_summary['real_label'], y_pred=df_summary['Logistic_prediction']
)
print(f"Logistic Regression Accuracy: {logistic_accuracy * 100:.2f}%")

# GNN
gnn_accuracy = accuracy_score(
    y_true=df_summary['real_label'], y_pred=df_summary['GNN_prediction']
)
print(f"GNN Accuracy: {gnn_accuracy * 100:.2f}%")


In [None]:
# Discard rows in which any model's prediction is missing.
df_clean = df_summary.dropna(
    axis=0,
    how='any',
    subset=['LSTM_prediction', 'Logistic_prediction', 'GNN_prediction'],
)

# Accuracy of each model on the remaining rows.
# LSTM
lstm_accuracy = accuracy_score(
    y_true=df_clean['real_label'], y_pred=df_clean['LSTM_prediction']
)
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")

# Logistic Regression
logistic_accuracy = accuracy_score(
    y_true=df_clean['real_label'], y_pred=df_clean['Logistic_prediction']
)
print(f"Logistic Regression Accuracy: {logistic_accuracy * 100:.2f}%")

# GNN
gnn_accuracy = accuracy_score(
    y_true=df_clean['real_label'], y_pred=df_clean['GNN_prediction']
)
print(f"GNN Accuracy: {gnn_accuracy * 100:.2f}%")


In [None]:
# Persist the NaN-free table (no index column) for the cells below.
df_clean.to_csv(path_or_buf='prediction_summary2.csv', index=False)


In [None]:
# Read the cleaned table back from disk.
df_cleaned = pd.read_csv('prediction_summary2.csv')

# Number of rows available for the remaining analysis.
num_rows = len(df_cleaned)

print(f"We are working on {num_rows} rows.")


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground truth and Logistic predictions from the cleaned table.
real_labels = df_cleaned['real_label']
logistic_predictions = df_cleaned['Logistic_prediction']

# Accuracy plus binary-averaged precision / recall / F1.
accuracy = accuracy_score(real_labels, logistic_predictions)
precision, recall, f1 = (
    scorer(real_labels, logistic_predictions, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report
print(
    f"Metrics for Logistic Prediction:",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)


In [None]:
# LSTM predictions from the cleaned table.
lstm_predictions = df_cleaned['LSTM_prediction']

# Accuracy plus binary-averaged precision / recall / F1.
accuracy = accuracy_score(real_labels, lstm_predictions)
precision, recall, f1 = (
    scorer(real_labels, lstm_predictions, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report
print(
    f"Metrics for LSTM Prediction:",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)


In [None]:
# GNN predictions from the cleaned table.
gnn_predictions = df_cleaned['GNN_prediction']

# Accuracy plus binary-averaged precision / recall / F1.
accuracy = accuracy_score(real_labels, gnn_predictions)
precision, recall, f1 = (
    scorer(real_labels, gnn_predictions, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report
print(
    f"Metrics for GNN Prediction:",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)


In [None]:
import matplotlib.pyplot as plt

# Metrics to plot.
# The Logistic scores are recomputed here rather than read from the
# notebook-level accuracy/precision/recall/f1 variables: every per-model
# metrics cell overwrites those names, so after the GNN cell has run they hold
# GNN scores and this chart (titled "Logistic") would display the wrong model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
logistic_true = df_cleaned['real_label']
logistic_pred = df_cleaned['Logistic_prediction']
values = [
    accuracy_score(logistic_true, logistic_pred),
    precision_score(logistic_true, logistic_pred, average='binary'),
    recall_score(logistic_true, logistic_pred, average='binary'),
    f1_score(logistic_true, logistic_pred, average='binary'),
]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color=['blue', 'green', 'red', 'cyan'])
plt.ylim(0, 1)  # Setting the y-axis limits to be between 0 and 1 since our metrics are in that range
plt.title('Metrics for Logistic Prediction')
plt.ylabel('Score')
plt.xlabel('Metrics')

# Displaying the value of each metric on top of each bar
for i, v in enumerate(values):
    plt.text(i, v + 0.02, f"{v:.2f}", ha='center', va='bottom', fontweight='bold')

# Show plot
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Metrics for LSTM, computed from the cleaned table instead of being copied
# by hand from earlier output, so the chart always reflects the current data.
lstm_true = df_cleaned['real_label']
lstm_pred = df_cleaned['LSTM_prediction']
accuracy_lstm = accuracy_score(lstm_true, lstm_pred)
precision_lstm = precision_score(lstm_true, lstm_pred, average='binary')
recall_lstm = recall_score(lstm_true, lstm_pred, average='binary')
f1_lstm = f1_score(lstm_true, lstm_pred, average='binary')

# Lists to hold metrics and their values for LSTM
metrics_lstm = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
values_lstm = [accuracy_lstm, precision_lstm, recall_lstm, f1_lstm]

# Plot
plt.figure(figsize=(10, 6))
plt.bar(metrics_lstm, values_lstm, color=['blue', 'green', 'red', 'cyan'])
plt.ylim(0, 1)  # Setting the y-axis limits to be between 0 and 1 since our metrics are in that range
plt.title('Metrics for LSTM Prediction')
plt.ylabel('Score')
plt.xlabel('Metrics')

# Displaying the value of each metric on top of each bar
for i, v in enumerate(values_lstm):
    plt.text(i, v + 0.02, f"{v:.2f}", ha='center', va='bottom', fontweight='bold')

# Show plot
plt.show()


In [None]:
# Metrics for GNN, computed explicitly from the cleaned table. Reading the
# shared accuracy/precision/recall/f1 variables only works when the GNN
# metrics cell happens to be the last metrics cell executed.
gnn_true = df_cleaned['real_label']
gnn_pred = df_cleaned['GNN_prediction']
accuracy_gnn = accuracy_score(gnn_true, gnn_pred)
precision_gnn = precision_score(gnn_true, gnn_pred, average='binary')
recall_gnn = recall_score(gnn_true, gnn_pred, average='binary')
f1_gnn = f1_score(gnn_true, gnn_pred, average='binary')

# Lists to hold metrics and their values for GNN
metrics_gnn = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
values_gnn = [accuracy_gnn, precision_gnn, recall_gnn, f1_gnn]

# Plot
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.bar(metrics_gnn, values_gnn, color=['blue', 'green', 'red', 'cyan'])
plt.ylim(0, 1)  # Setting the y-axis limits to be between 0 and 1 since our metrics are in that range
plt.title('Metrics for GNN Prediction')
plt.ylabel('Score')
plt.xlabel('Metrics')

# Displaying the value of each metric on top of each bar
for i, v in enumerate(values_gnn):
    plt.text(i, v + 0.02, f"{v:.2f}", ha='center', va='bottom', fontweight='bold')

# Show plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Heatmap display of a confusion matrix
def plot_confusion_matrix(y_true, y_pred, title):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Rows are true labels and columns are predicted labels; both axes are
    ticked 'Negative' / 'Positive'. ``title`` is used as the figure title.
    """
    class_names = ['Negative', 'Positive']
    counts = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )

    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    ax.set_title(title)
    plt.show()

# Show one confusion matrix per model, in the order Logistic, LSTM, GNN.
for pred_col, cm_title in (
    ('Logistic_prediction', "Confusion Matrix for Logistic Prediction"),
    ('LSTM_prediction', "Confusion Matrix for LSTM Prediction"),
    ('GNN_prediction', "Confusion Matrix for GNN Prediction"),
):
    plot_confusion_matrix(df_cleaned['real_label'], df_cleaned[pred_col], cm_title)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Heatmap of a confusion matrix, written to an image file instead of shown
def save_confusion_matrix(y_true, y_pred, title, filename):
    """Render the confusion matrix of ``y_pred`` against ``y_true`` and save it to ``filename``.

    Rows are true labels and columns are predicted labels; both axes are
    ticked 'Negative' / 'Positive'. The figure is closed after saving.
    """
    class_names = ['Negative', 'Positive']
    counts = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )

    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    ax.set_title(title)
    fig.tight_layout()  # keep axis labels from being clipped in the saved image
    fig.savefig(filename)
    plt.close(fig)  # release the figure once it is on disk

# Write the Logistic model's confusion matrix to logistic_cm.png.
save_confusion_matrix(
    y_true=df_cleaned['real_label'],
    y_pred=df_cleaned['Logistic_prediction'],
    title="Confusion Matrix for Logistic Prediction",
    filename="logistic_cm.png",
)


In [None]:
# Write the LSTM model's confusion matrix to lstm_cm.png.
save_confusion_matrix(
    y_true=df_cleaned['real_label'],
    y_pred=df_cleaned['LSTM_prediction'],
    title="Confusion Matrix for LSTM Prediction",
    filename="lstm_cm.png",
)


In [None]:
# Write the GNN model's confusion matrix to gnn_cm.png.
save_confusion_matrix(
    y_true=df_cleaned['real_label'],
    y_pred=df_cleaned['GNN_prediction'],
    title="Confusion Matrix for GNN Prediction",
    filename="gnn_cm.png",
)
