In [None]:
import os
import numpy as np

os.getcwd()


In [None]:
!ls

In [None]:
!cd analysis

In [None]:
os.getcwd()

In [None]:
# !python3 -m venv analysis-venv
# NOTE(review): `!` runs the command in a separate shell process, so this
# activation cannot change the interpreter or packages of the running kernel.
# Presumably the venv has to be activated before Jupyter is launched — confirm.
!source analysis-venv/bin/activate


In [None]:
# %pip uninstall scikit-learn imbalanced-learn
%pip install scikit-learn==1.3.0 imbalanced-learn==0.11.0
%pip install emoji


In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from IPython.display import display
import re
import emoji
import joblib
import random

In [None]:
model_dir_name = 'flan-t5-base'

In [None]:
# Seed NumPy's legacy global RNG and the stdlib `random` module with one value.
random_state = 42
np.random.seed(random_state)
random.seed(random_state)
import os
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(random_state)

In [None]:
# Conversation-level dataset (one row per issue), e.g.
# github-toxic/dataset/derailment-paper-data/unified_final_dataset_conversations.csv
file_path = '/path/to/input/files'  # replace with real path

df_unified_conv = pd.read_csv(file_path)
df_unified_conv = df_unified_conv.rename(columns={
    "speaker_text": "conversation"
})

# df_unified_conv2 = pd.read_csv(file_path)
# df_unified_conv2['is_toxic']=1


# df_unified_conv =  pd.concat([df_unified_conv, df_unified_conv2], axis=0).reset_index(drop=True)
# issue_id is the join key for every feature table, so keep it as a string.
df_unified_conv['issue_id'] = df_unified_conv['issue_id'].astype(str)
df_unified_conv

In [None]:
second_person_pronouns = ['you', 'your', 'yours', 'yourself', 'yourselves']

# Whole-word, case-insensitive matcher. Word boundaries (instead of whitespace
# splitting) let "you," / "your!" / "(you)" count, while "youth" does not.
_SECOND_PERSON_PRONOUN_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in second_person_pronouns) + r")\b",
    re.IGNORECASE,
)


def count_second_person_pronouns(text):
    """
    Count second-person pronouns in a piece of text.

    Args:
        text (str): The text to scan.

    Returns:
        int: Number of whole-word occurrences of any entry of
        ``second_person_pronouns`` (case-insensitive). Non-string input
        yields 0.
    """
    if not isinstance(text, str):
        return 0
    return len(_SECOND_PERSON_PRONOUN_RE.findall(text))

In [None]:
# Comment-level dataset: one row per comment, keyed by issue_id.
file_path = '/path/to/input/files'  # replace with real path
df_unified_comment_data = pd.read_csv(file_path)
# df_unified_comment_data = pd.read_csv(file_path)

df_unified_comment_data

In [None]:
# Fill missing comments BEFORE casting to str: casting first turns NaN into the
# literal string 'nan', which made the old fillna('') a no-op.
df_unified_comment_data['text'] = df_unified_comment_data['text'].fillna('').astype(str)
df_unified_comment_data['pronoun_count'] = df_unified_comment_data['text'].apply(count_second_person_pronouns)
df_unified_comment_data['comment_length'] = df_unified_comment_data['text'].str.len()


In [None]:

# First, group by 'issue_id' and 'speaker_name' and count the comments per speaker
speaker_comment_counts = df_unified_comment_data.groupby(['issue_id', 'speaker']).size()
speaker_comment_counts

In [None]:
max_comments_per_issue_by_speakers = speaker_comment_counts.groupby(level=0).max()
max_comments_per_issue_by_speakers

In [None]:
def _count_characters(text, characters):
    """Return how many times any of `characters` occurs in `text` (0 for non-strings)."""
    # An explicit type check replaces the old blanket `except Exception`, which
    # hid every kind of error rather than only "this value is not text".
    if not isinstance(text, str):
        return 0
    return sum(text.count(character) for character in characters)


def count_at_signs(text):
    """Count '@' characters in `text`; None, NaN and other non-strings count as 0."""
    return _count_characters(text, '@')


def count_quotes(text):
    """Count single-quote, double-quote and backtick characters in `text` (0 for non-strings)."""
    return _count_characters(text, '\'"`')


def count_hashes(text):
    """Count '#' characters in `text` (0 for non-strings)."""
    return _count_characters(text, '#')

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in str(text)."""
    tokens = str(text).split()
    return len(tokens)

def count_quote_markers(text):
    """Return 1 when str(text) contains the marker "> ", otherwise 0."""
    return int("> " in str(text))


# Median number of comments posted by a single speaker within each issue
# (the per-issue maximum is computed in an earlier cell).
median_comments_per_issue_by_speakers = speaker_comment_counts.groupby(level=0).median()

# Compute every per-comment measurement once, then aggregate with built-in
# reducers. The previous lambdas re-ran count_words three times per group.
per_comment_features = df_unified_comment_data.assign(
    _at_count=df_unified_comment_data['text'].apply(count_at_signs),
    _quote_count=df_unified_comment_data['text'].apply(count_quotes),
    _word_count=df_unified_comment_data['text'].apply(count_words),
    _has_quote_marker=df_unified_comment_data['text'].apply(count_quote_markers),
)

# Group by 'issue_id' and aggregate
aggregated_data = per_comment_features.groupby('issue_id').agg(
    comment_counts=('comment_unique_id', 'count'),  # Count of comments
    unique_speakers=('speaker', 'nunique'),  # Count of unique speaker names
    total_second_person_pronouns=('pronoun_count', 'sum'),
    max_comment_length=('comment_length', 'max'),  # Maximum length of comments
    total_ats=('_at_count', 'sum'),  # Sum of '@' in all comments
    total_quotes=('_quote_count', 'sum'),  # Sum of quote characters in all comments
    median_words_in_comment=('_word_count', 'median'),
    std_dev_words_in_comment=('_word_count', 'std'),  # NaN for single-comment issues
    max_words_in_comment=('_word_count', 'max'),
    total_previous_comment_mentions=('_has_quote_marker', 'sum'),  # Count of comments with "> "
)

# The index is still issue_id here, so the per-issue speaker statistics can be
# looked up directly.
aggregated_data['max_comments_by_one_speaker'] = aggregated_data.index.map(max_comments_per_issue_by_speakers)
aggregated_data['median_comments_by_one_speaker'] = aggregated_data.index.map(median_comments_per_issue_by_speakers)

# Make 'issue_id' a regular column again so the frame can be merged on it.
aggregated_data = aggregated_data.reset_index()

aggregated_data

In [None]:
def check_is_closed(group):
    """Return 1 if either of the group's last two 'text' values contains the whole word "close" (any case), else 0."""
    closing_hits = group['text'].tail(2).str.contains(r'\bclose\b', case=False)
    return int(closing_hits.any())

# Group by issue_id and apply the function
aggregated_data2 = df_unified_comment_data.groupby('issue_id').apply(
    lambda group: pd.Series({'is_issue_closed': check_is_closed(group)})
).reset_index()


def count_emojis(text):
    """
    Count the characters of a text string that are keys of ``emoji.EMOJI_DATA``.

    The string is inspected one character at a time, so each matching
    character contributes 1 to the result.

    Args:
        text (str): The text string to analyze (e.g., GitHub issue comment)

    Returns:
        int: Number of characters found in ``emoji.EMOJI_DATA``
    """
    return sum(1 for character in text if character in emoji.EMOJI_DATA)

def calculate_emoji_metrics(group):
    """
    Summarise emoji usage across the comments of one issue.

    Args:
        group (pandas.DataFrame): Rows of a single issue; must have a 'text' column.

    Returns:
        pandas.Series: 'emoji_count' (total over all comments),
        'total_comment_with_emoji' (comments with at least one emoji) and
        'total_comment_with_emoji_ratio' (that count divided by len(group)).
    """
    # Scan every comment once and derive both totals from the same counts;
    # the previous version called count_emojis twice per comment.
    emoji_counts = group['text'].apply(count_emojis)
    total_comments_with_emoji = (emoji_counts > 0).sum()

    return pd.Series({
        'emoji_count': emoji_counts.sum(),
        'total_comment_with_emoji': total_comments_with_emoji,
        'total_comment_with_emoji_ratio': total_comments_with_emoji / len(group)
    })

# Group by issue_id and apply the function
emoji_metrics = df_unified_comment_data.groupby('issue_id').apply(calculate_emoji_metrics).reset_index()

# Merge with the aggregated_data2
aggregated_data2 = pd.merge(aggregated_data2, emoji_metrics, on='issue_id')

# aggregated_data2['code_of_conduct_mentioned'] = df_unified_comment_data.groupby('issue_id')['text'].apply(
#     lambda comments: int(comments.str.contains("code of conduct", case=False).any())
# ).reset_index(drop=True)

def get_first_coc_mention_idx(comments):
    """
    Locate the first comment of a conversation that mentions "code of conduct".

    Args:
        comments (pandas.Series): Comment texts of one issue, in conversation order.

    Returns:
        int: 1-based position of the first matching comment within `comments`,
        or 0 when no comment mentions it.
    """
    mask = comments.str.contains("code of conduct", case=False, na=False)
    hits = mask.to_numpy(dtype=bool)
    if not hits.any():
        return 0
    # Use the position inside this conversation. `idxmax()` would return the
    # row label of the full comment table, which is unrelated to the comment's
    # place in the issue.
    return int(hits.argmax()) + 1

# First code-of-conduct mention per issue, indexed by issue_id.
first_coc_mention_by_issue = df_unified_comment_data.groupby('issue_id')['text'].apply(
    get_first_coc_mention_idx
)

# Attach by issue_id, not by row position: a positional assignment is only
# correct while both frames happen to share the same row order.
aggregated_data2['code_of_conduct_mentioned_comment_idx'] = (
    aggregated_data2['issue_id'].map(first_coc_mention_by_issue)
)

aggregated_data2

In [None]:
# Technical features
def has_stack_trace(text):
    """Return True when `text` contains a frame-like fragment such as "at pkg.Class.method(File.java:1)"."""
    return re.search(r'at [a-zA-Z0-9\.<>$]+\(.+\)', text) is not None

def check_template(first_comment):
    """Return 1 when `first_comment` contains one of the known issue-template headings, else 0."""
    for heading in ('### Description', '## Description', '### Steps to reproduce', '## Steps to reproduce'):
        if heading in first_comment:
            return 1
    return 0

# Per-issue technical indicators
def calculate_technical_features(group):
    """
    Derive 0/1 technical indicators for the comments of one issue.

    Returns a Series with 'has_stack_trace' (any comment matches
    has_stack_trace), 'has_code' (any comment contains a fenced code block)
    and 'has_template' (check_template applied to the first comment).
    """
    comment_texts = group['text'].astype(str)
    opening_comment = "" if comment_texts.empty else comment_texts.iloc[0]

    stack_trace_found = comment_texts.apply(has_stack_trace).any()
    fenced_code_found = comment_texts.str.contains(r'```\w*\n.*?\n```', flags=re.DOTALL).any()

    features = {
        'has_stack_trace': int(stack_trace_found),
        'has_code': int(fenced_code_found),
        'has_template': check_template(opening_comment),
    }
    return pd.Series(features)

technical_feature_metrics = df_unified_comment_data.groupby('issue_id').apply(calculate_technical_features).reset_index()
aggregated_data2 = pd.merge(aggregated_data2, technical_feature_metrics, on='issue_id')

In [None]:
print(aggregated_data2['code_of_conduct_mentioned_comment_idx'].isnull().sum())  # Check for NaN/None


In [None]:
# A conversation is labelled toxic by an LLM when its score reaches this cut-off.
TOXICITY_SCORE_THRESHOLD = 0.3

# Llama toxicity scores/explanations, one row per issue.
file_path = '/path/to/input/files'  # replace with real path
df_prediction_explanation_llama = pd.read_csv(file_path)

df_prediction_explanation_llama = df_prediction_explanation_llama.rename(columns={
    'toxicity_score': 'toxicity_score_llama',
    'toxicity_explanation': 'toxicity_explanation_llama'
})

# Qwen toxicity scores/explanations, one row per issue.
file_path = '/path/to/input/files'  # replace with real path

df_prediction_explanation_qwen = pd.read_csv(file_path)
df_prediction_explanation_qwen = df_prediction_explanation_qwen.rename(columns={
    'toxicity_score': 'toxicity_score_qwen',
    'toxicity_explanation': 'toxicity_explanation_qwen'
})

# Inner join: only issues scored by both models are kept.
df_prediction_explanation = pd.merge(
    df_prediction_explanation_llama[['issue_id', 'toxicity_score_llama', 'toxicity_explanation_llama']],
    df_prediction_explanation_qwen[['issue_id', 'toxicity_score_qwen', 'toxicity_explanation_qwen']],
    on='issue_id')

# A NaN score compares as False here, so missing scores become "not toxic"
# rather than NaN.
df_prediction_explanation['is_toxic_llm_pred_llama'] = df_prediction_explanation['toxicity_score_llama'] >= TOXICITY_SCORE_THRESHOLD
df_prediction_explanation['is_toxic_llm_pred_qwen'] = df_prediction_explanation['toxicity_score_qwen'] >= TOXICITY_SCORE_THRESHOLD

df_prediction_explanation['avg_toxicity_score'] = (df_prediction_explanation['toxicity_score_llama'] + df_prediction_explanation['toxicity_score_qwen']) / 2

df_prediction_explanation['toxicity_score_diff'] = abs(df_prediction_explanation['toxicity_score_llama'] - df_prediction_explanation['toxicity_score_qwen'])

df_prediction_explanation



In [None]:
# Time-based features, one row per issue.
file_path = '/path/to/input/files'  # replace with real path

df_time_features = pd.read_csv(file_path)

df_time_features

In [None]:
# Tone features; tone_score_diff is first-half tone minus second-half tone.
file_path = '/path/to/input/files'  # replace with real path

df_tone_features = pd.read_csv(file_path)
df_tone_features['tone_score_diff'] = df_tone_features['first_half_tone'].astype(float) - df_tone_features['second_half_tone'].astype(float)
df_tone_features

In [None]:
file_path = '/path/to/input/files'  # replace with real path
df_text_descriptive = pd.read_csv(file_path)
# df_text_descriptive2 = pd.read_csv(file_path)
# df_text_descriptive =  pd.concat([df_text_descriptive, df_text_descriptive2], axis=0).reset_index(drop=True)


# Drop the label and raw text, then tag every feature column with a '_TD'
# suffix (issue_id stays unchanged because it is the join key).
df_text_descriptive = df_text_descriptive.drop(columns=['is_toxic', 'text'])
df_text_descriptive = df_text_descriptive.rename(columns={col: col + '_TD' if col != 'issue_id' else col for col in df_text_descriptive.columns})

df_text_descriptive

In [None]:
len(df_text_descriptive.columns)

In [None]:
file_path = '/path/to/input/files'  # replace with real path
df_text_content_features = pd.read_csv(file_path)
# df_text_content_features2 = pd.read_csv(file_path)
# df_text_content_features =  pd.concat([df_text_content_features, df_text_content_features2], axis=0).reset_index(drop=True)

# Remove the label and the raw conversation text so only features remain.
df_text_content_features = df_text_content_features.drop(columns=['is_toxic', 'speaker_text'])
df_text_content_features

In [None]:
# Emotion columns kept as features; the commented-out names are excluded.
emotions = [
    # 'admiration', 'amusement',
    'anger', 'annoyance', 'approval',
    # 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment',
    # 'excitement', 'fear', 'gratitude', 'grief',
    # 'joy', 'love', 'nervousness', 'neutral', 'optimism',
    'pride',
    # 'realization', 'relief', 'remorse',
    'sadness', 'surprise'
]

file_path = '/path/to/input/files'  # replace with real path

df_emotion_scores = pd.read_csv(file_path)
# df_emotion_scores2 = pd.read_csv(file_path)
# df_emotion_scores =  pd.concat([df_emotion_scores, df_emotion_scores2], axis=0).reset_index(drop=True)

# .copy() so the later issue_id cast assigns into an independent frame rather
# than a slice of the loaded one.
df_emotion_scores = df_emotion_scores[emotions + ['issue_id']].copy()
df_emotion_scores

In [None]:
# SentiCR sentiment-polarity features, one row per issue.
file_path = '/path/to/input/files'  # replace with real path
df_sentiment_polarity_senticr = pd.read_csv(file_path)
# df_sentiment_polarity_senticr2 = pd.read_csv(file_path)
# df_sentiment_polarity_senticr =  pd.concat([df_sentiment_polarity_senticr, df_sentiment_polarity_senticr2], axis=0).reset_index(drop=True)

df_sentiment_polarity_senticr

In [None]:
df_sentiment_polarity_senticr.columns

In [None]:
# Keep only the SentiCR columns used as features. .copy() makes the result an
# independent frame, because issue_id is re-assigned on it in a later cell.
df_sentiment_polarity_senticr = df_sentiment_polarity_senticr[[
    'issue_id', 'has_neg_comment_sentcr', 'non_neg_comment_ratio_sentcr',
    'neg_comment_ratio_sentcr', 'sentiment_transition_ratio_sentcr'
]].copy()


In [None]:
# TextBlob sentiment-polarity features, one row per issue.
file_path = '/path/to/input/files'  # replace with real path
df_sentiment_polarity_textblob = pd.read_csv(file_path)
# df_sentiment_polarity_textblob2 = pd.read_csv(file_path)
# df_sentiment_polarity_textblob =  pd.concat([df_sentiment_polarity_textblob, df_sentiment_polarity_textblob2], axis=0).reset_index(drop=True)

df_sentiment_polarity_textblob

In [None]:
# VADER sentiment-polarity features, one row per issue.
file_path = '/path/to/input/files'  # replace with real path
df_sentiment_polarity_vader = pd.read_csv(file_path)
# df_sentiment_polarity_vader2 = pd.read_csv(file_path)
# df_sentiment_polarity_vader =  pd.concat([df_sentiment_polarity_vader, df_sentiment_polarity_vader2], axis=0).reset_index(drop=True)

# Store the flag as an integer so it can be scaled like the other features.
df_sentiment_polarity_vader['has_neg_comment_vader'] = df_sentiment_polarity_vader['has_neg_comment_vader'].astype(int)
df_sentiment_polarity_vader

In [None]:
file_path = '/path/to/input/files'  # replace with real path
df_perplexity = pd.read_csv(file_path)
# df_perplexity2 = pd.read_csv(file_path)
# df_perplexity =  pd.concat([df_perplexity, df_perplexity2], axis=0).reset_index(drop=True)

# NOTE(review): the target column name is misspelled ("conversastion"). It is
# left as-is because saved feature lists may reference this exact name — confirm
# before renaming.
df_perplexity = df_perplexity.rename(columns={
    'perplexity': 'conversastion_perplexity'
})
df_perplexity

In [None]:
# Label probabilities given the conversation only (renamed to *_conv below).
file_path = '/path/to/input/files'  # replace with real path
df_explanation_features_given_conv = pd.read_csv(file_path)

df_explanation_features_given_conv

In [None]:
# Label probabilities given conversation + explanation (renamed to *_conv_exp below).
file_path = '/path/to/input/files'  # replace with real path
df_explanation_features_given_conv_exp = pd.read_csv(file_path)
# df_explanation_features_given_conv_exp2 = pd.read_csv(file_path)
# df_explanation_features_given_conv_exp =  pd.concat([df_explanation_features_given_conv_exp, df_explanation_features_given_conv_exp2], axis=0).reset_index(drop=True)

df_explanation_features_given_conv_exp

In [None]:
# Label probabilities given the explanation only (renamed to *_exp below).
file_path = '/path/to/input/files'  # replace with real path
df_explanation_features_given_exp = pd.read_csv(file_path)
# df_explanation_features_given_exp2 = pd.read_csv(file_path)
# df_explanation_features_given_exp =  pd.concat([df_explanation_features_given_exp, df_explanation_features_given_exp2], axis=0).reset_index(drop=True)

df_explanation_features_given_exp

In [None]:

df_explanation_features_given_conv = df_explanation_features_given_conv.rename(columns=
                                     {
                                         'toxic_label_probability': 'toxic_prob_conv',
                                         'non_toxic_label_probability': 'non_toxic_prob_conv',
                                     })
df_explanation_features_given_conv

In [None]:

df_explanation_features_given_conv_exp = df_explanation_features_given_conv_exp.rename(columns=
                                                 {
                                                     'toxic_label_probability': 'toxic_prob_conv_exp',
                                                     'non_toxic_label_probability': 'non_toxic_prob_conv_exp',
                                                 })
df_explanation_features_given_conv_exp

In [None]:

df_explanation_features_given_exp = df_explanation_features_given_exp.rename(columns=
                                     {
                                         'toxic_label_probability': 'toxic_prob_exp',
                                         'non_toxic_label_probability': 'non_toxic_prob_exp',
                                     })
df_explanation_features_given_exp

In [None]:
# Logit/entropy statistics of the generated explanations, one row per issue.
file_path = '/path/to/input/files'  # replace with real path
df_explanation_logits_features = pd.read_csv(file_path)
# df_explanation_logits_features2 = pd.read_csv(file_path)
# df_explanation_logits_features2=df_explanation_logits_features2.rename(columns={
#     'input_id':"issue_id"
# })
# df_explanation_logits_features2['is_toxic']=1
# df_explanation_logits_features =  pd.concat([df_explanation_logits_features, df_explanation_logits_features2], axis=0).reset_index(drop=True)

df_explanation_logits_features

In [None]:
df_explanation_logits_features = df_explanation_logits_features.rename(columns=
                                     {
                                         'first_token_logit': 'exp_first_token_logit',
                                         'first_token_entropy': 'exp_first_token_entropy',
                                         'avg_entropy': 'exp_avg_entropy',
                                         'avg_logits': 'exp_avg_logits'
                                     })
df_explanation_logits_features

In [None]:
# Join the three probability tables and the logit table on issue_id (inner
# joins), then discard a stray CSV index column if one is present.
df_explanation_features = (
    df_explanation_features_given_conv_exp
    .merge(df_explanation_features_given_conv, on='issue_id')
    .merge(df_explanation_features_given_exp, on='issue_id')
    .merge(df_explanation_logits_features, on='issue_id')
    .drop(columns='Unnamed: 0', errors='ignore')
)
df_explanation_features

In [None]:
# Every table is joined on issue_id, so give the key the same (string) dtype
# everywhere before merging.
for keyed_frame in (
    df_prediction_explanation,
    aggregated_data2,
    df_time_features,
    df_perplexity,
    aggregated_data,
    df_emotion_scores,
    df_sentiment_polarity_senticr,
    df_sentiment_polarity_textblob,
    df_sentiment_polarity_vader,
    df_text_descriptive,
    df_text_content_features,
    df_explanation_features,
    df_unified_conv,
):
    keyed_frame['issue_id'] = keyed_frame['issue_id'].astype(str)

In [None]:
##########################################
# Build the master feature table: start from the per-issue comment aggregates
# and inner-join every feature table on issue_id, in this order (the order
# decides which side gets the _x/_y suffix when column names collide).
feature_frames = [
    df_prediction_explanation[[
        'issue_id', 'toxicity_score_llama', 'is_toxic_llm_pred_llama',
        'toxicity_score_qwen', 'is_toxic_llm_pred_qwen',
        'avg_toxicity_score', 'toxicity_score_diff'
    ]],
    # df_bert_score,
    df_time_features,
    df_perplexity,
    aggregated_data,
    # df_tone_features,
    df_emotion_scores,  # we may not use emotion scores
    df_sentiment_polarity_senticr,
    df_sentiment_polarity_textblob,
    df_sentiment_polarity_vader,
    df_text_descriptive,
    df_text_content_features,
    # df_label_logits_features,
    # df_outlier_score,
    df_explanation_features,
]

df_merged = aggregated_data2.copy()
for feature_frame in feature_frames:
    df_merged = pd.merge(df_merged, feature_frame, on='issue_id')

df_merged['issue_id'] = df_merged['issue_id'].astype(str)

# Attach the ground-truth label BY issue_id. Assigning
# df_unified_conv['is_toxic'] directly aligns on the row index, which only
# gives the right label when both frames happen to share row order and length.
labels_by_issue = (
    df_unified_conv
    .assign(issue_id=df_unified_conv['issue_id'].astype(str))
    .drop_duplicates(subset='issue_id')
    .set_index('issue_id')['is_toxic']
)
df_merged['is_toxic'] = df_merged['issue_id'].map(labels_by_issue)

df_merged



In [None]:
# errors='ignore' keeps this cell re-runnable and tolerant of inputs that were
# saved without a stray index column (otherwise drop raises KeyError).
df_merged = df_merged.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df_merged.columns.tolist()

## Create Verifier Label

In [None]:
print("Total toxic conversation is ", sum(df_merged['is_toxic']))
print("Total non-toxic conversation is ", len(df_merged['is_toxic'])-sum(df_merged['is_toxic']))

In [None]:
# Assuming `series` is your Pandas Series
indices_with_none = df_merged['is_toxic'][df_merged['is_toxic'].isna()].index.tolist()

# Display the result
indices_with_none

In [None]:
# Assuming `series` is your Pandas Series
indices_with_none = df_merged['is_toxic_llm_pred_llama'][df_merged['is_toxic_llm_pred_llama'].isna()].index.tolist()

# Display the result
indices_with_none

In [None]:
"""
Here df_merged['is_correct_llm_pred'] is the llm predicted class label
df_merged['is_toxic'] is the actual class label
"""
df_merged['is_correct_llm_pred']= df_merged['is_toxic']==df_merged['is_toxic_llm_pred_llama']
df_merged['is_correct_llm_pred']

In [None]:
print("Total correct prediction is ", sum(df_merged['is_correct_llm_pred']))
print("Total incorrect prediction is ", len(df_merged['is_correct_llm_pred'])-sum(df_merged['is_correct_llm_pred']))

In [None]:
# Number of truly toxic conversations among the rows the LLM got wrong
incorrectly_predicted = df_merged['is_correct_llm_pred'] == False
total_toxic_conv_that_are_incorrect = sum(df_merged.loc[incorrectly_predicted, 'is_toxic'])
total_toxic_conv_that_are_incorrect

In [None]:
# df_merged = df_merged.drop(columns=['is_toxic'])

In [None]:
print(len(df_merged.columns))
df_merged.columns.tolist()

## Removing NAN Containing Columns

In [None]:
nan_columns = df_merged.columns[df_merged.isna().any()].tolist()

print(nan_columns)

In [None]:
df_merged=df_merged.drop(columns=nan_columns)

In [None]:
string_columns = df_merged.select_dtypes(include=['object', 'string']).columns

print("String columns:", list(string_columns))

## Feature Standardization

In [None]:
# taking selected features
# feature_columns = [column for column in df_merged.columns if (column not in ['issue_id', 'is_correct_llm_pred', 'is_toxic']) and (column in selected_feature_names)]

# taking all features
feature_columns = [column for column in df_merged.columns if (column not in ['issue_id', 'is_correct_llm_pred', 'is_toxic'])]

feature_columns

In [None]:
from sklearn.preprocessing import StandardScaler
# Initialize the scaler
scaler = StandardScaler()

# NOTE: this fits on every row of df_merged (train and test together), not on a
# training split. In the visible cells df_standardized is only used to find
# columns that contain NaN after scaling; the model cells refit a scaler on train_df.
df_standardized = pd.DataFrame(scaler.fit_transform(df_merged[feature_columns]), columns = feature_columns)
df_standardized['is_correct_llm_pred'] = df_merged['is_correct_llm_pred']

In [None]:
nan_columns = df_standardized.columns[df_standardized.isna().any()].tolist()

print(nan_columns)

In [None]:
df_standardized=df_standardized.drop(columns=nan_columns)

df_standardized

In [None]:
feature_columns = [feature for feature in feature_columns if feature not in nan_columns]

## Taking Top Features

In [None]:
# Previously selected feature names (CSV with a 'features' column).
file_path = '/path/to/input/files'  # replace with real path
top_features_df = pd.read_csv(file_path)
prev_top_features = top_features_df['features'].to_list()


In [None]:
# from sklearn.feature_selection import mutual_info_classif

# mi_scores = mutual_info_classif(df_standardized[top_features], df_standardized['is_correct_llm_pred'], random_state=42)

# feature_importance = pd.DataFrame({
#     'Feature': top_features,
#     'Importance': mi_scores
# }).sort_values('Importance', ascending=False)

# feature_importance

In [None]:
prev_top_features= [feature for feature in prev_top_features if feature not in ['toxicity_score']]
prev_top_features

In [None]:
len(prev_top_features)

In [None]:
score_features=[
    'toxicity_score_llama',
 'toxicity_score_qwen',
 'is_toxic_llm_pred_llama',
 'is_toxic_llm_pred_qwen',
 'avg_toxicity_score',
 'toxicity_score_diff']

considered_features = score_features+[feature for feature in prev_top_features if feature not in score_features]
considered_features

In [None]:
# Final feature list used by the verifier model (CSV with a 'features' column).
file_path = '/path/to/input/files'  # replace with real path
feature_df = pd.read_csv(file_path)
top_features = feature_df['features'].tolist()
top_features

In [None]:
len(top_features)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def format_classification_report(y_test, y_pred):
    """
    Build sklearn's classification report as a DataFrame plus a styled view.

    Parameters:
    y_test: True labels
    y_pred: Predicted labels

    Returns:
    tuple: (pandas.DataFrame with one row per class/average and the columns
    precision, recall, f1-score, support; pandas Styler of the same table)
    """
    score_columns = ['precision', 'recall', 'f1-score']

    # Dict report -> frame rounded to 3 decimals -> one row per class/average.
    raw_report = classification_report(y_test, y_pred, output_dict=True)
    df_report = pd.DataFrame.from_dict(raw_report).round(3).T

    # Put the columns in a fixed, readable order.
    if 'precision' in df_report.columns:
        df_report = df_report[score_columns + ['support']]

    # Support is a count, so store it as an integer.
    if 'support' in df_report.columns:
        df_report['support'] = df_report['support'].astype(int)

    # Styled variant: blue gradient on the score columns, fixed number formats.
    number_formats = {'precision': '{:.3f}', 'recall': '{:.3f}', 'f1-score': '{:.3f}', 'support': '{:,d}'}
    styled_report = (
        df_report.style
        .background_gradient(subset=score_columns, cmap='Blues')
        .format(number_formats)
    )

    return df_report, styled_report


## Create Train Test Split

In [None]:
# Predefined split: issue ids that belong to the training set ...
file_path = '/path/to/input/files'  # replace with real path
train_issue_ids = pd.read_csv(file_path)['issue_id'].tolist()

# ... and to the test set.
file_path = '/path/to/input/files'  # replace with real path
test_issue_ids = pd.read_csv(file_path)['issue_id'].tolist()

# df_merged stores issue_id as str, so compare like with like.
train_issue_ids = [str(x) for x in train_issue_ids]
test_issue_ids = [str(x) for x in test_issue_ids]

In [None]:
train_df = df_merged[df_merged['issue_id'].isin(train_issue_ids)].reset_index(drop=True)
test_df = df_merged[df_merged['issue_id'].isin(test_issue_ids)].reset_index(drop=True)
train_df

In [None]:
# train_df.to_csv('./dataset/train_issue_ids_70_30_split_without_miller.csv', index = False)
# test_df.to_csv('./dataset/test_issue_ids_70_30_split_without_miller.csv', index = False)

In [None]:
test_df

In [None]:
train_df['issue_id'] = train_df['issue_id'].astype(str)
test_df['issue_id'] = test_df['issue_id'].astype(str)


In [None]:
# train_df = train_df.merge(df_merged[['issue_id', 'code_of_conduct_mentioned_comment_idx']], on='issue_id', how='left').reset_index(drop=True)
# test_df = test_df.merge(df_merged[['issue_id', 'code_of_conduct_mentioned_comment_idx']], on='issue_id', how='left').reset_index(drop=True)
# train_df

In [None]:
# train_df.to_csv('./dataset/train_data_70_30_split_v2.csv', index=False)
# test_df.to_csv('./dataset/test_data_70_30_split_v2.csv', index=False)


### Standardize the Train and Test Set

In [None]:
# Coerce the column to numeric in BOTH splits, so the test set is prepared
# exactly like the training set before scaling (unparseable values become NaN).
for split_df in (train_df, test_df):
    split_df['code_of_conduct_mentioned_comment_idx'] = pd.to_numeric(
        split_df['code_of_conduct_mentioned_comment_idx'], errors='coerce'
    )

In [None]:
print(train_df['code_of_conduct_mentioned_comment_idx'].std())

In [None]:
if 'symbol_to_word_ratio_#_TD' in df_merged.columns: 
    print("min value:", df_merged['symbol_to_word_ratio_#_TD'].min() )
    print("max value:", df_merged['symbol_to_word_ratio_#_TD'].max() )
    print("median:",df_merged['symbol_to_word_ratio_#_TD'].median() )
    print("std:", df_merged['symbol_to_word_ratio_#_TD'].std() )# as std is infinite we need to drop it 

In [None]:
print(train_df['code_of_conduct_mentioned_comment_idx'].isnull().sum())  # Check for NaN/None
print(train_df['code_of_conduct_mentioned_comment_idx'].dtype)          # Check data type

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/variance from the training split only.
scaler = StandardScaler()
scaler.fit(train_df[top_features])

# Save the scaler and model for later use
# joblib.dump(scaler, './verifier_randomforest_models/standardScaler_70_30_split_explainable_features_reduced_with_qwen_final.pkl')

# Apply the same fitted transformation to both splits (the test split is never refit).
X_train_scaled_df = pd.DataFrame(scaler.transform(train_df[top_features]), columns=top_features)
X_test_scaled_df = pd.DataFrame(scaler.transform(test_df[top_features]), columns=top_features)

# Carry the verifier target alongside the scaled features.
X_train_scaled_df['is_correct_llm_pred'] = train_df['is_correct_llm_pred']
X_test_scaled_df['is_correct_llm_pred'] = test_df['is_correct_llm_pred']

print("Scaled Train Set:\n", X_train_scaled_df)
print("Scaled Test Set:\n", X_test_scaled_df)



In [None]:
nan_columns = X_train_scaled_df.columns[X_train_scaled_df.isna().any()].tolist()

print(nan_columns)

## Random Forest Classifier 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

def train_and_evaluate_RF(X_train, X_test, y_train, y_test, random_state=42):
    """
    Train a Random Forest on an SVMSMOTE-oversampled train set and evaluate on the test set.

    Parameters:
    X_train (pandas.DataFrame): Training features
    X_test (pandas.DataFrame): Test features
    y_train (pandas.Series): Training labels (two classes)
    y_test (pandas.Series): Test labels
    random_state (int): Seed used for both the oversampler and the forest

    Returns:
    tuple: (trained model,
            DataFrame with columns actual / predicted_binary /
            probability_class_0 / probability_class_1 / index,
            dict with accuracy, roc_auc and per-class precision/recall)

    Raises:
    ValueError: if y_train contains fewer than two distinct classes.
    """
    # Work on renamed copies so the caller's frames keep their original column
    # labels (the previous version overwrote `.columns` on the inputs).
    X_train = X_train.rename(columns=str)
    X_test = X_test.rename(columns=str)

    # Class names as strings, because classification_report keys its dict by str(label).
    classes = sorted(y_train.unique())
    if len(classes) < 2:
        raise ValueError(
            f"y_train must contain two classes to train the verifier, found: {classes}"
        )
    class_0 = str(classes[0])
    class_1 = str(classes[1])

    # Print class distribution before oversampling
    true_class_instances = sum(y_train)
    print("True class samples in train set: ", true_class_instances)
    print("False class samples in train set: ", len(y_train)-true_class_instances)

    # Oversample the minority class on the TRAIN split only. Alternatives tried
    # earlier: RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE.
    sm = SVMSMOTE(random_state=random_state, k_neighbors=10)
    X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

    # Print class distribution after oversampling
    true_class_instances = sum(y_train_resampled)
    print("True class samples in train set after oversampling: ", true_class_instances)
    print("False class samples in train set after oversampling: ", len(y_train_resampled)-true_class_instances)

    # Train model (default hyper-parameters; max_depth/min_samples_split left unset)
    rf_model = RandomForestClassifier(
        random_state=random_state,
    )
    rf_model.fit(X_train_resampled, y_train_resampled)

    # Get predictions on test set; probability columns follow rf_model.classes_ order
    y_pred_binary = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)

    # Store results
    results = pd.DataFrame({
        'actual': y_test,
        'predicted_binary': y_pred_binary,
        'probability_class_0': y_pred_proba[:, 0],
        'probability_class_1': y_pred_proba[:, 1],
        'index': X_test.index
    })

    # Get classification report
    report_dict = classification_report(y_test, y_pred_binary, output_dict=True)

    # Calculate metrics
    metrics = {
        'accuracy': (y_pred_binary == y_test).mean(),
        'roc_auc': roc_auc_score(y_test, y_pred_proba[:, 1]),
        f'class_{class_0}_precision': report_dict[class_0]['precision'],
        f'class_{class_0}_recall': report_dict[class_0]['recall'],
        f'class_{class_1}_precision': report_dict[class_1]['precision'],
        f'class_{class_1}_recall': report_dict[class_1]['recall']
    }

    # Print results
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_binary))
    print("\nMetrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.3f}")

    return rf_model, results, metrics

# Example usage:
"""
# Assuming you have X_train, X_test, y_train, y_test ready
feature_columns = ['feature1', 'feature2', 'feature3']  # Replace with your feature columns
X_train = train_df[feature_columns]
y_train = train_df['is_correct_llm_pred']
X_test = test_df[feature_columns]
y_test = test_df['is_correct_llm_pred']

model, results, metrics = train_and_evaluate_RF(X_train, X_test, y_train, y_test)
"""

In [None]:
X_train_scaled_df['is_correct_llm_pred']

In [None]:



model, predictions, metrics = train_and_evaluate_RF(X_train_scaled_df[top_features], X_test_scaled_df[top_features], X_train_scaled_df['is_correct_llm_pred'], X_test_scaled_df['is_correct_llm_pred'])

# To see detailed metrics for each fold
print("\nDetailed metrics by fold:")
print(metrics)

# To get predictions for a specific fold
predictions

In [None]:
metrics

In [None]:
predictions=predictions.sort_values(by='index', ascending=True)
predictions

In [None]:
# Sweep decision thresholds 0.1 ... 0.9 over the predicted probability of the
# True class. np.arange accumulates float error (e.g. 0.30000000000000004),
# which shifts `>=` comparisons and clutters the output, so round to 1 decimal.
thresholds = np.round(np.arange(0.1, 1, 0.1), 1)
print(thresholds)

threshold_results = []

for threshold in thresholds:
    is_correct = predictions['probability_class_1'] >= threshold

    print(f"Threshold {threshold}")

    df_report, _ = format_classification_report(predictions['actual'], is_correct)

    threshold_results.append({
        'Threshold': threshold,
        'False_precision': df_report.loc['False', 'precision'].item(),
        'False_recall': df_report.loc['False', 'recall'].item(),
        # Key was misspelled 'Flase_f1_score'; corrected to match its siblings.
        'False_f1_score': df_report.loc['False', 'f1-score'].item(),
        'True_precision': df_report.loc['True', 'precision'].item(),
        'True_recall': df_report.loc['True', 'recall'].item(),
        'True_f1_score': df_report.loc['True', 'f1-score'].item(),
        'weighted_avg_f1_score': df_report.loc['weighted avg', 'f1-score'].item(),
        # Every column of the 'accuracy' row holds the accuracy value.
        'accuracy': df_report.loc['accuracy', 'f1-score'].item()
    })
    print(df_report)
    print("\n******************************")

In [None]:
df = pd.DataFrame(threshold_results)
df

### Feature Importance of Model

In [None]:
def plot_feature_importance(model, feature_names):
    """
    Draw a bar chart of a fitted model's feature importances, largest first.

    Args:
        model: Trained model exposing `feature_importances_`
        feature_names: Feature names, in the order the model was trained on

    Returns:
        pandas.DataFrame with columns 'feature' and 'importance', sorted by
        importance in descending order
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    importances = model.feature_importances_

    # Pair each name with its importance and rank them.
    ranked = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_imp = ranked.sort_values('importance', ascending=False)

    # One bar per feature, in ranked order.
    positions = range(len(importances))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, feature_imp['importance'])
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_imp['feature'], rotation=45, ha='right')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    ax.set_title('Feature Importance in Random Forest Model')
    fig.tight_layout()
    plt.show()

    return feature_imp

In [None]:
plot_feature_importance(model, top_features)