<a href="https://colab.research.google.com/github/suleiman-odeh/NLP_Project_Team16/blob/main/01_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
"""
Load the dataset and separate it into train and test sets
"""
import pandas as pd
from datasets import load_dataset
# Load the dataset by its identifier; the result is indexed by split name below.
dataset = load_dataset("ailsntua/QEvasion")

# Convert each split to a DataFrame and tag every row with the split it came
# from, so the origin is still known once the two frames are stacked.
train_df = dataset['train'].to_pandas().assign(split_type='train')
test_df = dataset['test'].to_pandas().assign(split_type='test')

# One combined frame (train rows first, then test rows) with a fresh row index.
df = pd.concat([train_df, test_df], ignore_index=True)

# Quick sanity report: sizes of each part and the available columns.
print(f"Total train set loaded: {len(train_df)}")
print(f"Total test set loaded: {len(test_df)}")
print(f"Total combined set loaded: {len(df)}")
print("Columns found:", df.columns.tolist())

Total train set loaded: 3448
Total test set loaded: 308
Total combined set loaded: 3756
Columns found: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label', 'split_type']


In [7]:
"""
Check a sample of messy rows
"""
# Values of the dataset's own 'index' column to inspect (not DataFrame positions).
row_numbers = [3268, 1987, 2114, 1873, 3191]
samples = df.loc[df['index'].isin(row_numbers)]

print("--- RAW ANSWERS (By ID) ---")
# Only two columns are printed, so walk those two in lockstep.
for answer_id, raw_answer in zip(samples['index'], samples['interview_answer']):
    print(f"\nID: {answer_id}")
    print(raw_answer)
    print("-" * 50)

--- RAW ANSWERS (By ID) ---

ID: 1873
So, first of ll, on TPP, Angel, I hven't been round s long s Sentors Crper or Secretry Kerry, but I've spent enough tie in the Sente to know tht every trde del is pinful, becuse folks re lwys seeing if they cn get n even better del. And especilly when you hve ultiple prties involved, folks re going to be scrutinizing it, they're going to be debting it, nd in n election yer, you cn nticipte tht soe folks re going to try to score politicl points off it.Hving sid tht, I rein confident we're going to get it done. And the reson I' confident is becuse it's the right thing to do. It's good for the country. It's good for Aeric. It's good for the regio It's good for the world.And I know I've sold this to you before, but let e reiterte: This is the fstest growing prt of the world. This represents n enorous rket for the United Sttes. Most countries here lredy sell their stuff to the United Sttes, nd we hve reltively low triffs. In other words, we put reltivel

In [8]:
"""
Clean the text
"""
import re

def clean_text(text):
    """Normalise one transcript answer into plain, evenly spaced text.

    The steps run in this order: drop bracketed spans, turn em dashes and
    double hyphens into ", ", repair the known "hven't" typo, collapse
    whitespace runs, and remove whitespace that sits before punctuation.

    Args:
        text: The raw answer. Any value that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Bracketed spans are matched non-greedily; DOTALL lets one span cross
    # line breaks.
    cleaned = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)

    # "The—all right" becomes "The, all right"; "--" is treated the same way.
    for dash in ('—', '--'):
        cleaned = cleaned.replace(dash, ', ')

    # Data analysis turned up "haven't" written with its vowel missing.
    cleaned = re.sub(r"\bhven't\b", "haven't", cleaned)

    # The removals above leave stray whitespace behind: squeeze every run to a
    # single space, trim the ends, then close the gap before punctuation.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return re.sub(r'\s+([,.!?;:])', r'\1', cleaned)

# Run the cleaner over every answer and keep the result in a new column,
# leaving the raw 'interview_answer' column untouched.
print("Cleaning data...")
df['cleaned_answer'] = df['interview_answer'].map(clean_text)

Cleaning data...


In [9]:
"""
Check a sample of cleaned rows
"""
# Values of the dataset's own 'index' column to inspect (not DataFrame positions).
row_numbers = [3268, 1987, 2114, 1873, 3191]
samples = df[df['index'].isin(row_numbers)]

# This cell prints the 'cleaned_answer' column, so the header says "CLEANED";
# the previous "RAW ANSWERS" header was carried over from the raw-preview cell.
print("--- CLEANED ANSWERS (By ID) ---")
for _, row in samples.iterrows():
    print(f"\nID: {row['index']}")
    print(row['cleaned_answer'])
    print("-" * 50)

--- RAW ANSWERS (By ID) ---

ID: 1873
So, first of ll, on TPP, Angel, I haven't been round s long s Sentors Crper or Secretry Kerry, but I've spent enough tie in the Sente to know tht every trde del is pinful, becuse folks re lwys seeing if they cn get n even better del. And especilly when you hve ultiple prties involved, folks re going to be scrutinizing it, they're going to be debting it, nd in n election yer, you cn nticipte tht soe folks re going to try to score politicl points off it.Hving sid tht, I rein confident we're going to get it done. And the reson I' confident is becuse it's the right thing to do. It's good for the country. It's good for Aeric. It's good for the regio It's good for the world.And I know I've sold this to you before, but let e reiterte: This is the fstest growing prt of the world. This represents n enorous rket for the United Sttes. Most countries here lredy sell their stuff to the United Sttes, nd we hve reltively low triffs. In other words, we put reltive

In [10]:
# Distribution of the raw string labels. value_counts() leaves out missing
# (NaN) entries by default, so rows without a label are not counted here.

print("\n--- Label Distribution (Task 1) ---")
# Counts for the 3-label 'clarity_label'
clarity_counts = df['clarity_label'].value_counts()
print(clarity_counts)

print("\n--- Label Distribution (Task 2) ---")
# Counts for the 9-label 'evasion_label'
evasion_counts = df['evasion_label'].value_counts()
print(evasion_counts)

# Per-annotator counts, keyed by column name. A single loop replaces three
# copy-pasted stanzas, one of which stored the annotator3 counts in a variable
# named after annotator2.
annotator_counts = {}
for annotator_col in ('annotator1', 'annotator2', 'annotator3'):
    annotator_counts[annotator_col] = df[annotator_col].value_counts()
    print(f"\n--- {annotator_col.capitalize()} Distribution ---")
    print(annotator_counts[annotator_col])


--- Label Distribution (Task 1) ---
clarity_label
Ambivalent         2246
Clear Reply        1131
Clear Non-Reply     379
Name: count, dtype: int64

--- Label Distribution (Task 2) ---
evasion_label
Explicit               1052
Dodging                 706
Implicit                488
General                 386
Deflection              381
                        308
Declining to answer     145
Claims ignorance        119
Clarification            92
Partial/half-answer      79
Name: count, dtype: int64

--- Annoter1 Distribution ---
annotator1
Explicit               106
Dodging                 58
Implicit                54
Deflection              30
General                 29
Declining to answer     10
Claims ignorance         9
Partial/half-answer      8
Clarification            4
Name: count, dtype: int64

--- Annoter2 Distribution ---
annotator2
General                78
Dodging                72
Implicit               54
Explicit               53
Deflection             22
Claims igno

In [11]:
"""
mapping labels to ids
"""
import json

# Id written for any row whose raw label is missing, blank, or absent from the
# relevant map.
UNMAPPED_ID = -1

# clarity map labels
clarity_map = {
    'Clear Reply': 0,
    'Ambivalent': 1,
    'Clear Non-Reply': 2
}

# evasion labels
evasion_map = {
    'Claims ignorance': 0,
    'Clarification': 1,
    'Declining to answer': 2,
    'Deflection': 3,
    'Dodging': 4,
    'Explicit': 5,
    'General': 6,
    'Implicit': 7,
    'Partial/half-answer': 8
}

# raw label column -> (id column to create, label-to-id map).
# The annotator columns are encoded with the evasion map.
label_columns = {
    'clarity_label': ('clarity_id', clarity_map),
    'evasion_label': ('evasion_id', evasion_map),
    'annotator1': ('annotator1_id', evasion_map),
    'annotator2': ('annotator2_id', evasion_map),
    'annotator3': ('annotator3_id', evasion_map),
}

# raw label column -> list of raw values that are present but not in the map.
unmapped_labels = {}
for label_col, (id_col, label_map) in label_columns.items():
    df[id_col] = df[label_col].map(label_map).fillna(UNMAPPED_ID).astype(int)

    # After fillna() the id column can never contain NaN, so an isnull() test
    # on it is always False. Mapping failures have to be found through the
    # sentinel instead.
    raw_unmapped = df.loc[df[id_col] == UNMAPPED_ID, label_col].dropna()
    # A blank or missing raw label means "no label given" and is not a failure;
    # only a non-blank value missing from the map is.
    raw_unmapped = raw_unmapped[raw_unmapped.astype(str).str.strip() != '']
    if not raw_unmapped.empty:
        unmapped_labels[label_col] = raw_unmapped.unique().tolist()

if unmapped_labels:
    print("WARNING: Some labels failed to map!")
    # Print the specific bad values to debug
    for label_col, bad_values in unmapped_labels.items():
        print(f"not mapped values in '{label_col}':", bad_values)
else:
    print("SUCCESS: All labels mapped correctly.")


# 6. Save the Final Dataset
# One JSON object per line (JSON Lines), one line per row.
output_file = "QEvasion_cleaned.jsonl"
df.to_json(output_file, orient='records', lines=True)

print("-" * 30)
print(f"Dataset saved to: {output_file}")

SUCCESS: All labels mapped correctly.
------------------------------
Dataset saved to: QEvasion_cleaned.jsonl


In [13]:
# Distribution of the integer-encoded labels. These are the '*_id' columns, in
# which -1 marks rows that received no id from the label maps.

print("\n--- Label Distribution (Task 1) ---")
# Counts for the encoded 'clarity_id'
clarity_counts = df['clarity_id'].value_counts()
print(clarity_counts)

print("\n--- Label Distribution (Task 2) ---")
# Counts for the encoded 'evasion_id'
evasion_counts = df['evasion_id'].value_counts()
print(evasion_counts)

# Per-annotator id counts, keyed by column name. A single loop replaces three
# copy-pasted stanzas, one of which stored the annotator3 counts in a variable
# named after annotator2.
annotator_counts = {}
for annotator_number in (1, 2, 3):
    id_col = f'annotator{annotator_number}_id'
    annotator_counts[id_col] = df[id_col].value_counts()
    print(f"\n--- Annotator{annotator_number} Distribution ---")
    print(annotator_counts[id_col])


--- Label Distribution (Task 1) ---
clarity_id
1    2246
0    1131
2     379
Name: count, dtype: int64

--- Label Distribution (Task 2) ---
evasion_id
 5    1052
 4     706
 7     488
 6     386
 3     381
-1     308
 2     145
 0     119
 1      92
 8      79
Name: count, dtype: int64

--- Annoter1 Distribution ---
annotator1_id
-1    3448
 5     106
 4      58
 7      54
 3      30
 6      29
 2      10
 0       9
 8       8
 1       4
Name: count, dtype: int64

--- Annoter2 Distribution ---
annotator2_id
-1    3448
 6      78
 4      72
 7      54
 5      53
 3      22
 0      11
 2       9
 8       5
 1       4
Name: count, dtype: int64

--- Annotator3 Distribution ---
annotator3_id
-1    3448
 5      80
 7      67
 6      65
 4      43
 3      23
 2      14
 0       7
 8       5
 1       4
Name: count, dtype: int64


In [14]:
# Final column listing, including the columns added by the cells above.
print("Columns found:", list(df.columns))

Columns found: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label', 'split_type', 'cleaned_answer', 'clarity_id', 'evasion_id', 'annotator1_id', 'annotator2_id', 'annotator3_id']
