# Advanced Feature Engineering

This notebook investigates whether more thorough feature engineering can improve on the baseline performance (64%).

In [108]:
import os
import random
import re
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Import Training Data

In [109]:
# Relative path to the training CSV.
training_data = "../data/train_long_df.csv"

# Column types are pinned explicitly instead of being inferred by pandas.
train_df = pd.read_csv(
    training_data,
    dtype={'folder': int, 'text': str, 'is_real': int},
)

# Preview the first rows as the cell output.
train_df.head()

Unnamed: 0,folder,text,is_real
0,0,The VIRSA (Visible Infrared Survey Telescope A...,1
1,0,The China relay network has released a signifi...,0
2,1,China\nThe goal of this project involves achie...,0
3,1,The project aims to achieve an accuracy level ...,1
4,2,Scientists can learn about how galaxies form a...,1


### Remove NA Rows

In [110]:
# Drop every row whose `text` is missing and report the row count on both
# sides of the operation. `train_df` itself is left untouched, so re-running
# this cell always starts from the same input.
print(f"Number of samples before data cleaning: {len(train_df)}")

# Non-mutating dropna (no inplace=True); the trailing copy() keeps clean_df
# fully independent of train_df.
clean_df = train_df.dropna(subset=["text"]).copy()
print(f"Number of samples after data cleaning: {len(clean_df)}")

Number of samples before data cleaning: 190
Number of samples before data cleaning: 188


### Feature Extraction

In [111]:
# Keyword lists used by the substring-count features below.
science_terms = ['telescope', 'survey', 'observation', 'stellar', 'galaxy', 'star', 
        'astronomical', 'magnitude', 'photometric', 'spectroscopic', 
        'wavelength', 'redshift', 'luminosity', 'parsec', 'light-year']
abbrevs = ['ESO', 'NASA', 'VLT', 'HST', 'ALMA', 'VISTA', 'VIRSA', 'VMC', 'VVV']


def extract_text_features(text):
    """Compute the hand-crafted surface features for a single text.

    Parameters
    ----------
    text : str
        The raw text of one sample.

    Returns
    -------
    dict
        One entry per feature, in the column order used by `feature_df`.
    """
    words = text.split()
    lowered = text.lower()
    word_counts = Counter(lowered.split())

    # Sentence features: split on runs of '.', '!' or '?'.
    fragments = re.split(r'[.!?]+', text)
    num_sentences = sum(1 for fragment in fragments if fragment.strip())
    # NOTE(review): the mean is taken over ALL fragments, including the empty
    # ones that num_sentences ignores (e.g. the piece after a final period),
    # so the average is pulled down. Kept as-is so that training and test
    # features stay identical; fix both extraction cells together.
    if num_sentences:
        avg_sent_length = np.mean([len(fragment.split()) for fragment in fragments])
    else:
        avg_sent_length = 0

    # Variance of word lengths; defined as 0 when there are fewer than two words.
    word_lengths = [len(word) for word in words] if words else [0]
    word_var = np.var(word_lengths) if len(word_lengths) > 1 else 0

    # Capitalised words, skipping the very first word of the text.
    proper_noun_count = sum(1 for word in words[1:] if word[0].isupper())

    # Digit runs and decimal-looking numbers.
    digit_runs = re.findall(r'\d+', text)
    # NOTE(review): the pattern also matches a number followed by a bare
    # period ("2015."), so sentence-final integers count as "precise".
    decimal_like = re.findall(r'\d+\.?\d*', text)

    return {
        # Length differences
        'num_char': len(text),
        'num_words': len(words),

        # Punctuation differences
        'punctuation_count': len(re.findall(r'[.!?,:;]', text)),
        'num_punct_types': len(set(re.findall(r'[.!?,:;()]', text))),

        # Sentence differences
        'num_sentences': num_sentences,
        'avg_sent_length': avg_sent_length,

        # Variance differences
        'word_var': word_var,

        # Content differences
        'proper_noun_count': proper_noun_count,
        # NOTE(review): this is count / (count + 1), which does not depend on
        # the text length; dividing by the word count was probably intended.
        'proper_noun_density': proper_noun_count / (proper_noun_count + 1),

        # Numbers
        'num_numbers': len(digit_runs),
        'num_precise_num': sum(1 for match in decimal_like if '.' in match),
        'num_large_num': sum(1 for run in digit_runs if len(run) >= 3),

        # Science terms (case-insensitive substring match, one hit per term)
        'num_science_terms': sum(1 for term in science_terms if term.lower() in lowered),

        # Abbreviations (case-sensitive substring match, one hit per abbreviation)
        'num_abbrev': sum(1 for abbrev in abbrevs if abbrev in text),

        # Repetition patterns
        'repetition_score': sum(count for count in word_counts.values() if count > 1),
        'unique_ratio': len(word_counts) / (len(words) + 1),
    }


# One feature dict per row of clean_df, in row order.
features = [extract_text_features(text) for text in clean_df['text']]

feature_df = pd.DataFrame(features)
feature_df.head()

Unnamed: 0,num_char,num_words,punctuation_count,num_punct_types,num_sentences,avg_sent_length,word_var,proper_noun_count,proper_noun_density,num_numbers,num_precise_num,num_large_num,num_science_terms,num_abbrev,repetition_score,unique_ratio
0,2196,304,16,4,9,29.8,8.116246,38,0.974359,0,0,0,6,5,102,0.8
1,2018,296,18,6,10,26.818182,8.045642,20,0.952381,0,0,0,5,0,103,0.794613
2,3124,454,17,7,9,45.5,8.279323,17,0.944444,1,0,0,2,0,233,0.696703
3,936,137,11,5,6,19.428571,9.229687,6,0.857143,6,1,3,3,0,36,0.847826
4,1139,159,9,5,4,31.8,7.398837,3,0.75,0,0,0,4,0,47,0.81875


### Building the ML Model

In [112]:
test_size = 0.2
random_state = 42

X = feature_df
# Labels from the `is_real` column; feature_df was built by iterating clean_df
# in row order, so positions line up.
y = clean_df['is_real'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)

# Scaler fitted on the training split only; later cells reuse it for the
# unlabeled test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {}
results = {}


def fit_and_evaluate(name, model):
    """Fit `model`, score it, and record it under `name`.

    The model is fitted on the scaled training split and scored on the scaled
    hold-out split. The cross-validation score is computed on the *unscaled*
    training split through a StandardScaler -> model pipeline, so the scaler
    is re-fitted inside every fold and no statistics from the validation fold
    leak into training. `cross_val_score` clones the pipeline, so the fitted
    `model` stored in `models` is not affected by the CV run.

    Parameters
    ----------
    name : str
        Key used in the `models` and `results` dictionaries.
    model : estimator
        Unfitted scikit-learn classifier.

    Returns
    -------
    numpy.ndarray
        Predictions of the fitted model on the hold-out split.
    """
    model.fit(X_train_scaled, y_train)
    holdout_pred = model.predict(X_test_scaled)

    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)

    models[name] = model
    results[name] = {
        'accuracy': accuracy_score(y_test, holdout_pred),
        'cv_score': cv_scores.mean()
    }
    return holdout_pred


# Logistic Regression
lr = LogisticRegression(random_state=random_state, max_iter=1000)
lr_pred = fit_and_evaluate('Logistic Regression', lr)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
rf_pred = fit_and_evaluate('Random Forest', rf)

# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=random_state)
gb_pred = fit_and_evaluate('Gradient Boosting', gb)

# Majority-vote ensemble over the three models above. VotingClassifier fits
# its own clones of the listed estimators.
estimators = [
    ('lr', models['Logistic Regression']),
    ('rf', models['Random Forest']),
    ('gb', models['Gradient Boosting'])
]

ensemble_model = VotingClassifier(estimators=estimators, voting='hard')
ensemble_pred = fit_and_evaluate('Ensemble (Majority Vote)', ensemble_model)

for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Cross-Validation Score (Mean): {metrics['cv_score']:.4f}\n")


Model: Logistic Regression
  Accuracy: 0.6579
  Cross-Validation Score (Mean): 0.7533

Model: Random Forest
  Accuracy: 0.6579
  Cross-Validation Score (Mean): 0.7000

Model: Gradient Boosting
  Accuracy: 0.6316
  Cross-Validation Score (Mean): 0.6200

Model: Ensemble (Majority Vote)
  Accuracy: 0.6053
  Cross-Validation Score (Mean): 0.7133



### Importing Test Data

In [71]:
# Relative path to the test CSV.
test_data = "../data/test_long_df.csv"

# Same dtype map as used for the training file.
# NOTE(review): the preview below shows no `is_real` column in this file, so
# that dtype entry presumably has no effect here — confirm.
test_df = pd.read_csv(
    test_data,
    dtype={'folder': int, 'text': str, 'is_real': int},
)

# Preview the first rows as the cell output.
test_df.head()

Unnamed: 0,folder,text
0,0,"""Music"" Music music music Music music Music mu..."
1,0,Since its launch on Paranal observatory's Very...
2,1,underground exploration on SN's birth has prov...
3,1,SN 1987A provides valuable insights as newer o...
4,2,This research aimed to understand how star sha...


### Extracting Test Features

In [93]:
# Missing texts become empty strings so every test row still yields a feature row.
test_df['text'] = test_df['text'].fillna('').astype(str)

# Keyword lists for the substring-count features (same values as for training).
science_terms = ['telescope', 'survey', 'observation', 'stellar', 'galaxy', 'star', 
        'astronomical', 'magnitude', 'photometric', 'spectroscopic', 
        'wavelength', 'redshift', 'luminosity', 'parsec', 'light-year']
abbrevs = ['ESO', 'NASA', 'VLT', 'HST', 'ALMA', 'VISTA', 'VIRSA', 'VMC', 'VVV']

# NOTE(review): this loop must produce exactly the same features as the
# training extraction; any change to a formula has to be made in both places.
test_features = []
for text in test_df['text']:
    tokens = text.split()
    text_lower = text.lower()
    token_counts = Counter(text_lower.split())
    sentence_chunks = re.split(r'[.!?]+', text)
    digit_runs = re.findall(r'\d+', text)
    decimal_like = re.findall(r'\d+\.?\d*', text)

    # Sentence features: only non-blank chunks count as sentences, but the
    # average is taken over every chunk (blank ones included).
    num_sentences = len([chunk for chunk in sentence_chunks if chunk.strip()])
    if num_sentences:
        avg_sent_length = np.mean([len(chunk.split()) for chunk in sentence_chunks])
    else:
        avg_sent_length = 0

    # Word-length variance; 0 when fewer than two words are present.
    token_lengths = [len(token) for token in tokens] if tokens else [0]
    word_var = np.var(token_lengths) if len(token_lengths) > 1 else 0

    # Capitalised words other than the first word of the text.
    proper_noun_count = len([token for token in tokens[1:] if token[0].isupper()])

    test_features.append({
        # Length differences
        'num_char': len(text),
        'num_words': len(tokens),

        # Punctuation differences
        'punctuation_count': len(re.findall(r'[.!?,:;]', text)),
        'num_punct_types': len(set(re.findall(r'[.!?,:;()]', text))),

        # Sentence differences
        'num_sentences': num_sentences,
        'avg_sent_length': avg_sent_length,

        # Variance differences
        'word_var': word_var,

        # Content differences
        'proper_noun_count': proper_noun_count,
        'proper_noun_density': proper_noun_count / (proper_noun_count + 1),

        # Numbers
        'num_numbers': len(digit_runs),
        'num_precise_num': len([match for match in decimal_like if '.' in match]),
        'num_large_num': len([run for run in digit_runs if len(run) >= 3]),

        # Science terms (case-insensitive substring match)
        'num_science_terms': sum(1 for term in science_terms if term.lower() in text_lower),

        # Abbreviations (case-sensitive substring match)
        'num_abbrev': sum(1 for abbrev in abbrevs if abbrev in text),

        # Repetition patterns
        'repetition_score': sum(count for count in token_counts.values() if count > 1),
        'unique_ratio': len(token_counts) / (len(tokens) + 1),
    })

test_features_df = pd.DataFrame(test_features)
test_features_df.head()

Unnamed: 0,num_char,num_words,punctuation_count,num_punct_types,num_sentences,avg_sent_length,word_var,proper_noun_count,proper_noun_density,num_numbers,num_precise_num,num_large_num,num_science_terms,num_abbrev,repetition_score,unique_ratio
0,1710,262,9,5,8,28.888889,7.890289,18,0.947368,0,0,0,2,0,116,0.726236
1,1249,173,10,5,4,33.8,10.602359,9,0.9,0,0,0,3,1,67,0.775862
2,1168,165,10,6,5,27.5,7.823104,17,0.944444,0,0,0,2,1,43,0.855422
3,1516,216,11,5,6,30.285714,8.031872,20,0.952381,5,0,2,2,1,78,0.78341
4,752,112,3,3,3,27.75,6.789461,2,0.666667,0,0,0,1,0,46,0.769912


### Generating Test Predictions

In [None]:
# Put the test columns in the same order as the training features.
test_features_df = test_features_df[feature_df.columns]

# Scale features using fitted scaler
test_features_scaled = scaler.transform(test_features_df)

# Hard-vote label for every individual text.
predictions = ensemble_model.predict(test_features_scaled)
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df.reset_index().rename(columns={'index': 'id', 0: 'real_text_id'})

# Mean probability of label 1 over the fitted base estimators. A hard-voting
# VotingClassifier has no predict_proba of its own, but each of its fitted
# members does. Column j of a member's predict_proba corresponds to
# ensemble_model.classes_[j].
real_col = list(ensemble_model.classes_).index(1)
real_proba = np.mean(
    [est.predict_proba(test_features_scaled)[:, real_col]
     for est in ensemble_model.estimators_],
    axis=0,
)

# Place text pair predictions on the same row. This relies on the two texts of
# a pair being adjacent rows in test_df.
if len(predictions) % 2 != 0:
    raise ValueError(
        f"Expected an even number of test rows (two texts per pair), got {len(predictions)}"
    )
real_text_ids = predictions_df['real_text_id']
new_list_array = real_text_ids.to_numpy().reshape(-1, 2)
paired_df = pd.DataFrame(new_list_array)
paired_proba = real_proba.reshape(-1, 2)

# Reduce each pair to the position (1 or 2) of the text judged to be real.
# When the two hard votes disagree, the text voted 1 wins. When they agree the
# vote carries no information, so the text with the higher mean probability
# wins; only an exact probability tie falls back to a random pick, drawn from
# a seeded generator so the output is reproducible.
tie_rng = random.Random(random_state)
diff_count = 0  # number of pairs whose two hard votes were identical
reduced_values = []
for (col0_val, col1_val), (col0_proba, col1_proba) in zip(new_list_array, paired_proba):
    if col0_val != col1_val:
        transformed_value = 1 if col0_val == 1 else 2
    else:
        diff_count += 1
        if col0_proba > col1_proba:
            transformed_value = 1
        elif col1_proba > col0_proba:
            transformed_value = 2
        else:
            transformed_value = tie_rng.choice([1, 2])

    reduced_values.append(transformed_value)

final_predictions_df = pd.DataFrame({'real_text_id': reduced_values})

# NOTE(review): the ':' characters in the timestamp are not valid in Windows
# file names; the format is kept unchanged for compatibility with existing files.
timestamp_str = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
os.makedirs("../predictions", exist_ok=True)
# Write the row number as an explicit `id` column (the previous rename of
# 'index' had no effect, leaving the column header blank).
final_predictions_df.to_csv(f"../predictions/predictions_em_{timestamp_str}.csv", index_label='id')
print(f"diff count: {diff_count}")

diff count: 558


In [61]:
# Swap the two labels (1 <-> 2) of a previously saved prediction file and
# write the result to a new timestamped CSV.
# NOTE(review): the original note here said "this is definitely wrong" without
# saying whether it meant the saved labels or the flip itself — confirm before
# relying on this output. The input file name is hard-coded to one specific run.
label_swap = {1: 2, 2: 1}
file = pd.read_csv("../predictions/predictions_em_2025-07-10_14:12:24.csv")
file = file.assign(real_text_id=file['real_text_id'].map(label_swap))

timestamp_str = f"{datetime.now():%Y-%m-%d_%H:%M:%S}"
flipped_path = f"../predictions/predictions_em_{timestamp_str}.csv"
file.to_csv(flipped_path, index=False)