# Random Forest Classifier for Baseline Performance

Efforts in another notebook reveal Random Forest has the best performance of the simple ML models. This notebook will establish the data extraction, training loop and test prediction scripts necessary to implement a random forest for this task. 

In [52]:
import re
import numpy as np
import pandas as pd
from datetime import datetime 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

## Importing Training Data

The following code will ingest data from a predefined merged dataset

In [14]:
training_data = "../data/train_df.csv"
train_df = pd.read_csv(training_data, dtype={'file_1': str, 'file_2': str, 'real_file_label': int})
train_df.rename(columns={"Unnamed: 0": "index"}, inplace=True) # Mistake from origin df
train_df.head()

Unnamed: 0,index,file_1,file_2,real_file_label
0,0,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,2
1,1,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,1
2,2,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,2
3,3,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,2
4,4,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,1


#### Remove NA Rows

In [18]:
clean_df = train_df.copy()
print(f"Number of samples before data cleaning: {len(clean_df)}")

# Removing NA rows altogether 
clean_df.dropna(subset=["file_1", "file_2"], inplace=True)
print(f"Number of samples before data cleaning: {len(clean_df)}")

Number of samples before data cleaning: 94
Number of samples before data cleaning: 92


## Baseline Feature Extraction 

The following code snippit will extract basic text related features from each of the texts

In [19]:
# Creating real_text and fake_text columns
clean_df['real_text'] = clean_df.apply(
    lambda row: row['file_1'] if row['real_file_label'] == 1 else row['file_2'], 
    axis=1
)
clean_df['fake_text'] = clean_df.apply(
    lambda row: row['file_2'] if row['real_file_label'] == 1 else row['file_1'], 
    axis=1
)

# Iterating through rows and generating features
features = []
for idx, row in clean_df.iterrows():
    real_text = str(row['real_text'])
    fake_text = str(row['fake_text'])
    
    # Basic length features
    real_chars = len(real_text)
    fake_chars = len(fake_text)
    real_words = len(real_text.split())
    fake_words = len(fake_text.split())
    
    # Basic punctuation features
    real_punct = len(re.findall(r'[.!?,:;]', real_text))
    fake_punct = len(re.findall(r'[.!?,:;]', fake_text))
    
    # Basic sentence features
    real_sents = len([s for s in re.split(r'[.!?]+', real_text) if s.strip()])
    fake_sents = len([s for s in re.split(r'[.!?]+', fake_text) if s.strip()])
    
    # Word variance
    real_word_lengths = [len(w) for w in real_text.split()] if real_words > 0 else [0]
    fake_word_lengths = [len(w) for w in fake_text.split()] if fake_words > 0 else [0]
    real_word_var = np.var(real_word_lengths) if len(real_word_lengths) > 1 else 0
    fake_word_var = np.var(fake_word_lengths) if len(fake_word_lengths) > 1 else 0
    
    # Proper nouns
    real_proper = len([w for i, w in enumerate(real_text.split()) 
                        if w and w[0].isupper() and i > 0])
    fake_proper = len([w for i, w in enumerate(fake_text.split()) 
                        if w and w[0].isupper() and i > 0])
    
    # Numbers
    real_numbers = len(re.findall(r'\d+', real_text))
    fake_numbers = len(re.findall(r'\d+', fake_text))
    
    # Select most promising features for baseline
    feature_row = {
        # Length ratios
        'char_ratio': real_chars / (fake_chars + 1),
        'word_ratio': real_words / (fake_words + 1),
        
        # Difference features
        'char_diff': real_chars - fake_chars,
        'word_diff': real_words - fake_words,
        'punct_diff': real_punct - fake_punct,
        'sent_diff': real_sents - fake_sents,
        
        # Density features
        'punct_density_diff': (real_punct / (real_words + 1)) - (fake_punct / (fake_words + 1)),
        'proper_density_diff': (real_proper / (real_words + 1)) - (fake_proper / (fake_words + 1)),
        
        # Variance features
        'word_var_diff': real_word_var - fake_word_var,
        'word_var_ratio': real_word_var / (fake_word_var + 1e-6),
        
        # Content features
        'numbers_diff': real_numbers - fake_numbers,
        'proper_diff': real_proper - fake_proper,
    }
    
    features.append(feature_row)

feature_df = pd.DataFrame(features)
feature_df.head()

Unnamed: 0,char_ratio,word_ratio,char_diff,word_diff,punct_diff,sent_diff,punct_density_diff,proper_density_diff,word_var_diff,word_var_ratio,numbers_diff,proper_diff
0,0.918525,0.970492,-178,-8,2,1,0.008147,-0.05725,-0.070604,0.991301,0,-18
1,3.334045,3.289855,2188,317,6,3,-0.042348,-0.006116,-0.950364,0.897032,-5,11
2,0.702632,0.78125,-338,-34,-5,-1,-0.024504,0.028869,-0.531893,0.928111,1,3
3,1.052958,0.992424,95,-1,-5,-1,-0.018781,-0.071797,0.663026,1.081888,0,-19
4,0.223624,0.274194,-676,-89,-1,0,0.114977,0.090092,-2.715467,0.710071,-1,1


## Buidling the Random Forest Model 

In [42]:
test_size=0.2
random_state=42

X = feature_df
# Our labels for the real text
y = (clean_df['real_file_label'] - 1).values # [1,2] -> [0,1] for modeling 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled) 

print(f"Training accuracy score: {accuracy_score(y_test, rf_pred)}")
print(f"CV validation score: {cross_val_score(rf, X_train_scaled, y_train, cv=5).mean()}")

Training accuracy score: 0.631578947368421
CV validation score: 0.5771428571428572


## Importing Testing Data

In [None]:
test_data_file = "../data/test_df.csv"
test_df = pd.read_csv(test_data_file, dtype={'file_1': str, 'file_2': str})
test_df.rename(columns={"Unnamed: 0": "index"}, inplace=True) # Mistake from origin df
test_df.head()

Unnamed: 0,index,file_1,file_2
0,0,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,1,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,2,This research aimed to understand how star sha...,ChromeDriver music player\nThis study focused ...
3,3,Using OmegaCAM's wide field capabilities spann...,"greek translation :\nvazhi (megaCAM), territor..."
4,4,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...


## Extracting Testing Features

In [45]:
# Fill NaN values
test_df['file_1'] = test_df['file_1'].fillna('').astype(str)
test_df['file_2'] = test_df['file_2'].fillna('').astype(str)

features = []
for idx, row in test_df.iterrows():
    text1 = row['file_1']
    text2 = row['file_2']
    
    # Basic length features
    chars1 = len(text1)
    chars2 = len(text2)
    words1 = len(text1.split())
    words2 = len(text2.split())
    
    # Basic punctuation features
    punct1 = len(re.findall(r'[.!?,:;]', text1))
    punct2 = len(re.findall(r'[.!?,:;]', text2))
    
    # Basic sentence features
    sents1 = len([s for s in re.split(r'[.!?]+', text1) if s.strip()])
    sents2 = len([s for s in re.split(r'[.!?]+', text2) if s.strip()])
    
    # Word variance
    word_lengths1 = [len(w) for w in text1.split()] if words1 > 0 else [0]
    word_lengths2 = [len(w) for w in text2.split()] if words2 > 0 else [0]
    word_var1 = np.var(word_lengths1) if len(word_lengths1) > 1 else 0
    word_var2 = np.var(word_lengths2) if len(word_lengths2) > 1 else 0
    
    # Proper nouns
    proper1 = len([w for i, w in enumerate(text1.split()) 
                    if w and w[0].isupper() and i > 0])
    proper2 = len([w for i, w in enumerate(text2.split()) 
                    if w and w[0].isupper() and i > 0])
    
    # Numbers
    numbers1 = len(re.findall(r'\d+', text1))
    numbers2 = len(re.findall(r'\d+', text2))
    
    # Create features assuming file_1 is "real" and file_2 is "fake"
    # The model will predict if this assumption is correct
    feature_row = {
        # Length ratios (file_1 / file_2)
        'char_ratio': chars1 / (chars2 + 1),
        'word_ratio': words1 / (words2 + 1),
        
        # Difference features (file_1 - file_2)
        'char_diff': chars1 - chars2,
        'word_diff': words1 - words2,
        'punct_diff': punct1 - punct2,
        'sent_diff': sents1 - sents2,
        
        # Density features
        'punct_density_diff': (punct1 / (words1 + 1)) - (punct2 / (words2 + 1)),
        'proper_density_diff': (proper1 / (words1 + 1)) - (proper2 / (words2 + 1)),
        
        # Variance features
        'word_var_diff': word_var1 - word_var2,
        'word_var_ratio': word_var1 / (word_var2 + 1e-6),
        
        # Content features
        'numbers_diff': numbers1 - numbers2,
        'proper_diff': proper1 - proper2,
    }
    
    features.append(feature_row)

test_features_df = pd.DataFrame(features)

## Generating Testing Predictions

In [60]:
test_features_df = test_features_df[feature_df.columns]

# Scale features using fitted scaler
test_features_scaled = scaler.transform(test_features_df)

# Generate predictions
predictions = rf.predict(test_features_scaled)
prediction_probabilities = rf.predict_proba(test_features_scaled)

final_predictions = pd.DataFrame(predictions + 1)  # Convert [0,1] back to [1,2]
final_predictions = final_predictions.rename(columns={'index': 'id', 0: 'real_text_id'})

timestamp_str = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
final_predictions.to_csv(f"../predictions/predictions_rf_{timestamp_str}.csv")