# Exploratory Data Analysis
This notebook explores the provided dataset to guide feature engineering.

In [75]:
import re
import warnings
import numpy as np
import pandas as pd
from collections import Counter
# NOTE(review): no category or module is given, so this suppresses every
# warning raised after this cell runs (including pandas/numpy deprecation and
# runtime warnings). Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

### Fetching training data from local store

In [52]:
# Read the training pairs from the CSV stored under the local data/ folder.
train_data_path = "data/train_df.csv"
train_df = pd.read_csv(filepath_or_buffer=train_data_path)

# Preview the first five rows. The "Unnamed: 0" column in the output is
# presumably an index that was written into the CSV -- TODO confirm.
print(train_df.head(5))

   Unnamed: 0                                             file_1  \
0           0  The VIRSA (Visible Infrared Survey Telescope A...   
1           1  China\nThe goal of this project involves achie...   
2           2  Scientists can learn about how galaxies form a...   
3           3  China\nThe study suggests that multiple star s...   
4           4  Dinosaur Rex was excited about his new toy set...   

                                              file_2  real_file_label  
0  The China relay network has released a signifi...                2  
1  The project aims to achieve an accuracy level ...                1  
2  Dinosaur eggshells offer clues about what dino...                2  
3  The importance for understanding how stars evo...                2  
4  Analyzing how fast stars rotate within a galax...                1  


### Identifying missing values

I have implemented two options: the first replaces the NA values with an empty string, and the second drops the observation altogether.

Importantly, the output below shows that two rows contain NAs, and in both rows it is the fake text that is missing. Since the sample size is already small, injecting empty strings may influence the performance of the algorithms more than removing the samples altogether.

In [53]:
# Work on a copy so the raw training frame stays available for re-runs.
clean_df = train_df.copy()
print(f"Number of samples before data cleaning: {len(clean_df)}")

# Method 1 (disabled): replacing NaNs with empty strings.
# The loop only reports the rows with a missing text; uncomment the
# assignments to fill the gaps instead of dropping the rows below.
for idx, row in clean_df.iterrows():
    if pd.isna(row["file_1"]):
        print(row)
        # clean_df.at[idx, "file_1"] = ""
    elif pd.isna(row["file_2"]):
        print(row)
        # clean_df.at[idx, "file_2"] = ""

# Method 2: removing NA rows altogether.
# Reassignment instead of inplace=True keeps the cell easy to chain and re-run.
clean_df = clean_df.dropna(subset=["file_1", "file_2"])
# This count is taken after the drop, so it must be labelled "after".
print(f"Number of samples after data cleaning: {len(clean_df)}")

Number of samples before data cleaning: 94
Unnamed: 0                                                        10
file_1             To determine how old stars are within R136's c...
file_2                                                           NaN
real_file_label                                                    1
Name: 10, dtype: object
Unnamed: 0                                                        14
file_1                                                           NaN
file_2             The design phase for CTAs (the Cherenkov Teles...
real_file_label                                                    2
Name: 14, dtype: object
Number of samples before data cleaning: 92


### Extract Basic Data Functions 

In [79]:
# Lower-case domain vocabulary matched as substrings of the lower-cased text.
_SCIENCE_TERMS = (
    'telescope', 'survey', 'observation', 'stellar', 'galaxy', 'star',
    'astronomical', 'magnitude', 'photometric', 'spectroscopic',
    'wavelength', 'redshift', 'luminosity', 'parsec', 'light-year',
)

# Upper-case abbreviations matched case-sensitively as substrings.
_ABBREVIATIONS = ('ESO', 'NASA', 'VLT', 'HST', 'ALMA', 'VISTA', 'VIRSA', 'VMC', 'VVV')


def _smoothed_ratio(numerator, denominator):
    """Return numerator / (denominator + 1), or numerator itself when denominator is 0."""
    return numerator / (denominator + 1) if denominator > 0 else numerator


def _text_stats(text):
    """Compute the surface statistics of one text that both sides of a pair share."""
    tokens = text.split()
    lowered = text.lower()
    n_words = len(tokens)

    # An empty text gets a single zero length so the variance guard below yields 0.
    word_lengths = [len(tok) for tok in tokens] if n_words > 0 else [0]
    digit_runs = re.findall(r'\d+', text)
    word_freq = Counter(lowered.split())

    return {
        'chars': len(text),
        'words': n_words,
        'punct': len(re.findall(r'[.!?,:;]', text)),
        'punct_types': len(set(re.findall(r'[.!?,:;()]', text))),
        'sents': len([s for s in re.split(r'[.!?]+', text) if s.strip()]),
        'word_var': np.var(word_lengths) if len(word_lengths) > 1 else 0,
        # Capitalised tokens, skipping only the very first token of the text.
        'proper': sum(1 for i, tok in enumerate(tokens) if i > 0 and tok[0].isupper()),
        'numbers': len(digit_runs),
        # Require digits on both sides of the dot so that an integer followed
        # by a sentence-ending period ("in 2020.") is not counted as a decimal.
        'precise_nums': len(re.findall(r'\d+\.\d+', text)),
        'large_nums': sum(1 for run in digit_runs if len(run) >= 3),
        'science_count': sum(1 for term in _SCIENCE_TERMS if term in lowered),
        'abbrev_count': sum(1 for abbrev in _ABBREVIATIONS if abbrev in text),
        # Total occurrences of every token that appears more than once.
        'repetition_score': sum(count for count in word_freq.values() if count > 1),
        'unique_ratio': len(word_freq) / (n_words + 1),
    }


def extract_basic_features(df):
    """Build per-row surface features comparing the real and the fake text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``file_1``, ``file_2`` and ``real_file_label``.
        A label of 1 marks ``file_1`` as the real text; any other value marks
        ``file_2`` as the real text. Both texts must be strings (calling this
        on rows with missing texts raises, so drop or fill them first).

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh default index.
        Columns hold the real/fake values of each statistic plus selected
        differences, ratios and densities.

    Notes
    -----
    As a side effect, the columns ``real_text`` and ``fake_text`` are added to
    (or overwritten in) ``df`` itself.
    """
    # Create real and fake text columns (vectorised; also works on an empty frame).
    first_is_real = df['real_file_label'] == 1
    df['real_text'] = df['file_1'].where(first_is_real, df['file_2'])
    df['fake_text'] = df['file_2'].where(first_is_real, df['file_1'])

    features = []

    for real_text, fake_text in zip(df['real_text'], df['fake_text']):
        real = _text_stats(real_text)
        fake = _text_stats(fake_text)

        # Densities use words + 1 in the denominator to avoid division by zero.
        real_punct_density = real['punct'] / (real['words'] + 1)
        fake_punct_density = fake['punct'] / (fake['words'] + 1)

        features.append({
            # Length differences
            'char_real': real['chars'],
            'char_fake': fake['chars'],
            'word_real': real['words'],
            'word_fake': fake['words'],
            'char_diff': real['chars'] - fake['chars'],
            'word_diff': real['words'] - fake['words'],
            'char_ratio': _smoothed_ratio(real['chars'], fake['chars']),
            'word_ratio': _smoothed_ratio(real['words'], fake['words']),

            # Punctuation differences
            'punct_real': real['punct'],
            'punct_fake': fake['punct'],
            'punct_diff': real['punct'] - fake['punct'],
            'real_punct_types': real['punct_types'],
            'fake_punct_types': fake['punct_types'],
            'punct_density_real': real_punct_density,
            'punct_density_fake': fake_punct_density,
            'punct_density_diff': real_punct_density - fake_punct_density,

            # Sentence differences
            'real_sent': real['sents'],
            'fake_sent': fake['sents'],
            'sent_diff': real['sents'] - fake['sents'],
            'sent_ratio': _smoothed_ratio(real['sents'], fake['sents']),

            # Variance differences
            'word_var_real': real['word_var'],
            'word_var_fake': fake['word_var'],
            'word_var_diff': real['word_var'] - fake['word_var'],

            # Content differences
            'real_proper': real['proper'],
            'fake_proper': fake['proper'],
            'proper_diff': real['proper'] - fake['proper'],
            'proper_density_real': real['proper'] / (real['words'] + 1),
            'proper_density_fake': fake['proper'] / (fake['words'] + 1),

            # Numbers
            'real_numbers': real['numbers'],
            'fake_numbers': fake['numbers'],
            'numbers_diff': real['numbers'] - fake['numbers'],
            'real_precise_nums': real['precise_nums'],
            'fake_precise_nums': fake['precise_nums'],
            'real_large_nums': real['large_nums'],
            'fake_large_nums': fake['large_nums'],

            # Science terms
            'real_science_count': real['science_count'],
            'fake_science_count': fake['science_count'],

            # Abbreviations
            'real_abbrev_count': real['abbrev_count'],
            'fake_abbrev_count': fake['abbrev_count'],

            # Repetition patterns
            'real_repetition_score': real['repetition_score'],
            'fake_repetition_score': fake['repetition_score'],
            'real_unique_ratio': real['unique_ratio'],
            'fake_unique_ratio': fake['unique_ratio'],
        })

    return pd.DataFrame(features)

In [82]:
# Engineer the real-vs-fake feature table from the cleaned training rows.
features_df = extract_basic_features(clean_df)

# Shift the 1/2 file labels down to 0/1 and keep them as a plain array.
labels = clean_df['real_file_label'].sub(1).values

# Summary statistics for every engineered feature.
print(features_df.describe())

         char_real     char_fake    word_real    word_fake     char_diff  \
count    92.000000     92.000000    92.000000    92.000000     92.000000   
mean   2321.728261   2622.391304   312.608696   340.858696   -300.663043   
std    2226.269161   4519.078181   229.454679   460.685719   5162.392454   
min      69.000000    285.000000     9.000000    32.000000 -38648.000000   
25%    1264.750000   1132.250000   184.250000   170.750000   -643.750000   
50%    1631.000000   1442.000000   242.000000   218.500000     75.500000   
75%    2158.750000   2091.750000   333.000000   320.250000    688.750000   
max    9561.000000  40316.000000  1008.000000  4158.000000   8521.000000   

         word_diff  char_ratio  word_ratio  punct_real  punct_fake  ...  \
count    92.000000   92.000000   92.000000   92.000000   92.000000  ...   
mean    -28.250000    1.573956    1.414684   37.771739   42.880435  ...   
std     522.505284    1.740542    1.309458   42.134297   74.898296  ...   
min   -3902.000

In [87]:
def analyze_feature_distributions(features_df):
    """Print mean, std, min and max for every column of ``features_df``.

    Columns whose name contains ``'diff'`` additionally get a breakdown of how
    many rows are positive (real > fake), negative (real < fake) or zero.
    Nothing is returned; the report goes to standard output.
    """
    for name, column in features_df.items():
        print(f"{name}:")

        # Label text carries its own padding so the numbers line up.
        summary = (
            ("Mean: ", column.mean()),
            ("Std:  ", column.std()),
            ("Min:  ", column.min()),
            ("Max:  ", column.max()),
        )
        for label, stat in summary:
            print(f"  {label}{stat:.4f}")

        # For difference features, show how often real > fake
        if 'diff' in name:
            n_rows = len(column)
            sign_masks = (
                ("Real > Fake", column > 0),
                ("Real < Fake", column < 0),
                ("Real = Fake", column == 0),
            )
            for caption, mask in sign_masks:
                hits = mask.sum()
                print(f"  {caption}: {hits}/{n_rows} ({100*hits/n_rows:.1f}%)")

        print()

# Print the per-feature summary for the engineered feature table.
analyze_feature_distributions(features_df)

char_real:
  Mean: 2321.7283
  Std:  2226.2692
  Min:  69.0000
  Max:  9561.0000

char_fake:
  Mean: 2622.3913
  Std:  4519.0782
  Min:  285.0000
  Max:  40316.0000

word_real:
  Mean: 312.6087
  Std:  229.4547
  Min:  9.0000
  Max:  1008.0000

word_fake:
  Mean: 340.8587
  Std:  460.6857
  Min:  32.0000
  Max:  4158.0000

char_diff:
  Mean: -300.6630
  Std:  5162.3925
  Min:  -38648.0000
  Max:  8521.0000
  Real > Fake: 55/92 (59.8%)
  Real < Fake: 37/92 (40.2%)
  Real = Fake: 0/92 (0.0%)

word_diff:
  Mean: -28.2500
  Std:  522.5053
  Min:  -3902.0000
  Max:  826.0000
  Real > Fake: 51/92 (55.4%)
  Real < Fake: 40/92 (43.5%)
  Real = Fake: 1/92 (1.1%)

char_ratio:
  Mean: 1.5740
  Std:  1.7405
  Min:  0.0414
  Max:  9.1844

word_ratio:
  Mean: 1.4147
  Std:  1.3095
  Min:  0.0476
  Max:  7.0909

punct_real:
  Mean: 37.7717
  Std:  42.1343
  Min:  1.0000
  Max:  178.0000

punct_fake:
  Mean: 42.8804
  Std:  74.8983
  Min:  0.0000
  Max:  633.0000

punct_diff:
  Mean: -5.1087
  Std:  8

In [88]:
def compare_real_vs_fake_features(features_df):
    """Print the mean of each real/fake feature pair and their difference.

    Pairs for which either column is absent from ``features_df`` are skipped
    silently. Nothing is returned; the report goes to standard output.
    """
    # Each entry is (real column, fake column).
    paired_columns = (
        ('char_real', 'char_fake'),
        ('word_real', 'word_fake'),
        ('punct_real', 'punct_fake'),
        ('punct_density_real', 'punct_density_fake'),
        ('real_sent', 'fake_sent'),
        ('word_var_real', 'word_var_fake'),
        ('real_proper', 'fake_proper'),
        ('proper_density_real', 'proper_density_fake'),
        ('real_punct_types', 'fake_punct_types'),
        ('real_numbers', 'fake_numbers'),
        ('real_precise_nums', 'fake_precise_nums'),
        ('real_large_nums', 'fake_large_nums'),
        ('real_science_count', 'fake_science_count'),
        ('real_abbrev_count', 'fake_abbrev_count'),
        ('real_repetition_score', 'fake_repetition_score'),
        ('real_unique_ratio', 'fake_unique_ratio'),
    )

    available = features_df.columns
    for real_col, fake_col in paired_columns:
        # Guard clause: only report pairs that are fully present.
        if real_col not in available or fake_col not in available:
            continue

        real_mean = features_df[real_col].mean()
        fake_mean = features_df[fake_col].mean()

        print(f"{real_col} vs {fake_col}:")
        print(f"  Real mean: {real_mean:.4f}")
        print(f"  Fake mean: {fake_mean:.4f}")
        print(f"  Difference: {real_mean - fake_mean:.4f}")
        print()

# Print the real-vs-fake mean comparison for the engineered feature table.
compare_real_vs_fake_features(features_df)

char_real vs char_fake:
  Real mean: 2321.7283
  Fake mean: 2622.3913
  Difference: -300.6630

word_real vs word_fake:
  Real mean: 312.6087
  Fake mean: 340.8587
  Difference: -28.2500

punct_real vs punct_fake:
  Real mean: 37.7717
  Fake mean: 42.8804
  Difference: -5.1087

punct_density_real vs punct_density_fake:
  Real mean: 0.1061
  Fake mean: 0.1035
  Difference: 0.0027

real_sent vs fake_sent:
  Real mean: 17.2391
  Fake mean: 20.1413
  Difference: -2.9022

word_var_real vs word_var_fake:
  Real mean: 12.3761
  Fake mean: 12.4253
  Difference: -0.0492

real_proper vs fake_proper:
  Real mean: 37.5326
  Fake mean: 44.6413
  Difference: -7.1087

proper_density_real vs proper_density_fake:
  Real mean: 0.1198
  Fake mean: 0.1219
  Difference: -0.0020

real_punct_types vs fake_punct_types:
  Real mean: 4.8261
  Fake mean: 4.7500
  Difference: 0.0761

real_numbers vs fake_numbers:
  Real mean: 7.8804
  Fake mean: 7.6630
  Difference: 0.2174

real_precise_nums vs fake_precise_nums:
