# Exploratory Data Analysis
This notebook explores the provided dataset in order to guide feature engineering.

In [None]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

### Fetching training data from local store

In [43]:
# Location of the training CSV, relative to the directory the notebook is run from.
train_data_path = "data/train_df.csv"
# Each row carries two text columns (file_1, file_2) and real_file_label.
train_df = pd.read_csv(train_data_path)
# Preview the first rows to sanity-check the columns that were loaded.
print(train_df.head())

   Unnamed: 0                                             file_1  \
0           0  The VIRSA (Visible Infrared Survey Telescope A...   
1           1  China\nThe goal of this project involves achie...   
2           2  Scientists can learn about how galaxies form a...   
3           3  China\nThe study suggests that multiple star s...   
4           4  Dinosaur Rex was excited about his new toy set...   

                                              file_2  real_file_label  
0  The China relay network has released a signifi...                2  
1  The project aims to achieve an accuracy level ...                1  
2  Dinosaur eggshells offer clues about what dino...                2  
3  The importance for understanding how stars evo...                2  
4  Analyzing how fast stars rotate within a galax...                1  


### Identifying missing values

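The code below removes every row in which either text (`file_1` or `file_2`) is missing. Replacing the missing values with empty strings is the alternative approach; it is not the one applied here.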

In [45]:
# Rows that lack either text are removed outright; `train_df` itself is left untouched
# because dropna (without inplace) returns a new frame, which is then copied.
#
# Alternative that was considered but is not active: keep those rows and substitute an
# empty string for whichever of file_1 / file_2 is missing.
clean_df = train_df.dropna(subset=["file_1", "file_2"]).copy()

### Basic feature extraction function

In [32]:
def extract_basic_features(df):
    """Compute paired real-vs-fake surface statistics for every row of ``df``.

    ``df`` must provide the columns ``file_1``, ``file_2`` and ``real_file_label``.
    A label equal to 1 marks ``file_1`` as the real text; any other value marks
    ``file_2`` as the real text.

    Side effect: the columns ``real_text`` and ``fake_text`` are written onto ``df``.

    Returns a new DataFrame with one row per input row (default integer index)
    holding length, punctuation, sentence, word-length-variance, capitalised-word
    and number counts for both texts, together with their differences and ratios.
    """

    def pick_real(row):
        # Label 1 -> file_1 is real, otherwise file_2.
        if row['real_file_label'] == 1:
            return row['file_1']
        return row['file_2']

    def pick_fake(row):
        # Mirror image of pick_real.
        if row['real_file_label'] == 1:
            return row['file_2']
        return row['file_1']

    def measure(text):
        # Raw counts for a single text; whitespace-separated tokens count as words.
        tokens = text.split()
        token_lengths = [len(tok) for tok in tokens] if tokens else [0]
        return {
            'chars': len(text),
            'words': len(tokens),
            'punct': len(re.findall(r'[.!?,:;]', text)),
            # Non-blank fragments between runs of sentence terminators.
            'sents': sum(1 for part in re.split(r'[.!?]+', text) if part.strip()),
            # Variance is only taken when there are at least two tokens.
            'var': np.var(token_lengths) if len(token_lengths) > 1 else 0,
            # Capitalised tokens, skipping the very first token of the text.
            'proper': sum(
                1 for pos, tok in enumerate(tokens)
                if pos > 0 and tok and tok[0].isupper()
            ),
            'numbers': len(re.findall(r'\d+', text)),
        }

    def ratio(numer, denom):
        # Smoothed ratio; falls back to the bare numerator when the denominator is 0.
        if denom > 0:
            return numer / (denom + 1)
        return numer

    df['real_text'] = df.apply(pick_real, axis=1)
    df['fake_text'] = df.apply(pick_fake, axis=1)

    records = []
    for real_text, fake_text in zip(df['real_text'], df['fake_text']):
        real = measure(real_text)
        fake = measure(fake_text)

        real_punct_density = real['punct'] / (real['words'] + 1)
        fake_punct_density = fake['punct'] / (fake['words'] + 1)

        records.append({
            # Length
            'char_real': real['chars'],
            'char_fake': fake['chars'],
            'word_real': real['words'],
            'word_fake': fake['words'],
            'char_diff': real['chars'] - fake['chars'],
            'word_diff': real['words'] - fake['words'],
            'char_ratio': ratio(real['chars'], fake['chars']),
            'word_ratio': ratio(real['words'], fake['words']),

            # Punctuation
            'punct_real': real['punct'],
            'punct_fake': fake['punct'],
            'punct_diff': real['punct'] - fake['punct'],
            'punct_density_real': real_punct_density,
            'punct_density_fake': fake_punct_density,
            'punct_density_diff': real_punct_density - fake_punct_density,

            # Sentences
            'real_sent': real['sents'],
            'fake_sent': fake['sents'],
            'sent_diff': real['sents'] - fake['sents'],
            'sent_ratio': ratio(real['sents'], fake['sents']),

            # Word-length variance
            'word_var_real': real['var'],
            'word_var_fake': fake['var'],
            'word_var_diff': real['var'] - fake['var'],

            # Content
            'real_proper': real['proper'],
            'fake_proper': fake['proper'],
            'proper_diff': real['proper'] - fake['proper'],
            'proper_density_real': real['proper'] / (real['words'] + 1),
            'proper_density_fake': fake['proper'] / (fake['words'] + 1),
            'real_numbers': real['numbers'],
            'fake_numbers': fake['numbers'],
            'numbers_diff': real['numbers'] - fake['numbers'],
        })

    return pd.DataFrame(records)

In [33]:
# `df_fill` is not created by any earlier cell, so on a fresh kernel this cell failed with
# NameError. Build it from the cleaned frame instead. reset_index(drop=True) returns a new
# frame, so the real_text / fake_text columns that extract_basic_features writes onto its
# argument do not leak into `clean_df`, and the row positions line up with the default
# 0..n-1 index of `features_df`.
df_fill = clean_df.reset_index(drop=True)
features_df = extract_basic_features(df_fill)
# Shift real_file_label down by one (the preview shows values 1 and 2), so that
# 0 means file_1 is the real text and 1 means file_2 is the real text.
labels = (df_fill['real_file_label'] - 1).values

In [34]:
# Preview the first rows of the engineered feature table.
print(features_df.head())

   char_real  char_fake  word_real  word_fake  char_diff  word_diff  \
0       2018       2196        296        304       -178         -8   
1       3124        936        454        137       2188        317   
2        801       1139        125        159       -338        -34   
3       1869       1774        262        263         95         -1   
4        195        871         34        123       -676        -89   

   char_ratio  word_ratio  punct_real  punct_fake  ...  word_var_fake  \
0    0.918525    0.970492          18          16  ...       8.116246   
1    3.334045    3.289855          17          11  ...       9.229687   
2    0.702632    0.781250           4           9  ...       7.398837   
3    1.052958    0.992424          11          16  ...       8.096720   
4    0.223624    0.274194           6           7  ...       9.365986   

   word_var_diff  real_proper  fake_proper  proper_diff  proper_density_real  \
0      -0.070604           20           38          -1

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
print(features_df.describe())

         char_real     char_fake    word_real    word_fake     char_diff  \
count    94.000000     94.000000    94.000000    94.000000     94.000000   
mean   2311.106383   2566.595745   311.319149   333.606383   -255.489362   
std    2203.397184   4486.382489   227.145456   458.380464   5115.865186   
min      69.000000      0.000000     9.000000     0.000000 -38648.000000   
25%    1274.000000   1118.000000   186.250000   167.000000   -610.000000   
50%    1655.000000   1437.500000   245.000000   216.500000     83.000000   
75%    2122.750000   2072.750000   327.000000   314.750000    735.250000   
max    9561.000000  40316.000000  1008.000000  4158.000000   8521.000000   

         word_diff   char_ratio  word_ratio  punct_real  punct_fake  ...  \
count    94.000000    94.000000   94.000000   94.000000   94.000000  ...   
mean    -22.287234    40.317063    6.746286   37.287234   41.968085  ...   
std     518.453648   264.231546   36.385037   41.809507   74.349285  ...   
min   -3902

In [None]:
def analyze_feature_distributions(features_df, labels):
    """Print, for every feature column, its mean within each label group.

    Parameters
    ----------
    features_df : pandas.DataFrame
        One numeric column per feature, one row per sample.
    labels : array-like of length ``len(features_df)``
        Group label for each row, matched to rows by position. In this notebook
        the labels are ``real_file_label - 1``, so 0 marks rows whose real text is
        ``file_1`` and 1 marks rows whose real text is ``file_2``.

    Raises
    ------
    ValueError
        If ``labels`` is not one-dimensional with one entry per row.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # np.asarray makes `== 0` elementwise for plain lists as well, and strips any
    # Series index so rows are matched by position rather than by index label.
    label_arr = np.asarray(labels)
    if label_arr.shape != (len(features_df),):
        raise ValueError(
            f"labels must have shape ({len(features_df)},), got {label_arr.shape}"
        )

    file_1_is_real = label_arr == 0
    file_2_is_real = label_arr == 1

    for feature in features_df.columns:
        file_1_mean = features_df.loc[file_1_is_real, feature].mean()
        file_2_mean = features_df.loc[file_2_is_real, feature].mean()
        diff = file_1_mean - file_2_mean

        # The groups are defined by which file is real, not by real-vs-fake text,
        # so the headings name the label group explicitly.
        print(f"{feature}:")
        print(f"  file_1 is real (label 0): {file_1_mean:.4f}")
        print(f"  file_2 is real (label 1): {file_2_mean:.4f}")
        print(f"  Difference: {diff:.4f}")
        print()

# Report the per-group mean of every engineered feature, grouped by `labels`.
analyze_feature_distributions(features_df, labels)

char_diff:
  Real (file_1): 820.3333
  Fake (file_2): -1243.4898
  Difference: 2063.8231

word_diff:
  Real (file_1): 94.4889
  Fake (file_2): -129.5306
  Difference: 224.0195

char_ratio:
  Real (file_1): 43.2404
  Fake (file_2): 37.6324
  Difference: 5.6080

word_ratio:
  Real (file_1): 7.4773
  Fake (file_2): 6.0750
  Difference: 1.4023

punct_diff:
  Real (file_1): 12.8667
  Fake (file_2): -20.7959
  Difference: 33.6626

punct_density_real:
  Real (file_1): 0.1061
  Fake (file_2): 0.1043
  Difference: 0.0018

punct_density_fake:
  Real (file_1): 0.0947
  Fake (file_2): 0.1073
  Difference: -0.0125

punct_density_diff:
  Real (file_1): 0.0114
  Fake (file_2): -0.0030
  Difference: 0.0144

sent_diff:
  Real (file_1): 5.1111
  Fake (file_2): -9.7755
  Difference: 14.8866

sent_ratio:
  Real (file_1): 1.9842
  Fake (file_2): 1.2083
  Difference: 0.7758

word_var_real:
  Real (file_1): 13.2514
  Fake (file_2): 11.4557
  Difference: 1.7958

word_var_fake:
  Real (file_1): 11.2089
  Fake 