# Feature Vector Data Preprocessing Pipeline
---
---

## Step-1: Install dependencies
---

In [None]:
# 0. Install / Import Dependencies
!pip install -q scikit-learn pandas matplotlib numpy seaborn
!pip install -q opendatasets  # For Kaggle dataset download

import numpy as np
import pandas as pd
import os
import glob
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("Dependencies installed successfully!")

## Step-2: Load Feature Vectors
---

### Download Dataset from Kaggle -->

In [None]:
# Dataset Selection - Choose which dataset to use
DATASET_CHOICE = 'ember'  # Change to 'bodmas' to use BODMAS dataset

# Kaggle dataset URLs
EMBER_URL = 'https://www.kaggle.com/datasets/dhoogla/ember-2018-v2-features'
BODMAS_URL = 'https://www.kaggle.com/datasets/dhoogla/bodmas'

# The later cells in this notebook treat every value other than 'ember' as
# BODMAS, so a typo such as 'Ember' would silently select the wrong dataset.
# Fail fast here instead.
SUPPORTED_DATASETS = ('ember', 'bodmas')
if DATASET_CHOICE not in SUPPORTED_DATASETS:
    raise ValueError(
        f"Unsupported DATASET_CHOICE {DATASET_CHOICE!r}; "
        f"expected one of {SUPPORTED_DATASETS}"
    )

print(f"Selected dataset: {DATASET_CHOICE.upper()}")

In [None]:
# Download datasets from Kaggle (requires Kaggle API credentials)
import os

import opendatasets as od

# SECURITY: credentials must never be written into the notebook. Supply them
# from outside instead: a kaggle.json file in the working directory, or
# KAGGLE_USERNAME / KAGGLE_KEY exported in the environment before the kernel
# starts. Any API key that was ever committed in a notebook should be treated
# as compromised and revoked in the Kaggle account settings.
# NOTE(review): how opendatasets falls back when neither is present (e.g. an
# interactive prompt) depends on the installed version - confirm.
_CRED_VARS = ('KAGGLE_USERNAME', 'KAGGLE_KEY')
if not all(os.environ.get(var) for var in _CRED_VARS) and not os.path.exists('kaggle.json'):
    print("No Kaggle credentials found in the environment or ./kaggle.json; "
          "the download may prompt for them or fail.")

# Resolve the URL and local directory once, up front, so data_path is defined
# whether or not the download succeeds (e.g. the data was fetched manually).
# data_path assumes the dataset lands in ./<dataset-slug>; adjust as needed.
if DATASET_CHOICE == 'ember':
    dataset_label = "EMBER 2018 v2 Features"
    dataset_url = EMBER_URL
    data_path = './ember-2018-v2-features'
else:
    dataset_label = "BODMAS"
    dataset_url = BODMAS_URL
    data_path = './bodmas'

try:
    print(f"Downloading {dataset_label} dataset...")
    od.download(dataset_url)
    print(f"Dataset downloaded to: {data_path}")
except Exception as e:
    # Deliberately best-effort: a failed download must not stop the notebook,
    # because the files may already be present at data_path.
    print(f"Error downloading dataset: {e}")
    print("Please manually download the dataset or set up Kaggle API credentials")

### Load Dataset -->

In [None]:
import os
import glob
import numpy as np
import pandas as pd

def load_ember_features_parquet(data_path):
    """Load EMBER 2018 v2 features from a directory of parquet files.

    Only one parquet file is read: the first in lexicographic order. Any
    further files in the directory are reported and ignored.

    Args:
        data_path: Directory that is searched (non-recursively) for
            ``*.parquet`` files.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds the numeric feature
        columns, ``y`` holds the label column.

    Raises:
        FileNotFoundError: If ``data_path`` contains no parquet file.
    """
    # glob returns files in filesystem-dependent order; sort so that the same
    # file is selected on every run and on every machine.
    parquet_files = sorted(glob.glob(os.path.join(data_path, '*.parquet')))

    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {data_path}")

    if len(parquet_files) > 1:
        # Make the otherwise silent "first file only" behaviour visible.
        print(f"Found {len(parquet_files)} parquet files; only the first is loaded. "
              f"Ignored: {parquet_files[1:]}")

    print(f"Loading features from: {parquet_files[0]}")

    # Load the features
    df = pd.read_parquet(parquet_files[0])

    # Use the first well-known label column name that is present
    label_candidates = ('label', 'target', 'y', 'class')
    label_col = next((col for col in label_candidates if col in df.columns), None)

    if label_col is None:
        # Assume last column is label
        label_col = df.columns[-1]
        print(f"No standard label column found, using: {label_col}")

    # Split off the label, then keep only the numeric feature columns
    X = df.drop(columns=[label_col]).select_dtypes(include=[np.number])
    y = df[label_col]

    return X.values, y.values


def load_bodmas_features_parquet(data_path):
    """Load BODMAS features from a directory of parquet files.

    Only one parquet file is read: the first in lexicographic order.

    Args:
        data_path: Directory that is searched (non-recursively) for
            ``*.parquet`` files.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds the numeric feature
        columns, ``y`` holds the ``label`` column, or the last column when
        no ``label`` column exists.

    Raises:
        FileNotFoundError: If ``data_path`` contains no parquet file.
    """
    # glob returns files in filesystem-dependent order; sort so that the same
    # file is selected on every run and on every machine.
    parquet_files = sorted(glob.glob(os.path.join(data_path, '*.parquet')))

    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {data_path}")

    print(f"Found parquet files: {parquet_files}")
    if len(parquet_files) > 1:
        # Make the otherwise silent "first file only" behaviour visible.
        print(f"Only the first file is loaded: {parquet_files[0]}")

    df = pd.read_parquet(parquet_files[0])

    if 'label' in df.columns:
        X = df.drop(columns=['label'])
        y = df['label']
    else:
        # No explicit label column: treat the last column as the label
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

    # Keep only the numeric feature columns
    X = X.select_dtypes(include=[np.number])

    return X.values, y.values


# Run the loader that matches the configured dataset and report what was read.
print(f"Loading {DATASET_CHOICE.upper()} dataset...")

# 'ember' selects the EMBER loader; any other value selects the BODMAS loader.
selected_loader = (
    load_ember_features_parquet
    if DATASET_CHOICE == 'ember'
    else load_bodmas_features_parquet
)
X_raw, y_raw = selected_loader(data_path)

# Summary of the raw arrays: overall shape, feature count, samples per label
print(f'Raw {DATASET_CHOICE.upper()} shape: X={X_raw.shape}, y={y_raw.shape}')
print(f'Feature dimensions: {X_raw.shape[1]}')
print(f'Label distribution: {np.unique(y_raw, return_counts=True)}')


## Step-2.5: Data Cleaning
---

In [None]:
# 2. Data Preprocessing and Cleaning

# Drop unlabeled samples BEFORE encoding. LabelEncoder only ever produces the
# codes 0..n_classes-1, so testing the encoded labels for -1 can never match
# and a raw -1 would be encoded as a class of its own.
# Assumes unlabeled samples carry the numeric label -1 - confirm for the dataset.
if np.issubdtype(np.asarray(y_raw).dtype, np.number) and np.any(y_raw == -1):
    labeled_mask = y_raw != -1
    # Filter features and labels together so they stay aligned and the cell
    # can be re-run safely.
    X_raw, y_raw = X_raw[labeled_mask], y_raw[labeled_mask]
    print(f"Filtered data shape: X={X_raw.shape}, y={y_raw.shape}")

# Handle missing and infinite values in one pass. Calling nan_to_num with only
# nan=0.0 would convert +/-inf into very large finite numbers, which would slip
# past a later isfinite check and distort standardization; zero them explicitly.
n_nan = int(np.isnan(X_raw).sum())
n_inf = int(np.isinf(X_raw).sum())
print(f"Missing values in features: {n_nan}")
if n_nan or n_inf:
    # nan_to_num returns a new array, so the loaded array is not modified in place
    X_raw = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0)
    if n_nan:
        print("Replaced NaN values with 0")
    if n_inf:
        print(f"Replaced {n_inf} infinite values with 0")

# Encode labels as consecutive integers (0 and 1 when exactly two classes remain)
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)

print(f"Label encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")
print(f"Encoded label distribution: {np.unique(y_encoded, return_counts=True)}")

## Step-3: Train/Test Split
---

In [None]:
# 3. Train/Test Split (stratified)
# Hold out 20% of the samples; stratifying on the encoded labels keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
split_parts = train_test_split(
    X_raw,
    y_encoded,
    test_size=0.20,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print('Train:', X_train.shape, 'Test:', X_test.shape)
# Per-class sample counts for each partition
for caption, labels in (('Train labels:', y_train), ('Test labels:', y_test)):
    print(caption, np.unique(labels, return_counts=True))

## Step-4: Standardization
---

In [None]:
# 4. Standardization (mean=0, std=1)
print("Standardizing features...")

# Learn the per-feature mean and scale from the training partition only, then
# apply that same transformation to both partitions so no test-set statistics
# leak into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Global mean/std over all entries as a quick sanity check
print(f"After standardization:")
print(f"Train mean: {X_train_std.mean():.6f}, std: {X_train_std.std():.6f}")
print(f"Test mean: {X_test_std.mean():.6f}, std: {X_test_std.std():.6f}")

## Step-5: PCA30 Transformation of Feature Vectors
---

In [None]:
# 5. PCA → 30 Principal Components
from sklearn.decomposition import PCA

print("Applying PCA (30 components)...")

# Project the standardized features onto 30 principal components. The
# projection is fitted on the training partition and reused unchanged for the
# test partition.
pca_settings = {'n_components': 30, 'random_state': 42, 'svd_solver': 'full'}
pca = PCA(**pca_settings)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Report the reduced shape and how much variance the kept components retain
print('After PCA:', X_train_pca.shape)
print(f'Variance explained by 30 components: {pca.explained_variance_ratio_.sum()*100:.2f}%')
print(f'Top 10 component variances: {pca.explained_variance_ratio_[:10]}')