# Feature Vector Data Preprocessing Pipeline
---
---

## Step-1: Install dependencies
---

In [None]:
# 0. Install / Import Dependencies
!pip install -q scikit-learn pandas matplotlib numpy seaborn
!pip install -q opendatasets  # For Kaggle dataset download

import numpy as np
import pandas as pd
import os
import glob
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("Dependencies installed successfully!")

## Step-2: Load Feature Vectors
---

### Download Dataset from Kaggle -->

In [None]:
# Dataset Selection - Choose which dataset to use
DATASET_CHOICE = 'ember'  # Change to 'bodmas' to use BODMAS dataset

# Kaggle dataset URLs
EMBER_URL = 'https://www.kaggle.com/datasets/dhoogla/ember-2018-v2-features'
BODMAS_URL = 'https://www.kaggle.com/datasets/dhoogla/bodmas'

# Fail fast on a typo instead of letting a later cell pick a dataset by accident.
if DATASET_CHOICE not in ('ember', 'bodmas'):
    raise ValueError(
        f"Unknown DATASET_CHOICE {DATASET_CHOICE!r}; expected 'ember' or 'bodmas'"
    )

print(f"Selected dataset: {DATASET_CHOICE.upper()}")

In [None]:
# Download datasets from Kaggle (requires Kaggle API credentials)
#
# Credentials must never be written into the notebook. Provide them by placing
# your kaggle.json file in the working directory; per the opendatasets README,
# the library otherwise asks for the username and API key interactively.
#
# SECURITY: a Kaggle username and API key used to be hard-coded in this cell.
# That key must be treated as leaked and revoked in the Kaggle account settings.
import opendatasets as od

# od.download(url) is called without a target directory, so the dataset is
# expected under ./<dataset-slug>. The path is set up front so that it is also
# available when the download fails and the files were fetched manually
# (adjust it as needed in that case).
if DATASET_CHOICE == 'ember':
    dataset_label = 'EMBER 2018 v2 Features'
    dataset_url = EMBER_URL
    data_path = './ember-2018-v2-features'
else:
    dataset_label = 'BODMAS'
    dataset_url = BODMAS_URL
    data_path = './bodmas'

try:
    print(f"Downloading {dataset_label} dataset...")
    od.download(dataset_url)
    print(f"Dataset downloaded to: {data_path}")
except Exception as e:
    # Deliberately best effort: the notebook can continue with a manually
    # downloaded copy located at data_path.
    print(f"Error downloading dataset: {e}")
    print("Please manually download the dataset or set up Kaggle API credentials")

### Load Dataset -->

In [None]:
# 1. Load EMBER Feature Vectors (the same file layout could hold BODMAS vectors)
#
# NOTE(review): this cell reads vectorized .dat files from DATA_PATH; it does
# not use `data_path` from the Kaggle download cell. Confirm which source is
# intended.
DATA_PATH = '/content/ember2018'  # adjust if needed

try:
    # Kept from the original cell, where it was described as
    # "download + extract if not present".
    # NOTE(review): the ember package is not installed by the dependency cell
    # and `EMBER2018` could not be confirmed as part of its API - verify.
    from ember import EMBER2018
    EMBER2018(DATA_PATH)
except ImportError as e:
    print(f"ember helper unavailable ({e}); expecting vectorized files in {DATA_PATH}")


def _read_label_file(path):
    """Memory-map a label file and detect its on-disk dtype.

    The ember package is understood to write labels as float32, while this
    notebook originally read them as int32. Both are 4 bytes wide, so the row
    count is the same either way; the dtype whose values all fall in
    {-1, 0, 1} is the right one.

    Raises:
        ValueError: if neither interpretation yields only -1/0/1 labels.
    """
    for candidate in (np.float32, np.int32):
        labels = np.memmap(path, dtype=candidate, mode='r')
        if np.isin(labels, (-1, 0, 1)).all():
            return labels
    raise ValueError(f"{path} does not contain -1/0/1 labels as float32 or int32")


# Load vectorized features
y_train_full = _read_label_file(f'{DATA_PATH}/y_train.dat')
n_rows = y_train_full.shape[0]

# Derive the feature dimension from the file size instead of hard-coding
# (rows, cols): a wrong shape either fails to map or silently misaligns rows.
X_flat = np.memmap(f'{DATA_PATH}/X_train.dat', dtype=np.float32, mode='r')
if X_flat.size % n_rows != 0:
    raise ValueError(
        f"X_train.dat holds {X_flat.size} float32 values, "
        f"which is not a multiple of the {n_rows} label rows"
    )
X_train_full = X_flat.reshape(n_rows, -1)

mask = y_train_full != -1  # keep labeled rows only
X_raw = X_train_full[mask]  # boolean indexing copies the selected rows into RAM
y_raw = y_train_full[mask].astype(np.int32)

print('Raw EMBER shape:', X_raw.shape, 'Labels:', y_raw.shape)

## Step-3: Train/Test Split
---

In [None]:
# 2. Train/Test Split (stratified)
# 20% of the labeled rows are held out; stratifying on y_raw keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.20, random_state=42, stratify=y_raw)
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, **split_options)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## Step-4: Standardization
---

In [None]:
# 3. Standardization (mean=0, std=1)
# The scaling statistics are learned from the training split only and then
# applied to both splits.
# copy=False asks scikit-learn to scale in place where it can, so X_train_std /
# X_test_std may share memory with X_train / X_test: do not rely on the
# unscaled arrays after this cell.
scaler = StandardScaler(copy=False)
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## Step-5: PCA30 Transformation of Feature Vectors
---

In [None]:
# 4. PCA → 30 Principal Components
# The projection is fitted on the standardized training split and re-used
# unchanged for the test split.
pca = PCA(
    n_components=30,
    svd_solver='full',
    random_state=42,
)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Share of the total variance captured by the retained components, in percent.
variance_kept_pct = pca.explained_variance_ratio_.sum()*100

print('After PCA:', X_train_pca.shape)
print('Variance kept:', variance_kept_pct, '%')