# Feature Vector Data Preprocessing Pipeline
---
---

## Step-1: Install dependencies
---

In [None]:
!pip install -q ember==0.4.0 scikit-learn pandas matplotlib numpy
#ember dataset as default

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Step-2: Load Feature Vectors
---

In [None]:
# Load the vectorized EMBER training set (a BODMAS export can be substituted).
from ember import EMBER2018

DATA_PATH = '/content/ember2018'  # adjust if needed
# NOTE(review): this relies on `EMBER2018` from the installed `ember` package to
# download + extract the dataset when it is not present - confirm the API.
EMBER2018(DATA_PATH)

# Labels are stored as float32 by the EMBER tooling (same dtype as the feature
# matrix). Reading them as int32 reinterprets the raw bits, so 1.0 / -1.0 turn
# into huge integers and the "unlabeled" filter below would keep every row.
# Mapping the file without an explicit shape also yields the true row count.
y_train_full = np.memmap(f'{DATA_PATH}/y_train.dat', dtype=np.float32, mode='r')
n_rows = y_train_full.shape[0]

# Fail loudly if the label file does not decode to the expected value set
# (-1 = unlabeled, 0 = benign, 1 = malicious), e.g. because of a dtype mismatch.
unexpected = np.setdiff1d(np.unique(y_train_full), [-1, 0, 1])
if unexpected.size:
    raise ValueError(f'Unexpected label values in y_train.dat: {unexpected[:5]}')

# Derive the feature width from the file size instead of hard-coding it: the
# width depends on the EMBER feature version, and np.memmap does not complain
# when the requested shape is smaller than the file - rows would just be
# silently misaligned.
X_train_flat = np.memmap(f'{DATA_PATH}/X_train.dat', dtype=np.float32, mode='r')
if X_train_flat.size % n_rows != 0:
    raise ValueError(
        f'X_train.dat holds {X_train_flat.size} float32 values, which is not a '
        f'multiple of the {n_rows} rows found in y_train.dat')
X_train_full = X_train_flat.reshape(n_rows, -1)  # still a lazy on-disk view

# Keep labeled rows only; boolean indexing materializes the selection in RAM.
mask = y_train_full != -1
X_raw = X_train_full[mask]
y_raw = y_train_full[mask].astype(np.int32)  # integer class labels downstream

print('Raw EMBER shape:', X_raw.shape, 'Labels:', y_raw.shape)

## Step-3: Train/Test Split
---

In [None]:
# Hold out 20% of the labeled rows for testing.
# Stratifying on the labels keeps the class ratio the same in both splits;
# the fixed seed makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    y_raw,
    test_size=0.20,
    stratify=y_raw,
    random_state=42,
)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## Step-4: Standardization
---

In [None]:
# Standardize every feature to zero mean and unit variance.
# The statistics are learned from the training split only and then reused
# for the test split, so no test information leaks into the scaler.
# copy=False asks the scaler to work in place where it can, so X_train and
# X_test may themselves end up holding the scaled values - presumably chosen
# to avoid duplicating the large feature matrix.
scaler = StandardScaler(copy=False)
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## Step-5: PCA30 Transformation of Feature Vectors
---

In [None]:
# Project the standardized features onto the 30 leading principal components.
# The component basis is fitted on the training split and reused for the test split.
pca = PCA(
    n_components=30,
    svd_solver='full',
    random_state=42,
)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

print('After PCA:', X_train_pca.shape)
# Share of the total variance retained by the kept components, in percent.
variance_kept_pct = pca.explained_variance_ratio_.sum()*100
print('Variance kept:', variance_kept_pct, '%')