# Module 0 Project 2: Data Collection

Implement a data preprocessing pipeline with [sklearn](https://scikit-learn.org/stable/), applying advanced preprocessing techniques and data augmentation to a sample dataset.

## STEP 1: IMPORTS AND DATASET GENERATION
- Import necessary libraries
- Generate a random dataset (for ease of use) that includes categorical values and null values

In [None]:
# Imports
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

# Synthetic sample data: seed NumPy's global RNG so every run yields the same
# frame. The columns are drawn in a fixed order because each draw advances the
# shared random state.
np.random.seed(42)
n_rows = 100

data = {}
# Integers in [1, 100)
data['numerical_1'] = np.random.randint(1, 100, size=n_rows)
# Gaussian values centred on 1000 with standard deviation 100
data['numerical_2'] = np.random.normal(loc=1000, scale=100, size=n_rows)
# One of 5, 10, 15, or a missing value (NaN)
data['numerical_3'] = np.random.choice([np.nan, 5, 10, 15], size=n_rows)
# One of three category labels
data['categorical'] = np.random.choice(['A', 'B', 'C'], size=n_rows)
# Binary target (0 or 1)
data['target'] = np.random.randint(0, 2, size=n_rows)

df = pd.DataFrame(data)

## STEP 2: DEFINE FEATURES AND SPLIT DATA
- Define the feature types and split up the data into train and test sets
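- We want to do this before any preprocessing steps, so that the transformers are fitted on the training data only and no information from the test set leaks into them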

In [None]:
# Column groups; each group is routed to its own preprocessing branch later on
numerical_features = ['numerical_1', 'numerical_2', 'numerical_3']
categorical_features = ['categorical']

# Hold out 20% of the rows as a test set. random_state pins the shuffle so the
# split is reproducible.
features = df.drop(columns='target')
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

## STEP 3: DEFINE SKLEARN PIPELINE
- Define the pipeline steps for the preprocessing pipeline setup
- Handle categorical values, null values, scaling and standardization/normalization
- Add a polynomial-feature step that augments the numerical data with squared and interaction terms

In [None]:
# Numerical branch: replace missing values with the column mean, standardize,
# then expand with degree-2 polynomial terms (squares and pairwise products;
# include_bias=False omits the constant column).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

# Categorical branch: one-hot encode. handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros at transform time instead
# of raising, so held-out data can always be transformed.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own branch and concatenate the results.
# sparse_threshold=0 makes the combined output always a dense array, so it can
# be passed straight to NumPy routines such as np.savetxt regardless of how
# sparse the one-hot block is.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

# Final pipeline: currently only the preprocessing stage
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

## STEP 4: FIT THE PIPELINE, DISPLAY AND SAVE THE DATA
- Fit the pipeline on the training split of our generated dataset
- Display the preprocessed and augmented data and save it to a CSV file

In [None]:
# Learn the preprocessing parameters from the training split and apply them to
# that same split in one call, then show the resulting matrix
X_train_augmented = pipeline.fit_transform(X=X_train, y=y_train)
print(X_train_augmented)

# Write the transformed training matrix to disk as comma-separated values
np.savetxt(fname="data.csv", X=X_train_augmented, delimiter=",")