In [1]:
# General packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets as ds
import seaborn

# Classifiers
from sklearn import model_selection
from sklearn import metrics
from sklearn import feature_selection 
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import svm
from sklearn import decomposition
from sklearn.impute import KNNImputer

def data_preprocessing(X_train, y_train, return_transformers=False):
    """Scale, feature-select, PCA-project and impute the training data.

    Every step is fitted on the training data passed in:

    1. Standardise each feature with ``StandardScaler``.
    2. Select features with cross-validated recursive feature elimination
       (``RFECV`` around a linear ``SVC``, 4-fold stratified CV, scored by
       ROC AUC) and keep only the selected columns.
    3. Project the selected features with PCA, keeping
       ``min(n_samples, n_selected_features)`` components.
    4. Run one ``KNNImputer`` pass per placeholder value (0.0, then 1.0),
       each pass working on the output of the previous one.

    As a side effect, the function shows the RFECV score curve, prints the
    optimal number of features and, when at least two principal components
    exist, shows a scatter plot of the first two components coloured by
    ``y_train``.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,)
        Training labels. The ``'roc_auc'`` scorer presumably requires a
        binary target -- TODO confirm against the dataset.
    return_transformers : bool, default=False
        When True, also return the fitted preprocessing objects so the same
        transformations can be applied to held-out data.

    Returns
    -------
    X_train_imputed : ndarray of shape (n_samples, n_components)
        The preprocessed training data.
    fitted : dict
        Only returned when ``return_transformers`` is True. Keys:
        ``"scaler"``, ``"selector"``, ``"pca"`` and ``"imputers"`` (a list,
        in the order the imputers were applied).
    """
    # 1. Scaling. StandardScaler is used; MinMaxScaler and RobustScaler are
    # drop-in alternatives from the same module.
    scaler = preprocessing.StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # 2. Feature selection: recursive feature elimination with a
    # cross-validated choice of the number of features to keep.
    rfecv = feature_selection.RFECV(
        estimator=svm.SVC(kernel="linear"),
        step=1,
        cv=model_selection.StratifiedKFold(4),
        scoring="roc_auc",
    )
    rfecv.fit(X_train_scaled, y_train)

    # `grid_scores_` was removed in scikit-learn 1.2; prefer `cv_results_`
    # and fall back only on releases that predate it.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        mean_scores = cv_results["mean_test_score"]
    else:
        mean_scores = rfecv.grid_scores_

    # Plot number of features vs. cross-validation score.
    _, score_ax = plt.subplots()
    score_ax.plot(range(1, len(mean_scores) + 1), mean_scores)
    score_ax.set_xlabel("Number of features selected")
    score_ax.set_ylabel("Cross validation score (ROC AUC)")
    plt.show()
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Actually apply the selection, so PCA sees only the retained features
    # rather than using RFECV merely to count them.
    X_train_selected = rfecv.transform(X_train_scaled)

    # 3. PCA. The component count is bounded by both dimensions of the data,
    # so read them from the array instead of hard-coding the sample count.
    n_samples, n_selected_features = X_train_selected.shape
    n_components = min(n_samples, n_selected_features)
    pca = decomposition.PCA(n_components)
    X_train_pca = pca.fit_transform(X_train_selected)

    # A 2-D scatter needs at least two components.
    if n_components >= 2:
        _, pca_ax = plt.subplots()
        seaborn.scatterplot(
            x=X_train_pca[:, 0],
            y=X_train_pca[:, 1],
            hue=np.ravel(y_train),  # plain array avoids pandas index alignment
            ax=pca_ax,
        )
        pca_ax.set_xlabel("Principal component 1")
        pca_ax.set_ylabel("Principal component 2")
        plt.show()

    # 4. Imputation of missing data. Each pass starts from the previous
    # pass's output so that no placeholder's imputation is thrown away.
    # NOTE(review): the placeholders are matched against PCA scores here, not
    # against the raw features; confirm whether imputation was meant to run
    # before scaling instead.
    missing_values = [0.0, 1.0]
    imputers = []
    X_train_imputed = X_train_pca
    for placeholder in missing_values:
        imputer = KNNImputer(
            missing_values=placeholder, n_neighbors=5, weights='uniform'
        )
        X_train_imputed = imputer.fit_transform(X_train_imputed)
        imputers.append(imputer)

    if return_transformers:
        fitted = {
            "scaler": scaler,
            "selector": rfecv,
            "pca": pca,
            "imputers": imputers,
        }
        return X_train_imputed, fitted
    return X_train_imputed

In [None]:
# Preprocessing of the test set: apply an already-fitted scaler to X_test
# (transform only, no re-fitting).
# NOTE(review): in the code visible here, `scaler` is created only as a local
# variable inside data_preprocessing() and `X_test` is not defined; confirm
# both exist at notebook scope before this cell runs on a fresh kernel.
X_test_scaled = scaler.transform(X_test)

In [None]:
# PCA: cumulative explained variance against the number of components kept.
# NOTE(review): in the code visible here, `pca` is created only as a local
# variable inside data_preprocessing(); confirm a fitted PCA object exists at
# notebook scope before this cell runs on a fresh kernel.

# explained_variance_ratio_ holds fractions, so scale to percent to match the
# axis label, and count components from 1 (not the implicit 0-based index).
cumulative_variance_pct = 100 * np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance_pct)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative explained variance (%)')
plt.title('Pulsar Dataset Explained Variance')
plt.show()