In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import TensorDataset,DataLoader

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
# import xgboost as xgb

# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
# Use the 'fivethirtyeight' matplotlib style sheet for all figures.
plt.style.use('fivethirtyeight')

In [None]:
# Read the three competition CSV files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Display the first rows of the training frame as a quick look at the loaded columns.
train.head()

In [None]:
# Row-wise summary statistics are added as three extra features.
# Their names also start with 'f', so they are excluded explicitly from the
# base column list: otherwise re-running this cell would feed f100-f102 back
# into the statistics (changing the median) and the cell would not be idempotent.
engineered_cols = ('f100', 'f101', 'f102')

base_feature_cols = [
    col for col in train.columns
    if col.startswith('f') and col not in engineered_cols
]

# f100: per-row maximum over the base features
train['f100'] = train[base_feature_cols].max(axis=1)
test['f100'] = test[base_feature_cols].max(axis=1)

# f101: per-row minimum over the base features
train['f101'] = train[base_feature_cols].min(axis=1)
test['f101'] = test[base_feature_cols].min(axis=1)

# f102: per-row median over the base features
train['f102'] = train[base_feature_cols].median(axis=1)
test['f102'] = test[base_feature_cols].median(axis=1)

# Final list of model inputs: every 'f*' column, now including the engineered ones.
feature_cols = [col for col in train.columns if col.startswith('f')]

In [None]:
# Feature matrix and target vector of the training data as NumPy arrays.
X = train[feature_cols].values
y = train.target.values

# `test` is rebound from a DataFrame to a NumPy array here. Guard the
# conversion so that re-running this cell does not try to index the
# already-converted array with column names (which raises IndexError).
if isinstance(test, pd.DataFrame):
    test = test[feature_cols].values

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Row counts of the three splits, gathered first and reported afterwards.
n_train, n_valid, n_test = (split.shape[0] for split in (xtrain, xvalid, test))

print(f'No of train samples: {n_train}')
print(f'No of validation samples: {n_valid}')
print(f'No of test samples: {n_test}')

## Perform PCA
### In this section:
 - We fit `PCA` with 50 components
 - Then we plot the data points against the top two principal components

In [None]:
def apply_transformations(transformer=False, n_components=50, random_state=1234):
    """Run PCA on the validation split and plot the projection and scree plot.

    The global ``xvalid`` matrix is optionally preprocessed with
    ``transformer``, then reduced with PCA. Two panels are drawn: the first
    two principal components coloured by the labels in the global ``yvalid``
    (classes 0 and 1), and the percent of variance explained per component.

    Parameters
    ----------
    transformer : object with ``fit``/``transform`` or a falsy value, default=False
        Preprocessing step applied before PCA. Any falsy value (``False``,
        ``None``) skips preprocessing.
    n_components : int, default=50
        Number of principal components to keep.
    random_state : int or None, default=1234
        Seed passed to ``PCA`` so that plots are repeatable when a randomized
        SVD solver is selected.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    data = xvalid
    if transformer:
        transformer.fit(data)
        data = transformer.transform(data)

    # Fit exactly once: fit_transform learns the components and projects the
    # data in a single pass (a separate .fit() beforehand is wasted work).
    pca = PCA(n_components=n_components, random_state=random_state)
    projected = pca.fit_transform(data)

    percent_explained = 100 * pca.explained_variance_ratio_
    total_explained = np.sum(percent_explained)

    fig, ax = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: one scatter series per class label.
    for lab in range(2):
        mask = yvalid == lab
        ax[0].plot(
            projected[mask, 0],
            projected[mask, 1],
            'o',
            markersize=3,
            alpha=.4,
            label=str(lab),
        )
    ax[0].set_xlabel('PC1 projection')
    ax[0].set_ylabel('PC2 projection')
    ax[0].legend()
    ax[0].set_title(f'PCA of data\n\nPercent variance explained: {total_explained:.2f}%')

    # Right panel: scree plot. The line carries a label so the legend is not empty.
    ax[1].plot(percent_explained, 'ms--', label='Explained variance')
    ax[1].set_xlabel('Components')
    ax[1].set_ylabel('Percent variance explained')
    ax[1].set_title('PCA scree plot')
    ax[1].legend()

    plt.show()

## Without any transformations

In [None]:
# Baseline: no transformer is passed, so PCA runs on the validation features as they are.
apply_transformations()

## Apply StandardScaler()

In [None]:
# StandardScaler centres each feature to zero mean and scales it to unit variance before PCA.
apply_transformations(StandardScaler())

## Apply PowerTransformer() with default parameters

In [None]:
# PowerTransformer with scikit-learn defaults (Yeo-Johnson, followed by standardization) before PCA.
apply_transformations(PowerTransformer())

## Apply MinMaxScaler()

In [None]:
# MinMaxScaler rescales each feature to the default [0, 1] range before PCA.
apply_transformations(MinMaxScaler())

## Apply Binarizer() with default parameters

In [None]:
# Binarizer with the default threshold of 0.0: values above 0 become 1, all others become 0.
apply_transformations(Binarizer())

## Apply MaxAbsScaler()

In [None]:
# MaxAbsScaler divides each feature by its maximum absolute value (no centring) before PCA.
apply_transformations(MaxAbsScaler())

## Apply QuantileTransformer()

In [None]:
# QuantileTransformer with scikit-learn defaults maps each feature to a uniform distribution before PCA.
apply_transformations(QuantileTransformer())

## Apply OrdinalEncoder()

In [None]:
# NOTE(review): OrdinalEncoder is designed for categorical columns; on continuous features
# each distinct value presumably becomes its own category code — confirm this is intended.
apply_transformations(OrdinalEncoder())

## Apply Normalizer()

In [None]:
# Normalizer rescales each sample (row), not each feature, to unit L2 norm by default.
apply_transformations(Normalizer())