<div>
<h1 align="center">Dimensionality Reduction Vs. Feature Selection</h1>
<h3 align="center">Tabular Playground Series - Aug 2021</h3>
<h4 align="center">By: Saurabh Zinjad</h4>
<h3 align="center">If you find this work useful, please don't forget to upvote :)</h3>
</div>

This notebook applies two different approaches to reducing the number of features — dimensionality reduction (using PCA) and feature selection (using tree-based feature importances) — and compares them to determine which approach works better for the given dataset.

## 1. Libraries and packages setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

## 2. Read dataset

In [None]:
# Load the competition's training data, test data and sample submission (in that order).
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each frame, one per line, then preview the training data.
print(
    "The Shape of Training Set : {}".format(train.shape),
    "The Shape of Testing Set : {}".format(test.shape),
    "The Shape of Submission Set : {}".format(submission.shape),
    sep="\n",
)
train.head()

## 3. Plot the Target Distribution

In [None]:
# Bar chart of how many rows carry each `loss` value.
# value_counts() orders its result by frequency, so the counts are re-sorted by the
# loss value itself to make the x-axis read as a distribution. A title and axis
# labels are set so the figure can be understood on its own.
(
    train['loss']
    .value_counts()
    .sort_index()
    .plot(kind='bar', figsize=(16, 8), title='Distribution of the target (loss)')
    .set(xlabel='loss', ylabel='number of rows')
);

## 4. Checking Missing Values

In [None]:
# Count the NaN cells over the entire training frame (all rows and all columns at once).
print("Total number of missing values: ", train.isna().to_numpy().sum())

## 5. Feature selection

### 5.1. Information Gain - filter method

In [None]:
# from sklearn.feature_selection import mutual_info_classif

# importances = mutual_info_classif(X,y)
# feat_importances = pd.Series(importances, X.columns)
# feat_importances.plot(kind='barh', color='teal')
# plt.show()

In [None]:
# np.where(importances == importances.max())
# print("Column with max information gain: ", X.columns[82])

## 6. Correlation Analysis

In [None]:
# no correlation
# corr_train=train.corr()
# fig = plt.figure(figsize=(15,12))
# sns.heatmap(corr_train)
# corr_train.style.background_gradient(cmap='coolwarm')

In [None]:
# corr_train = corr_train['loss'].sort_values(ascending=False).round(2)
# corr_train

### The most correlated Features (Above 0)

In [None]:
# corr_train[corr_train >0]


## 7. Split dataset

In [None]:
# Features: every training column except the row identifier `id` and the target `loss`.
X = train.drop(columns=['id', 'loss'])
# Target the models are trained to predict.
y = train['loss']
# Test features: the identifier is dropped here as well.
X_test = test.drop(columns='id')

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

## 8. Dimensionality reduction With PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Min-max scale every feature, then project onto the first 49 principal components.
# The pipeline is fitted on the training split only, so neither the validation rows
# nor the test rows influence the scaling or the projection.
pca = Pipeline([
    ('scale', MinMaxScaler()),
    ('pca', PCA(n_components=49, random_state=0)),
]).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
# Build the test projection from the raw `test` frame instead of from X_test itself:
# X_test is overwritten on this line, so reading it back would make the cell fail
# (wrong number of columns) if it were ever executed a second time.
X_test = pca.transform(test.drop(columns='id'))

In [None]:
# check gpu 
import tensorflow as tf
# Report whether this TensorFlow build has CUDA support, then show the devices it can see.
cuda_build = tf.test.is_built_with_cuda()
print("is_built_with_cuda: ", cuda_build)
tf.config.list_physical_devices()

### 8.1 PCA Model I - XGBRegressor

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees trained on the PCA-projected training split.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, so the two values
# below (32 and 0.5) compete for the same L1 penalty. Confirm which one is intended and
# drop the other; both are kept as written so results do not change silently.
pca_xgb_model = XGBRegressor(
    max_depth=7,
    n_estimators=2500,
    learning_rate=0.008,
    subsample=0.84,
    colsample_bytree=0.70,
    booster='gbtree',
    tree_method='gpu_hist',
    reg_lambda=5,
    reg_alpha=32,
    alpha=0.5,
    n_jobs=4,
    random_state=123,
)
# `verbose=200` only prints anything when an evaluation set is supplied, so the
# validation split is passed here purely for progress reporting every 200 rounds.
# No early stopping is configured, so the fitted model is not affected by it.
pca_xgb_model.fit(
    X_train_pca,
    y_train,
    eval_set=[(X_val_pca, y_val)],
    verbose=200,
)

In [None]:
# Predict on the validation split and clamp the predictions to the observed target range.
pca_xgb_model_pred = np.clip(pca_xgb_model.predict(X_val_pca), y.min(), y.max())

# The reported value is the root-mean-squared error. The square root of the MSE is taken
# explicitly because the `squared=False` argument is deprecated in newer scikit-learn
# releases and later removed; this form works on old and new versions alike.
print(44 * '=')
print(f'XgBoost PCA - Mean Error: {np.sqrt(mean_squared_error(y_val, pca_xgb_model_pred))}')
print(44 * '=')

In [None]:
# Predict on the PCA-projected test rows and clamp to the observed target range.
pca_xgb_model_test_pred = np.clip(pca_xgb_model.predict(X_test), y.min(), y.max())

# save submission in csv file
pca_xgb_model_sub = submission.copy()
# Assign the prediction array itself rather than its raw `.data` memory buffer, and
# replace the second column by label so the float predictions are not forced into the
# dtype of the placeholder values that came with the sample submission.
pca_xgb_model_sub[pca_xgb_model_sub.columns[1]] = pca_xgb_model_test_pred
pca_xgb_model_sub.to_csv("submission_pca_xgb.csv", index=False)
print("submission_pca_xgb.csv file saved !!!")

### 8.2 PCA Model II - CatBoostRegressor

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# CatBoost regressor fitted on the PCA-projected training split.
# NOTE(review): an overfitting detector / early stopping is configured, but fit() receives
# no eval_set here, so it presumably has nothing to monitor - confirm whether one was intended.
pca_catb_model = CatBoostRegressor(
    # tree shape and boosting schedule
    iterations=1600,
    learning_rate=0.024,
    depth=6,
    grow_policy='Depthwise',
    # regularisation and sampling
    l2_leaf_reg=20,
    random_strength=1.5,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    # objective and stopping rule
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    early_stopping_rounds=500,
    # hardware and logging
    task_type='GPU',
    devices='0:1',
    thread_count=4,
    verbose=False,
    random_state=123,
)
pca_catb_model.fit(X_train_pca, y_train, verbose=200)

In [None]:
# Predict on the validation split and clamp the predictions to the observed target range.
pca_catb_model_pred = np.clip(pca_catb_model.predict(X_val_pca), y.min(), y.max())

# The reported value is the root-mean-squared error. The square root of the MSE is taken
# explicitly because the `squared=False` argument is deprecated in newer scikit-learn
# releases and later removed; this form works on old and new versions alike.
print(44 * '=')
print(f'CatBoost PCA - Mean Error: {np.sqrt(mean_squared_error(y_val, pca_catb_model_pred))}')
print(44 * '=')

In [None]:
# Predict on the PCA-projected test rows and clamp to the observed target range.
pca_catb_model_test_pred = np.clip(pca_catb_model.predict(X_test), y.min(), y.max())

# save submission in csv file
pca_catb_model_sub = submission.copy()
# Assign the prediction array itself rather than its raw `.data` memory buffer, and
# replace the second column by label so the float predictions are not forced into the
# dtype of the placeholder values that came with the sample submission.
pca_catb_model_sub[pca_catb_model_sub.columns[1]] = pca_catb_model_test_pred
pca_catb_model_sub.to_csv("submission_pca_catb.csv", index=False)
print("submission_pca_catb.csv file saved !!!")

## 9. Feature Selection using DecisionTree

### 9.1 Get best 50 features using `DecisionTreeRegressor`

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a single decision tree on the training split and tabulate its per-feature
# importances, indexed by feature name and sorted from most to least important.
dt = DecisionTreeRegressor(random_state=123)
dt.fit(X_train, y_train)
Features_importance = pd.DataFrame(
    {'importance': dt.feature_importances_},
    index=pd.Index(X.columns, name='feature'),
).sort_values('importance', ascending=False)
Features_importance

In [None]:
# Keep only the features whose importance exceeds 0.01.
# The mask is built from the `importance` column explicitly; comparing `.values` yields a
# 2-D array over the whole frame, which only selects rows correctly while the frame has
# exactly one column.
Features_importance = Features_importance.loc[Features_importance['importance'] > 0.01]
Features_importance.shape

In [None]:
# Display the index of the filtered importance table, i.e. the names of the retained features.
Features_importance.index

In [None]:
# Restrict both the training features and the test frame to the retained feature columns.
# Selecting by name from `test` leaves the `id` column out, since it is not a retained feature.
X_RF = X.loc[:, Features_importance.index]
X_test_RF = test.loc[:, Features_importance.index]
X_RF.shape, X_test_RF.shape

In [None]:
## Scaling data
# Split first, then fit the scaler on the training rows only. Fitting it on all rows
# before splitting would let the validation rows shape the mean/variance used for
# scaling, which leaks validation information into training.
X_train_RF, X_val_RF, y_train_RF, y_val_RF = train_test_split(
    X_RF, y, test_size=0.3, random_state=1
)
scaler = StandardScaler().fit(X_train_RF)
X_train_RF = scaler.transform(X_train_RF)
X_val_RF = scaler.transform(X_val_RF)
# The test matrix is rebuilt from the raw `test` frame so that re-running this cell
# never scales already-scaled data.
X_test_RF = scaler.transform(test[Features_importance.index])

### 9.2 RF Model I - XGBRegressor

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees trained on the scaled, importance-selected training split.
# NOTE(review): XGBoost documents `alpha` as an alias of `reg_alpha`, so the two values
# below (32 and 0.5) compete for the same L1 penalty - confirm which one is intended.
rf_xgb_model = XGBRegressor(
    # tree shape and boosting schedule
    n_estimators=2500,
    learning_rate=0.008,
    max_depth=7,
    booster='gbtree',
    tree_method='gpu_hist',
    # row / column sampling
    subsample=0.84,
    colsample_bytree=0.70,
    # regularisation
    reg_lambda=5,
    reg_alpha=32,
    alpha=0.5,
    # execution
    n_jobs=4,
    random_state=123,
)
rf_xgb_model.fit(X_train_RF, y_train_RF)

In [None]:
# Predict on the validation split and clamp the predictions to the observed target range.
rf_xgb_model_pred = np.clip(rf_xgb_model.predict(X_val_RF), y.min(), y.max())

# The reported value is the root-mean-squared error. The square root of the MSE is taken
# explicitly because the `squared=False` argument is deprecated in newer scikit-learn
# releases and later removed; this form works on old and new versions alike.
print(44 * '=')
print(f'XgBoost RF - Mean Error: {np.sqrt(mean_squared_error(y_val_RF, rf_xgb_model_pred))}')
print(44 * '=')

In [None]:
# Predict on the selected-feature test matrix and clamp to the observed target range.
rf_xgb_model_test_pred = np.clip(rf_xgb_model.predict(X_test_RF), y.min(), y.max())

# save submission in csv file
rf_xgb_model_sub = submission.copy()
# Assign the prediction array itself rather than its raw `.data` memory buffer, and
# replace the second column by label so the float predictions are not forced into the
# dtype of the placeholder values that came with the sample submission.
rf_xgb_model_sub[rf_xgb_model_sub.columns[1]] = rf_xgb_model_test_pred
rf_xgb_model_sub.to_csv("submission_rf_xgb.csv", index=False)
print("submission_rf_xgb.csv file saved !!!")

### 9.3 RF Model II - CatBoostRegressor

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# CatBoost regressor fitted on the scaled, importance-selected training split.
# NOTE(review): an overfitting detector / early stopping is configured, but fit() receives
# no eval_set here, so it presumably has nothing to monitor - confirm whether one was intended.
rf_catb_model = CatBoostRegressor(
    # tree shape and boosting schedule
    iterations=1600,
    learning_rate=0.024,
    depth=6,
    grow_policy='Depthwise',
    # regularisation and sampling
    l2_leaf_reg=20,
    random_strength=1.5,
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Newton',
    # objective and stopping rule
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    early_stopping_rounds=500,
    # hardware and logging
    task_type='GPU',
    devices='0:1',
    thread_count=4,
    verbose=False,
    random_state=123,
)
rf_catb_model.fit(X_train_RF, y_train_RF, verbose=200)

In [None]:
# Predict on the validation split and clamp the predictions to the observed target range.
rf_catb_model_pred = np.clip(rf_catb_model.predict(X_val_RF), y.min(), y.max())

# The reported value is the root-mean-squared error. The square root of the MSE is taken
# explicitly because the `squared=False` argument is deprecated in newer scikit-learn
# releases and later removed; this form works on old and new versions alike.
print(44 * '=')
print(f'catBoost RF - Mean Error: {np.sqrt(mean_squared_error(y_val_RF, rf_catb_model_pred))}')
print(44 * '=')

In [None]:
# Predict on the selected-feature test matrix and clamp to the observed target range.
rf_catb_model_test_pred = np.clip(rf_catb_model.predict(X_test_RF), y.min(), y.max())

# save submission in csv file
rf_catb_model_sub = submission.copy()
# Assign the prediction array itself rather than its raw `.data` memory buffer, and
# replace the second column by label so the float predictions are not forced into the
# dtype of the placeholder values that came with the sample submission.
rf_catb_model_sub[rf_catb_model_sub.columns[1]] = rf_catb_model_test_pred
rf_catb_model_sub.to_csv("submission_rf_catb.csv", index=False)
print("submission_rf_catb.csv file saved !!!")

## 10. Final result

In [None]:
# Side-by-side validation scores of the four model / feature-set combinations.
# Each value is the root-mean-squared error; the square root of the MSE is taken
# explicitly because `squared=False` is deprecated in newer scikit-learn releases
# and later removed.
print(f'XgBoost PCA - Mean Error : {np.sqrt(mean_squared_error(y_val, pca_xgb_model_pred))}')
print(f'CatBoost PCA - Mean Error: {np.sqrt(mean_squared_error(y_val, pca_catb_model_pred))}')
print(f'XgBoost RF - Mean Error  : {np.sqrt(mean_squared_error(y_val_RF, rf_xgb_model_pred))}')
print(f'CatBoost RF - Mean Error : {np.sqrt(mean_squared_error(y_val_RF, rf_catb_model_pred))}')