# Feature Engineering Approach towards Tabular Playground Feb

In this notebook, I use several feature engineering and selection methods on the February Tabular Playground dataset. Because this is synthetic data, feature engineering and selection produce lower-accuracy prediction submissions than leaving the features untouched. However, I think it is worth sharing my approaches for others to learn from and potentially use in other competitions that use real data, where this approach would be appropriate.

 - Requirements: feature_engine-1.0.2

In [None]:
#!pip install feature_engine

## Exploratory Data Analysis

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import *
from feature_engine.encoding import CountFrequencyEncoder
from matplotlib.lines import Line2D
from pylab import rcParams
rcParams['figure.figsize'] = 15,15

In [None]:
# Read the competition train and test CSVs, using the `id` column as the index.
path = "../input/tabular-playground-series-feb-2021/"
train, test = (
    pd.read_csv(f"{path}{split}.csv", index_col="id") for split in ("train", "test")
)
train.head(5)

In [None]:
# Check for duplicates:

duplicates = train.duplicated()
duplicates.sum()

### Numerical columns:

In [None]:
# Select the thirteen continuous features cont0 .. cont12 from the training set.
cont_cols = [f"cont{i}" for i in range(13)]
numerical_columns = train[cont_cols]
numerical_columns.head(5)

In [None]:
numerical_columns.shape

In [None]:
numerical_columns.to_csv("numerical_columns.csv")

In [None]:
numerical_columns.hist(figsize=((20,20)), alpha=0.5, animated=True, edgecolor='blue', color='lightblue', grid=False);

In [None]:
numerical_columns.describe()

#### Check for outliers:

In [None]:
numerical_columns.boxplot(showcaps=True, showfliers=True)
plt.xticks(rotation=90);

##### "cont0" has outliers:

In [None]:
sns.violinplot(numerical_columns["cont0"], color='lightblue', showcaps=True, showfliers=True)
sns.stripplot(numerical_columns["cont0"], color='red', alpha=0.01);

In [None]:
# Find z-scores and put a threshold of 3 to determine which outliers
# are too much and need to be cut:
z=np.abs(stats.zscore(numerical_columns["cont0"]))
print(z)

print(np.where(z > 3))

This shows that there is only one outlier that has a z-score greater than 3. We must locate this in order to remove it:

In [None]:
numerical_columns["cont0"].iloc[[132579]]

In [None]:
#remove this outlier:
numerical_columns = numerical_columns.loc[numerical_columns["cont0"] >= -0.093]

In [None]:
# check it is removed:
numerical_columns["cont0"].sort_values(ascending=True)

##### "cont2" outliers:

In [None]:
sns.violinplot(numerical_columns["cont2"], color='lightblue', showcaps=True, showfliers=True)
sns.stripplot(numerical_columns["cont2"], color='red', alpha=0.01);

In [None]:
# Find z-scores and put a threshold of 3 to determine which outliers
# are too much and need to be cut:
z=np.abs(stats.zscore(numerical_columns["cont2"]))
print(z)

print(np.where(z > 3))

The outliers we see on the boxplot for "cont2" are not too bad according to the zscore, so we leave them in.

##### "cont6" outliers:

In [None]:
sns.violinplot(numerical_columns["cont6"], color='lightblue', showcaps=True, showfliers=True)
sns.stripplot(numerical_columns["cont6"], color='red', alpha=0.01);

In [None]:
# Find z-scores and put a threshold of 3 to determine which outliers
# are too much and need to be cut:
z=np.abs(stats.zscore(numerical_columns["cont6"]))
print(z)
print(np.where(z > 3))

In [None]:
# Find the rows that sit on either side of the |z| = 3 threshold.
# The boundary is derived from the data itself rather than compared against
# pasted floating-point constants: an exact `==` on a hand-copied float matches
# nothing as soon as the data or library version changes the last digit.
z_values = np.asarray(z)
x = sorted(z_values.tolist(), reverse=True)
print(x[110:130])

above = np.flatnonzero(z_values > 3)  # positions with z-score beyond the threshold
below = np.flatnonzero(z_values < 3)  # positions with z-score inside the threshold
# Smallest z-score still above 3 / largest z-score still below 3 (ties are all kept).
first_above = above[z_values[above] == z_values[above].min()] if above.size else above
last_below = below[z_values[below] == z_values[below].max()] if below.size else below
print("Index of first value with z-score > 3: ", (first_above,))
print("Index of last value with z-score < 3: ", (last_below,))

In [None]:
print(numerical_columns["cont6"].iloc[[16410]])
print(numerical_columns["cont6"].iloc[[254380]])

In [None]:
# remove the outliers with value greater than 1.055627:
numerical_columns = numerical_columns.loc[numerical_columns["cont6"] <= 1.055627]

In [None]:
# check they are removed:
numerical_columns["cont6"].sort_values(ascending=False).head(5)

##### "cont8" outliers:

In [None]:
sns.violinplot(numerical_columns["cont8"], color='lightblue', showcaps=True, showfliers=True)
sns.stripplot(numerical_columns["cont8"], color='red', alpha=0.01);

In [None]:
# Find z-scores and put a threshold of 3 to determine which outliers
# are too much and need to be cut:
z=np.abs(stats.zscore(numerical_columns["cont8"]))
print(z)
print(np.where(z > 3))

The outliers we see on the boxplot for "cont8" are not too bad according to the zscore, so we leave them in.

#### Correlation matrix:

In [None]:
# Attach the target to the numeric frame and plot the correlation matrix.
# `numerical_columns` was produced by `.loc` filtering in earlier cells, so
# assigning a column into it in place triggers pandas' SettingWithCopyWarning;
# `assign` returns a new frame instead (values are aligned on the `id` index)
# and makes this cell safe to re-run.
numerical_columns = numerical_columns.assign(target=train["target"])
corr = numerical_columns.corr()
sns.heatmap(corr, annot=True);

In [None]:
print("Numerical columns most correlated with target column:\n", abs(corr["target"]).sort_values(ascending=False))

Numerical columns dataframe with removed outliers:

In [None]:
numerical_columns.head(5)

In [None]:
numerical_columns.to_csv("numerical_columns_NO.csv")

### Categorical columns:

In [None]:
categorical_columns = train[["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"]]
categorical_columns.head(5)

In [None]:
categorical_columns.describe(include='all')

In [None]:
# One histogram per categorical column, laid out on a 3x4 grid.
num_rows, num_cols = 3, 4
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(12, 12))
for index, column in enumerate(categorical_columns.columns):
    i, j = divmod(index, num_cols)  # row-major position of this column's panel
    sns.histplot(x=column, data=categorical_columns, ax=axes[i, j])

# The grid has more panels than there are columns; hide the unused ones so the
# figure does not end with empty axes.
for unused_ax in axes.flat[len(categorical_columns.columns):]:
    unused_ax.set_visible(False)
f.tight_layout();

### Target:

In [None]:
target = train["target"]
target.to_csv("target.csv")

In [None]:
# Kernel density estimate of the target.
# `fill=True` replaces the deprecated `shade=True` keyword; the notebook already
# relies on a seaborn version that provides `histplot`, which also provides `fill`.
ax = sns.kdeplot(target, fill=True, color='red', edgecolor='black', alpha=0.5, zorder=3)
plt.title('Target Distribution', fontsize=20);

In [None]:
target.describe()

#### Check for outliers:

In [None]:
plt.boxplot(target, showcaps=True, showfliers=True)
plt.xticks(rotation=90);

In [None]:
# Find z-scores and put a threshold of 3 to determine which outliers
# are too much and need to be cut:
z=np.abs(stats.zscore(target))
print(z)

print(np.where(z > 3))

In [None]:
# Find the rows that sit on either side of the |z| = 3 threshold for the target.
# The boundary is derived from the data itself rather than compared against
# pasted floating-point constants: an exact `==` on a hand-copied float matches
# nothing as soon as the data or library version changes the last digit.
z_values = np.asarray(z)
x = sorted(z_values.tolist(), reverse=True)
print(x[400:450])

above = np.flatnonzero(z_values > 3)  # positions with z-score beyond the threshold
below = np.flatnonzero(z_values < 3)  # positions with z-score inside the threshold
# Smallest z-score still above 3 / largest z-score still below 3 (ties are all kept).
first_above = above[z_values[above] == z_values[above].min()] if above.size else above
last_below = below[z_values[below] == z_values[below].max()] if below.size else below
print("Index of first value with z-score > 3: ", (first_above,))
print("Index of first value with z-score < 3: ", (last_below,))

In [None]:
print(target.iloc[[3882]])
print(target.iloc[[9720]])

In [None]:
# remove the outliers with value less than 4.794575:
target = target.loc[target >= 4.794575]

In [None]:
# check they are removed:
plt.boxplot(target, showcaps=True, showfliers=True)
plt.xticks(rotation=90);

In [None]:
target.to_csv("target_NO.csv")

## Test dataset

In [None]:
test.head(5)

In [None]:
# Check for duplicates:

duplicates = test.duplicated()
duplicates.sum()

In [None]:
numerical_test = test[["cont0", "cont1", "cont2", "cont3", "cont4", "cont5", "cont6", "cont7", "cont8", "cont9", "cont10", "cont11", "cont12"]]
numerical_test.head(5)

In [None]:
categorical_test = test[["cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"]]
categorical_test.head(5)

In [None]:
categorical_test.describe(include='all')

We see the test set has the same features and range of values as the train set. So we do not make any changes to the train set in relation to the test set.

***

# Feature Engineering
## Categorical columns:

In [None]:
categorical_columns.head(5)

#### Ordinal Encoder:

In [None]:
# Fit an ordinal encoder on the training categoricals and encode them.
# NOTE(review): `from category_encoders import *` runs after the sklearn encoder
# imports, so `OrdinalEncoder` here presumably resolves to the category_encoders
# class rather than sklearn's - confirm before touching the import cell.
enc = OrdinalEncoder()
X = categorical_columns
enc.fit(X)
ordinal_categorical_columns = enc.transform(categorical_columns)
# Wrap the result in a DataFrame so it can be previewed and saved with to_csv.
ordinal_categorical_columns = pd.DataFrame(ordinal_categorical_columns)
ordinal_categorical_columns.head(5)

In [None]:
ordinal_categorical_columns.shape

In [None]:
ordinal_categorical_columns.to_csv("ordinal_categorical_columns.csv")

#### One-Hot Encoding:

In [None]:
# Fit a one-hot encoder on the training categoricals and encode them.
# NOTE(review): because of the later `from category_encoders import *`,
# `OneHotEncoder` presumably resolves to the category_encoders class; the
# `.head(5)` call below relies on `transform` returning a DataFrame - confirm.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(categorical_columns)
onehot_categorical_columns = enc.transform(categorical_columns)
onehot_categorical_columns.head(5)

In [None]:
onehot_categorical_columns.shape

In [None]:
onehot_categorical_columns.to_csv("onehot_categorical_columns.csv")

#### Binary Encoding:

In [None]:
enc = BinaryEncoder().fit(categorical_columns)
binary_categorical_columns = enc.transform(categorical_columns)
binary_categorical_columns.head(5)

In [None]:
binary_categorical_columns.shape

In [None]:
binary_categorical_columns.to_csv("binary_categorical_columns.csv")

#### Frequency encoding

In [None]:
encoder = CountFrequencyEncoder(encoding_method='frequency')
encoder.fit(categorical_columns)
freq_categorical_columns = encoder.transform(categorical_columns)
freq_categorical_columns.head(5)

In [None]:
freq_categorical_columns.shape

In [None]:
freq_categorical_columns.to_csv("freq_categorical_columns.csv")

### Point for next step - modelling:
- Try each different encoded set for the categorical_columns.
- See which one gives best result in modelling.

## Numerical columns

### Numerical Columns with Outliers:

In [None]:
numerical_columns = pd.read_csv("numerical_columns.csv")
numerical_columns.head(5)

In [None]:
# Build log(1 + x) versions of cont0 .. cont12, keeping `id` as the first
# column so the result can be merged with other frames on `id`.
cont_names = [f"cont{i}" for i in range(13)]
log_numerical_columns = pd.DataFrame()
log_numerical_columns["id"] = numerical_columns["id"]

log_means = {}
for name in cont_names:
    log_name = f"{name}_log"
    log_numerical_columns[log_name] = np.log(1 + numerical_columns[name])
    # Mean of the transformed column, rounded to two decimals.
    log_means[name] = np.round(np.mean(log_numerical_columns[log_name]), 2)

# Expose the rounded means under the individual names other cells refer to.
(cont0_log_mean, cont1_log_mean, cont2_log_mean, cont3_log_mean,
 cont4_log_mean, cont5_log_mean, cont6_log_mean, cont7_log_mean,
 cont8_log_mean, cont9_log_mean, cont10_log_mean, cont11_log_mean,
 cont12_log_mean) = (log_means[name] for name in cont_names)

In [None]:
log_numerical_columns.head(5)

In [None]:
log_numerical_columns.shape

In [None]:
log_numerical_columns.to_csv("log_numerical_columns.csv")

#### Plots comparing original with log transformation

In [None]:
# Overlay the raw cont0 histogram (green, semi-transparent) with its log(1 + x)
# transform (light blue) and mark the mean of the transformed values (red line).
plt.hist(numerical_columns["cont0"], color='green', alpha=0.5, edgecolor='black')
plt.hist(log_numerical_columns['cont0_log'], bins=30, color='lightblue', edgecolor='black')
plt.axvline(cont0_log_mean, color='red')
plt.title('cont0 histogram before and after Log Transform', fontsize=20)
plt.xlabel('cont0 (log scale)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Custom legend: each swatch must use the same colour as the artist it labels
# (the raw histogram is drawn in green, not blue).
custom_lines = [Line2D([0], [0], color='green', alpha=0.5, lw=4),
                Line2D([0], [0], color='lightblue', lw=4),
                Line2D([0], [0], color='red', lw=4)]

plt.legend(custom_lines, ['cont0', 'cont0_log', 'cont0_log_mean']);

In [None]:
# Overlay the raw cont1 histogram (green, semi-transparent) with its log(1 + x)
# transform (light blue) and mark the mean of the transformed values (red line).
plt.hist(numerical_columns["cont1"], color='green', alpha=0.5, edgecolor='black')
plt.hist(log_numerical_columns['cont1_log'], bins=30, color='lightblue', edgecolor='black')
plt.axvline(cont1_log_mean, color='red')
plt.title('cont1 histogram before and after Log Transform', fontsize=20)
plt.xlabel('cont1 (log scale)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Custom legend: each swatch must use the same colour as the artist it labels
# (the raw histogram is drawn in green, not blue).
custom_lines = [Line2D([0], [0], color='green', alpha=0.5, lw=4),
                Line2D([0], [0], color='lightblue', lw=4),
                Line2D([0], [0], color='red', lw=4)]

plt.legend(custom_lines, ['cont1', 'cont1_log', 'cont1_log_mean']);

#### Check how the log transformations are correlated with target:

In [None]:
# Join the log-transformed features with the target on `id` and plot the
# pairwise correlation matrix.
# NOTE(review): `target` was filtered to values >= 4.794575 in an earlier cell
# and is not reloaded before this point, so this merge (inner by default) also
# drops the rows whose target was removed - confirm that is intended for the
# "with outliers" comparison.
corr_test = log_numerical_columns.merge(target, on="id")
# Drop the first column (`id`) so it does not appear in the correlation matrix.
corr_test = corr_test.iloc[::, 1:]
corr = corr_test.corr()
sns.heatmap(corr, annot=True);

In [None]:
print("Numerical columns most correlated with target column:\n", abs(corr["target"]).sort_values(ascending=False))

## Numerical Columns without Outliers:

In [None]:
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
numerical_columns_NO.head(5)

In [None]:
# Build log(1 + x) versions of cont0 .. cont12 from the outlier-filtered frame,
# keeping `id` as the first column so the result can be merged on `id`.
cont_names = [f"cont{i}" for i in range(13)]
log_numerical_columns = pd.DataFrame()
log_numerical_columns["id"] = numerical_columns_NO["id"]

log_means = {}
for name in cont_names:
    log_name = f"{name}_log"
    log_numerical_columns[log_name] = np.log(1 + numerical_columns_NO[name])
    # Mean of the transformed column, rounded to two decimals.
    log_means[name] = np.round(np.mean(log_numerical_columns[log_name]), 2)

# Expose the rounded means under the individual names other cells refer to.
(cont0_log_mean, cont1_log_mean, cont2_log_mean, cont3_log_mean,
 cont4_log_mean, cont5_log_mean, cont6_log_mean, cont7_log_mean,
 cont8_log_mean, cont9_log_mean, cont10_log_mean, cont11_log_mean,
 cont12_log_mean) = (log_means[name] for name in cont_names)

In [None]:
log_numerical_columns.head(5)

In [None]:
log_numerical_columns.shape

In [None]:
log_numerical_columns.to_csv("log_numerical_columns_NO.csv")

#### Plots comparing original with log transformation

In [None]:
# Overlay the raw cont0 histogram (green, semi-transparent) with its log(1 + x)
# transform (light blue) and mark the mean of the transformed values (red line).
# This section is about the outlier-filtered data, so the "before" histogram
# must come from `numerical_columns_NO` - the same frame the log columns above
# were computed from - not from the unfiltered `numerical_columns`.
plt.hist(numerical_columns_NO["cont0"], color='green', alpha=0.5, edgecolor='black')
plt.hist(log_numerical_columns['cont0_log'], bins=30, color='lightblue', edgecolor='black')
plt.axvline(cont0_log_mean, color='red')
plt.title('cont0 histogram before and after Log Transform', fontsize=20)
plt.xlabel('cont0 (log scale)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Custom legend: each swatch must use the same colour as the artist it labels
# (the raw histogram is drawn in green, not blue).
custom_lines = [Line2D([0], [0], color='green', alpha=0.5, lw=4),
                Line2D([0], [0], color='lightblue', lw=4),
                Line2D([0], [0], color='red', lw=4)]

plt.legend(custom_lines, ['cont0', 'cont0_log', 'cont0_log_mean']);

#### Check how the log transformations are correlated with target:

In [None]:
# Join the log-transformed features (computed from the outlier-filtered frame
# above) with the target on `id` and plot the pairwise correlation matrix.
corr_test = log_numerical_columns.merge(target, on="id")
# Drop the first column (`id`) so it does not appear in the correlation matrix.
corr_test = corr_test.iloc[::, 1:]
corr = corr_test.corr()
sns.heatmap(corr, annot=True);

In [None]:
print("Numerical columns most correlated with target column:\n", abs(corr["target"]).sort_values(ascending=False))

We see that the log transformations are more correlated with the target when the outliers from the original numerical columns are **NOT REMOVED**.

***

# Test Dataset
### Possible transformations

### Point:
Whatever transformations we apply to the train dataset, we must apply consistently to the test dataset too, saving each variant as a separate CSV for the modelling step.

For example:
- Binary encoded train set - must also have binary encoded test set.
- Log transformed train set - must also have log transformed test set.

In [None]:
test.head(5)

## Numerical transformations:
#### Log transformations:

In [None]:
# Move `id` out of the index into a regular column so it can be selected and
# merged on below. The guard makes the cell idempotent: an unconditional
# in-place reset would, on a second run, add a spurious `index` column instead.
if "id" not in test.columns:
    test = test.reset_index()
test.head(5)

In [None]:
numerical_test = test[["id", "cont0", "cont1", "cont2", "cont3", "cont4", "cont5", "cont6", "cont7", "cont8", "cont9", "cont10", "cont11", "cont12"]]
numerical_test.head(5)

In [None]:
numerical_test.shape

In [None]:
numerical_test.to_csv("numerical_test.csv")

In [None]:
# Build log(1 + x) versions of cont0 .. cont12 for the test set, keeping `id`
# as the first column so the result can be merged on `id`.
cont_names = [f"cont{i}" for i in range(13)]
log_numerical_test = pd.DataFrame()
log_numerical_test["id"] = numerical_test["id"]

log_means = {}
for name in cont_names:
    log_name = f"{name}_log"
    log_numerical_test[log_name] = np.log(1 + numerical_test[name])
    # Mean of the transformed column, rounded to two decimals.
    log_means[name] = np.round(np.mean(log_numerical_test[log_name]), 2)

# Rebind the per-column mean names to the test-set values, as before.
(cont0_log_mean, cont1_log_mean, cont2_log_mean, cont3_log_mean,
 cont4_log_mean, cont5_log_mean, cont6_log_mean, cont7_log_mean,
 cont8_log_mean, cont9_log_mean, cont10_log_mean, cont11_log_mean,
 cont12_log_mean) = (log_means[name] for name in cont_names)

In [None]:
log_numerical_test.head(5)

In [None]:
log_numerical_test.to_csv("log_numerical_test.csv")

## Categorical Transformations:

In [None]:
categorical_test = test[["id", "cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9"]]
categorical_test.head(5)

In [None]:
categorical_test.shape

#### Ordinal Encoder:

In [None]:
# Encode the test categoricals with a mapping learned from the TRAINING
# categoricals. Fitting a fresh encoder on the test set can give the same
# category different integer codes in train and test, which makes the two
# encoded sets incompatible for a single model.
enc = OrdinalEncoder()
enc.fit(categorical_columns)  # training frame: cat0 .. cat9, `id` is its index
X = categorical_test
# `id` is an identifier, not a feature: keep it out of the encoder, then put it
# back as the first column so the saved CSV keeps the same layout as before.
ordinal_categorical_test = pd.DataFrame(enc.transform(X.drop(columns="id")))
ordinal_categorical_test.insert(0, "id", X["id"])
ordinal_categorical_test.head(5)

In [None]:
ordinal_categorical_test.to_csv("ordinal_categorical_test.csv")

#### One-Hot Encoding:

In [None]:
# One-hot encode the test categoricals with an encoder fitted on the TRAINING
# categoricals, so train and test end up with the same indicator columns.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(categorical_columns)  # training frame: cat0 .. cat9, `id` is its index
# `id` is an identifier, not a feature: keep it out of the encoder, then put it
# back as the first column so the saved CSV keeps the same layout as before.
onehot_categorical_test = enc.transform(categorical_test.drop(columns="id"))
onehot_categorical_test.insert(0, "id", categorical_test["id"])
onehot_categorical_test.head(5)

In [None]:
onehot_categorical_test.to_csv("onehot_categorical_test.csv")

#### Binary Encoding:

In [None]:
# Binary-encode the test categoricals with an encoder fitted on the TRAINING
# categoricals, so a given category gets the same bit pattern in train and test.
enc = BinaryEncoder().fit(categorical_columns)  # training frame: cat0 .. cat9
# `id` is an identifier, not a feature: keep it out of the encoder, then put it
# back as the first column so the saved CSV keeps the same layout as before.
binary_categorical_test = enc.transform(categorical_test.drop(columns="id"))
binary_categorical_test.insert(0, "id", categorical_test["id"])
binary_categorical_test.head(5)

In [None]:
binary_categorical_test.to_csv("binary_categorical_test.csv")

#### Frequency encoding

In [None]:
# Frequency-encode the test categoricals using category frequencies learned
# from the TRAINING categoricals, so the same category maps to the same value
# in train and test.
encoder = CountFrequencyEncoder(encoding_method='frequency')
encoder.fit(categorical_columns)  # training frame: cat0 .. cat9
# `id` is an identifier, not a feature: keep it out of the encoder, then put it
# back as the first column so the saved CSV keeps the same layout as before.
freq_categorical_test = encoder.transform(categorical_test.drop(columns="id"))
freq_categorical_test.insert(0, "id", categorical_test["id"])
freq_categorical_test.head(5)

In [None]:
freq_categorical_test.to_csv("freq_categorical_test.csv")

***

# Combining csv's accordingly:

### Training datasets:

### **binary_categorical_columns** with:
- numerical_columns
- log_numerical_columns
- numerical_columns_NO
- log_numerical_columns_NO

* ***target***
* ***target_NO***

### **ordinal_categorical_columns** with:
- numerical_columns
- log_numerical_columns
- numerical_columns_NO
- log_numerical_columns_NO

* ***target***
* ***target_NO***

### **freq_categorical_columns** with:
- numerical_columns
- log_numerical_columns
- numerical_columns_NO
- log_numerical_columns_NO

* ***target***
* ***target_NO***

In [None]:
target = pd.read_csv("target.csv")
target.shape

In [None]:
target.head(5)

In [None]:
# Join binary_categorical_columns with numerical_columns and target on `id`,
# then save the combined frame.
binary_categorical_columns = pd.read_csv("binary_categorical_columns.csv")
numerical_columns = pd.read_csv("numerical_columns.csv")
binary_num = (
    binary_categorical_columns
    .merge(numerical_columns, on="id")
    .merge(target, on="id")
)
binary_num.to_csv("binary_num.csv")

In [None]:
# Join binary_categorical_columns with log_numerical_columns and target on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
binary_log = (
    binary_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target, on="id")
)
binary_log.to_csv("binary_log.csv")

In [None]:
# Join binary_categorical_columns with numerical_columns_NO and target on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
binary_num_NO = (
    binary_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target, on="id")
)
binary_num_NO.to_csv("binary_num_NO.csv")

In [None]:
# Join binary_categorical_columns with log_numerical_columns_NO and target on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
binary_log_NO = (
    binary_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target, on="id")
)
binary_log_NO.to_csv("binary_log_NO.csv")

***

In [None]:
target_NO = pd.read_csv("target_NO.csv")
target_NO.shape

In [None]:
# Join binary_categorical_columns with numerical_columns and target_NO on `id`.
numerical_columns = pd.read_csv("numerical_columns.csv")
binary_num_TNO = (
    binary_categorical_columns
    .merge(numerical_columns, on="id")
    .merge(target_NO, on="id")
)
binary_num_TNO.to_csv("binary_num_TNO.csv")

In [None]:
# Join binary_categorical_columns with log_numerical_columns and target_NO on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
binary_log_TNO = (
    binary_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target_NO, on="id")
)
binary_log_TNO.to_csv("binary_log_TNO.csv")

In [None]:
# Join binary_categorical_columns with numerical_columns_NO and target_NO on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
binary_num_NO_TNO = (
    binary_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
binary_num_NO_TNO.to_csv("binary_num_NO_TNO.csv")

In [None]:
# Join binary_categorical_columns with log_numerical_columns_NO and target_NO on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
binary_log_NO_TNO = (
    binary_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
binary_log_NO_TNO.to_csv("binary_log_NO_TNO.csv")

***

In [None]:
# Join ordinal_categorical_columns with numerical_columns and target on `id`,
# then save the combined frame.
ordinal_categorical_columns = pd.read_csv("ordinal_categorical_columns.csv")
numerical_columns = pd.read_csv("numerical_columns.csv")
ordinal_num = (
    ordinal_categorical_columns
    .merge(numerical_columns, on="id")
    .merge(target, on="id")
)
ordinal_num.to_csv("ordinal_num.csv")

In [None]:
# Join ordinal_categorical_columns with log_numerical_columns and target on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
ordinal_log = (
    ordinal_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target, on="id")
)
ordinal_log.to_csv("ordinal_log.csv")

In [None]:
# Join ordinal_categorical_columns with numerical_columns_NO and target on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
ordinal_num_NO = (
    ordinal_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target, on="id")
)
ordinal_num_NO.to_csv("ordinal_num_NO.csv")

In [None]:
# Join ordinal_categorical_columns with log_numerical_columns_NO and target on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
ordinal_log_NO = (
    ordinal_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target, on="id")
)
ordinal_log_NO.to_csv("ordinal_log_NO.csv")

***

In [None]:
# Join ordinal_categorical_columns with numerical_columns and target_NO on `id`.
numerical_columns = pd.read_csv("numerical_columns.csv")
ordinal_num_TNO = ordinal_categorical_columns.merge(numerical_columns, on='id')
ordinal_num_TNO = ordinal_num_TNO.merge(target_NO, on='id')
# Write to the file named after this frame. The previous name,
# "ordinal_log_TNO.csv", belongs to a different frame, so this data was
# overwritten and "ordinal_num_TNO.csv" was never produced.
ordinal_num_TNO.to_csv("ordinal_num_TNO.csv")

In [None]:
# Join ordinal_categorical_columns with log_numerical_columns and target_NO on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
ordinal_log_TNO = (
    ordinal_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target_NO, on="id")
)
ordinal_log_TNO.to_csv("ordinal_log_TNO.csv")

In [None]:
# Join ordinal_categorical_columns with numerical_columns_NO and target_NO on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
ordinal_num_NO_TNO = (
    ordinal_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
ordinal_num_NO_TNO.to_csv("ordinal_num_NO_TNO.csv")

In [None]:
# Join ordinal_categorical_columns with log_numerical_columns_NO and target_NO on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
ordinal_log_NO_TNO = (
    ordinal_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
ordinal_log_NO_TNO.to_csv("ordinal_log_NO_TNO.csv")

***

In [None]:
# Join freq_categorical_columns with numerical_columns and target on `id`,
# then save the combined frame.
freq_categorical_columns = pd.read_csv("freq_categorical_columns.csv")
numerical_columns = pd.read_csv("numerical_columns.csv")
freq_num = (
    freq_categorical_columns
    .merge(numerical_columns, on="id")
    .merge(target, on="id")
)
freq_num.to_csv("freq_num.csv")

In [None]:
# Join freq_categorical_columns with log_numerical_columns and target on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
freq_log = (
    freq_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target, on="id")
)
freq_log.to_csv("freq_log.csv")

In [None]:
# Join freq_categorical_columns with numerical_columns_NO and target on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
freq_num_NO = (
    freq_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target, on="id")
)
freq_num_NO.to_csv("freq_num_NO.csv")

In [None]:
# Join freq_categorical_columns with log_numerical_columns_NO and target on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
freq_log_NO = (
    freq_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target, on="id")
)
freq_log_NO.to_csv("freq_log_NO.csv")

***

In [None]:
# Join freq_categorical_columns with numerical_columns and target_NO on `id`.
numerical_columns = pd.read_csv("numerical_columns.csv")
freq_num_TNO = freq_categorical_columns.merge(numerical_columns, on='id')
freq_num_TNO = freq_num_TNO.merge(target_NO, on='id')
# Write to the file named after this frame. The previous name,
# "freq_log_TNO.csv", belongs to a different frame, so this data was
# overwritten and "freq_num_TNO.csv" was never produced.
freq_num_TNO.to_csv("freq_num_TNO.csv")

In [None]:
# Join freq_categorical_columns with log_numerical_columns and target_NO on `id`.
log_numerical_columns = pd.read_csv("log_numerical_columns.csv")
freq_log_TNO = (
    freq_categorical_columns
    .merge(log_numerical_columns, on="id")
    .merge(target_NO, on="id")
)
freq_log_TNO.to_csv("freq_log_TNO.csv")

In [None]:
# Join freq_categorical_columns with numerical_columns_NO and target_NO on `id`.
numerical_columns_NO = pd.read_csv("numerical_columns_NO.csv")
freq_num_NO_TNO = (
    freq_categorical_columns
    .merge(numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
freq_num_NO_TNO.to_csv("freq_num_NO_TNO.csv")

In [None]:
# Join freq_categorical_columns with log_numerical_columns_NO and target_NO on `id`.
log_numerical_columns_NO = pd.read_csv("log_numerical_columns_NO.csv")
freq_log_NO_TNO = (
    freq_categorical_columns
    .merge(log_numerical_columns_NO, on="id")
    .merge(target_NO, on="id")
)
freq_log_NO_TNO.to_csv("freq_log_NO_TNO.csv")

## Test datasets:

### **binary_categorical_test** with:
- numerical_test
- log_numerical_test

### **ordinal_categorical_test** with:
- numerical_test
- log_numerical_test

### **freq_categorical_test** with:
- numerical_test
- log_numerical_test

In [None]:
# Join binary_categorical_test with numerical_test on `id` and save the result.
binary_categorical_test = pd.read_csv("binary_categorical_test.csv")
numerical_test = pd.read_csv("numerical_test.csv")
binary_num_test = pd.merge(binary_categorical_test, numerical_test, on="id")
binary_num_test.to_csv("binary_num_test.csv")

In [None]:
# Join binary_categorical_test with log_numerical_test on `id` and save the result.
log_numerical_test = pd.read_csv("log_numerical_test.csv")
binary_log_test = pd.merge(binary_categorical_test, log_numerical_test, on="id")
binary_log_test.to_csv("binary_log_test.csv")

In [None]:
# Join ordinal_categorical_test with numerical_test on `id` and save the result.
ordinal_categorical_test = pd.read_csv("ordinal_categorical_test.csv")
numerical_test = pd.read_csv("numerical_test.csv")
ordinal_num_test = pd.merge(ordinal_categorical_test, numerical_test, on="id")
ordinal_num_test.to_csv("ordinal_num_test.csv")

In [None]:
# Join ordinal_categorical_test with log_numerical_test on `id` and save the result.
log_numerical_test = pd.read_csv("log_numerical_test.csv")
ordinal_log_test = pd.merge(ordinal_categorical_test, log_numerical_test, on="id")
ordinal_log_test.to_csv("ordinal_log_test.csv")

In [None]:
# Join freq_categorical_test with numerical_test on `id` and save the result.
freq_categorical_test = pd.read_csv("freq_categorical_test.csv")
numerical_test = pd.read_csv("numerical_test.csv")
freq_num_test = pd.merge(freq_categorical_test, numerical_test, on="id")
freq_num_test.to_csv("freq_num_test.csv")

In [None]:
# Join freq_categorical_test with log_numerical_test on `id` and save the result.
log_numerical_test = pd.read_csv("log_numerical_test.csv")
freq_log_test = pd.merge(freq_categorical_test, log_numerical_test, on="id")
freq_log_test.to_csv("freq_log_test.csv")

# Feature Selection

## Univariate Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

### Training datasets:

1. binary_log:

In [None]:
# Training frame: keep id/target aside for re-attachment after selection, then
# drop the bookkeeping columns from the frame used for feature selection.
binary_log = pd.read_csv("binary_log.csv", index_col="Unnamed: 0")
train_id = binary_log.loc[:, ["id", "target"]]
binary_log = binary_log.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep id aside, drop the two leftover index columns and id.
binary_log_test = pd.read_csv("binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test[["id"]]
binary_log_test = binary_log_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Integer-typed columns are the encoded categorical features.
feature_train = binary_log.select_dtypes(include=['int64']).columns

# Keep 5 categorical features
# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
X_cat_train = selector.fit_transform(binary_log[feature_train], binary_log['target'])
# Carry binary_log's row labels: this frame is later merged by index, and a default
# RangeIndex would only line up if binary_log's index were exactly 0..n-1.
X_cat_train = pd.DataFrame(X_cat_train, index=binary_log.index)

In [None]:
# Apply the categorical selector fitted on the training frame to the test frame.
feature_test = binary_log_test.select_dtypes(include=['int64']).columns
X_cat_test = selector.transform(binary_log_test[feature_test])
# Carry binary_log_test's row labels so the later index-based merge aligns by label
# instead of relying on the index being exactly 0..n-1.
X_cat_test = pd.DataFrame(X_cat_test, index=binary_log_test.index)
X_cat_test

In [None]:
# Float-typed columns are the numerical features; the target is float too, so exclude it.
feature_train = binary_log.select_dtypes(include=['float64']).columns.drop("target")

# Keep 5 numerical features, scored with the univariate F-test.
selector = SelectKBest(score_func=f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_log[feature_train], binary_log['target'])
)
X_num_train

In [None]:
# Apply the numerical selector fitted on the training frame to the test frame.
feature_test = binary_log_test.select_dtypes(include=['float64']).columns
X_num_test = pd.DataFrame(selector.transform(binary_log_test[feature_test]))
X_num_test

In [None]:
# Get back the features we've kept, zero out all other features
selected_features_train = pd.DataFrame(selector.inverse_transform(X_num_train), 
                                 index=binary_log.index, 
                                 columns=feature_train)
selected_features_train

In [None]:
# Get back the features we've kept, zero out all other features
selected_features_test = pd.DataFrame(selector.inverse_transform(X_num_test), 
                                 index=binary_log_test.index, 
                                 columns=feature_test)
selected_features_test

In [None]:
# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns_train = selected_features_train.columns[selected_features_train.var() != 0]

# Get the valid dataset with the selected features.
UNI_train_binary_log = binary_log[selected_columns_train]
UNI_train_binary_log

In [None]:
# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns_test = selected_features_test.columns[selected_features_test.var() != 0]

# Get the valid dataset with the selected features.
UNI_test_binary_log = binary_log_test[selected_columns_test]
UNI_test_binary_log

In [None]:
# Attach the 5 selected categorical columns (named 0..4), matching rows by index label.
UNI_train_binary_log = pd.merge(
    UNI_train_binary_log, X_cat_train, left_index=True, right_index=True
)
UNI_train_binary_log

In [None]:
# Attach the 5 selected categorical columns (named 0..4), matching rows by index label.
UNI_test_binary_log = pd.merge(
    UNI_test_binary_log, X_cat_test, left_index=True, right_index=True
)
UNI_test_binary_log

In [None]:
# Re-attach the id/target columns kept in train_id, matching rows by index label.
UNI_train_binary_log = UNI_train_binary_log.merge(
    train_id, left_index=True, right_index=True
)
UNI_train_binary_log

In [None]:
# Re-attach the id column kept in test_id, matching rows by index label.
UNI_test_binary_log = UNI_test_binary_log.merge(
    test_id, left_index=True, right_index=True
)
UNI_test_binary_log

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_log.to_csv(path_or_buf="UNI_train_binary_log.csv")
UNI_test_binary_log.to_csv(path_or_buf="UNI_test_binary_log.csv")

***

2. binary_log_NO

In [None]:
# Training frame: normalise the target column name, keep id/target aside,
# then drop the bookkeeping columns.
binary_log_NO = pd.read_csv("binary_log_NO.csv", index_col="Unnamed: 0")
binary_log_NO = binary_log_NO.rename(columns={'target_y': 'target'})
train_id = binary_log_NO.loc[:, ["id", "target"]]
binary_log_NO = binary_log_NO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: read from the shared "binary_log_test.csv" file.
binary_log_test_NO = pd.read_csv("binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test_NO[["id"]]
binary_log_test_NO = binary_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for binary_log_NO: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = binary_log_NO.select_dtypes(include=['int64']).columns
feature_test = binary_log_test_NO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(binary_log_NO[feature_train], binary_log_NO['target']),
    index=binary_log_NO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(binary_log_test_NO[feature_test]),
    index=binary_log_test_NO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = binary_log_NO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = binary_log_test_NO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_log_NO[feature_train], binary_log_NO['target'])
)
X_num_test = pd.DataFrame(selector.transform(binary_log_test_NO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_binary_log_NO = binary_log_NO[selected_columns_train]
UNI_test_binary_log_NO = binary_log_test_NO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_binary_log_NO = (
    UNI_train_binary_log_NO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_binary_log_NO = (
    UNI_test_binary_log_NO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_log_NO.to_csv(path_or_buf="UNI_train_binary_log_NO.csv")
UNI_test_binary_log_NO.to_csv(path_or_buf="UNI_test_binary_log_NO.csv")

3. binary_log_NO_TNO

In [None]:
# Training frame: keep id/target aside, then drop the bookkeeping columns.
binary_log_NO_TNO = pd.read_csv("binary_log_NO_TNO.csv", index_col="Unnamed: 0")
train_id = binary_log_NO_TNO.loc[:, ["id", "target"]]
binary_log_NO_TNO = binary_log_NO_TNO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: read from the shared "binary_log_test.csv" file.
binary_log_test_NO_TNO = pd.read_csv("binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test_NO_TNO[["id"]]
binary_log_test_NO_TNO = binary_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for binary_log_NO_TNO: keep the 5 best int64 (encoded
# categorical) columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = binary_log_NO_TNO.select_dtypes(include=['int64']).columns
feature_test = binary_log_test_NO_TNO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(binary_log_NO_TNO[feature_train], binary_log_NO_TNO['target']),
    index=binary_log_NO_TNO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(binary_log_test_NO_TNO[feature_test]),
    index=binary_log_test_NO_TNO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = binary_log_NO_TNO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = binary_log_test_NO_TNO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_log_NO_TNO[feature_train], binary_log_NO_TNO['target'])
)
X_num_test = pd.DataFrame(selector.transform(binary_log_test_NO_TNO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_binary_log_NO_TNO = binary_log_NO_TNO[selected_columns_train]
UNI_test_binary_log_NO_TNO = binary_log_test_NO_TNO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_binary_log_NO_TNO = (
    UNI_train_binary_log_NO_TNO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_binary_log_NO_TNO = (
    UNI_test_binary_log_NO_TNO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_log_NO_TNO.to_csv(path_or_buf="UNI_train_binary_log_NO_TNO.csv")
UNI_test_binary_log_NO_TNO.to_csv(path_or_buf="UNI_test_binary_log_NO_TNO.csv")

4. binary_num:

In [None]:
# Training frame: normalise the target column name, keep id/target aside, drop id.
# NOTE(review): unlike the *_NO loaders, no "target_x" column is dropped here. Confirm
# binary_num.csv does not carry one, otherwise it stays in as a float feature and
# leaks the target into the numerical selection.
binary_num = pd.read_csv("binary_num.csv", index_col="Unnamed: 0")
binary_num = binary_num.rename(columns={'target_y': 'target'})
train_id = binary_num.loc[:, ["id", "target"]]
binary_num = binary_num.drop(columns="id")

# Test frame: keep id aside, drop the two leftover index columns and id.
binary_num_test = pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
test_id = binary_num_test[["id"]]
binary_num_test = binary_num_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for binary_num: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = binary_num.select_dtypes(include=['int64']).columns
feature_test = binary_num_test.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(binary_num[feature_train], binary_num['target']),
    index=binary_num.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(binary_num_test[feature_test]),
    index=binary_num_test.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = binary_num.select_dtypes(include=['float64']).columns.drop("target")
feature_test = binary_num_test.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_num[feature_train], binary_num['target'])
)
X_num_test = pd.DataFrame(selector.transform(binary_num_test[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_binary_num = binary_num[selected_columns_train]
UNI_test_binary_num = binary_num_test[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_binary_num = (
    UNI_train_binary_num
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_binary_num = (
    UNI_test_binary_num
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_num.to_csv(path_or_buf="UNI_train_binary_num.csv")
UNI_test_binary_num.to_csv(path_or_buf="UNI_test_binary_num.csv")

5. binary_num_NO

In [None]:
# Training frame: "target_y" becomes the target, id/target are kept aside, and the
# duplicate "target_x" column is dropped together with id.
binary_num_NO = pd.read_csv("binary_num_NO.csv", index_col="Unnamed: 0")
binary_num_NO = binary_num_NO.rename(columns={'target_y': 'target'})
train_id = binary_num_NO.loc[:, ["id", "target"]]
binary_num_NO = binary_num_NO.drop(columns=["target_x", "id"])

# Test frame: read from the shared "binary_num_test.csv" file.
binary_num_test_NO = pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
test_id = binary_num_test_NO[["id"]]
binary_num_test_NO = binary_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for binary_num_NO: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = binary_num_NO.select_dtypes(include=['int64']).columns
feature_test = binary_num_test_NO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(binary_num_NO[feature_train], binary_num_NO['target']),
    index=binary_num_NO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(binary_num_test_NO[feature_test]),
    index=binary_num_test_NO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = binary_num_NO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = binary_num_test_NO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_num_NO[feature_train], binary_num_NO['target'])
)
X_num_test = pd.DataFrame(selector.transform(binary_num_test_NO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_binary_num_NO = binary_num_NO[selected_columns_train]
UNI_test_binary_num_NO = binary_num_test_NO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_binary_num_NO = (
    UNI_train_binary_num_NO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_binary_num_NO = (
    UNI_test_binary_num_NO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_num_NO.to_csv(path_or_buf="UNI_train_binary_num_NO.csv")
UNI_test_binary_num_NO.to_csv(path_or_buf="UNI_test_binary_num_NO.csv")

6. binary_num_NO_TNO

In [None]:
# Training frame: "target_y" becomes the target, id/target are kept aside, and the
# duplicate "target_x" column is dropped together with id.
binary_num_NO_TNO = pd.read_csv("binary_num_NO_TNO.csv", index_col="Unnamed: 0")
binary_num_NO_TNO = binary_num_NO_TNO.rename(columns={'target_y': 'target'})
train_id = binary_num_NO_TNO.loc[:, ["id", "target"]]
binary_num_NO_TNO = binary_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: read from the shared "binary_num_test.csv" file.
binary_num_test_NO_TNO = pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
test_id = binary_num_test_NO_TNO[["id"]]
binary_num_test_NO_TNO = binary_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for binary_num_NO_TNO: keep the 5 best int64 (encoded
# categorical) columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = binary_num_NO_TNO.select_dtypes(include=['int64']).columns
feature_test = binary_num_test_NO_TNO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(binary_num_NO_TNO[feature_train], binary_num_NO_TNO['target']),
    index=binary_num_NO_TNO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(binary_num_test_NO_TNO[feature_test]),
    index=binary_num_test_NO_TNO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = binary_num_NO_TNO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = binary_num_test_NO_TNO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(binary_num_NO_TNO[feature_train], binary_num_NO_TNO['target'])
)
X_num_test = pd.DataFrame(selector.transform(binary_num_test_NO_TNO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_binary_num_NO_TNO = binary_num_NO_TNO[selected_columns_train]
UNI_test_binary_num_NO_TNO = binary_num_test_NO_TNO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_binary_num_NO_TNO = (
    UNI_train_binary_num_NO_TNO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_binary_num_NO_TNO = (
    UNI_test_binary_num_NO_TNO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_binary_num_NO_TNO.to_csv(path_or_buf="UNI_train_binary_num_NO_TNO.csv")
UNI_test_binary_num_NO_TNO.to_csv(path_or_buf="UNI_test_binary_num_NO_TNO.csv")

7. ordinal_num

In [None]:
# Training frame: normalise the target column name, keep id/target aside, drop id.
# NOTE(review): unlike the *_NO loaders, no "target_x" column is dropped here. Confirm
# ordinal_num.csv does not carry one, otherwise it stays in as a float feature and
# leaks the target into the numerical selection.
ordinal_num = pd.read_csv("ordinal_num.csv", index_col="Unnamed: 0")
ordinal_num = ordinal_num.rename(columns={'target_y': 'target'})
train_id = ordinal_num.loc[:, ["id", "target"]]
ordinal_num = ordinal_num.drop(columns="id")

# Test frame: keep id aside, drop the two leftover index columns and id.
ordinal_num_test = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test[["id"]]
ordinal_num_test = ordinal_num_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for ordinal_num: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = ordinal_num.select_dtypes(include=['int64']).columns
feature_test = ordinal_num_test.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(ordinal_num[feature_train], ordinal_num['target']),
    index=ordinal_num.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(ordinal_num_test[feature_test]),
    index=ordinal_num_test.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = ordinal_num.select_dtypes(include=['float64']).columns.drop("target")
feature_test = ordinal_num_test.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(ordinal_num[feature_train], ordinal_num['target'])
)
X_num_test = pd.DataFrame(selector.transform(ordinal_num_test[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_ordinal_num = ordinal_num[selected_columns_train]
UNI_test_ordinal_num = ordinal_num_test[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_ordinal_num = (
    UNI_train_ordinal_num
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_ordinal_num = (
    UNI_test_ordinal_num
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_ordinal_num.to_csv(path_or_buf="UNI_train_ordinal_num.csv")
UNI_test_ordinal_num.to_csv(path_or_buf="UNI_test_ordinal_num.csv")

8. ordinal_num_NO

In [None]:
# Training frame: "target_y" becomes the target, id/target are kept aside, and the
# duplicate "target_x" column is dropped together with id.
ordinal_num_NO = pd.read_csv("ordinal_num_NO.csv", index_col="Unnamed: 0")
ordinal_num_NO = ordinal_num_NO.rename(columns={'target_y': 'target'})
train_id = ordinal_num_NO.loc[:, ["id", "target"]]
ordinal_num_NO = ordinal_num_NO.drop(columns=["target_x", "id"])

# Test frame: read from the shared "ordinal_num_test.csv" file.
ordinal_num_test_NO = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test_NO[["id"]]
ordinal_num_test_NO = ordinal_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for ordinal_num_NO: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = ordinal_num_NO.select_dtypes(include=['int64']).columns
feature_test = ordinal_num_test_NO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(ordinal_num_NO[feature_train], ordinal_num_NO['target']),
    index=ordinal_num_NO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(ordinal_num_test_NO[feature_test]),
    index=ordinal_num_test_NO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = ordinal_num_NO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = ordinal_num_test_NO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(ordinal_num_NO[feature_train], ordinal_num_NO['target'])
)
X_num_test = pd.DataFrame(selector.transform(ordinal_num_test_NO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_ordinal_num_NO = ordinal_num_NO[selected_columns_train]
UNI_test_ordinal_num_NO = ordinal_num_test_NO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_ordinal_num_NO = (
    UNI_train_ordinal_num_NO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_ordinal_num_NO = (
    UNI_test_ordinal_num_NO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_ordinal_num_NO.to_csv(path_or_buf="UNI_train_ordinal_num_NO.csv")
UNI_test_ordinal_num_NO.to_csv(path_or_buf="UNI_test_ordinal_num_NO.csv")

9. ordinal_num_NO_TNO

In [None]:
# Training frame: "target_y" becomes the target, id/target are kept aside, and the
# duplicate "target_x" column is dropped together with id.
ordinal_num_NO_TNO = pd.read_csv("ordinal_num_NO_TNO.csv", index_col="Unnamed: 0")
ordinal_num_NO_TNO = ordinal_num_NO_TNO.rename(columns={'target_y': 'target'})
train_id = ordinal_num_NO_TNO.loc[:, ["id", "target"]]
ordinal_num_NO_TNO = ordinal_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: read from the shared "ordinal_num_test.csv" file.
ordinal_num_test_NO_TNO = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test_NO_TNO[["id"]]
ordinal_num_test_NO_TNO = ordinal_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate selection for ordinal_num_NO_TNO: keep the 5 best int64 (encoded
# categorical) columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = ordinal_num_NO_TNO.select_dtypes(include=['int64']).columns
feature_test = ordinal_num_test_NO_TNO.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(ordinal_num_NO_TNO[feature_train], ordinal_num_NO_TNO['target']),
    index=ordinal_num_NO_TNO.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(ordinal_num_test_NO_TNO[feature_test]),
    index=ordinal_num_test_NO_TNO.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = ordinal_num_NO_TNO.select_dtypes(include=['float64']).columns.drop("target")
feature_test = ordinal_num_test_NO_TNO.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(ordinal_num_NO_TNO[feature_train], ordinal_num_NO_TNO['target'])
)
X_num_test = pd.DataFrame(selector.transform(ordinal_num_test_NO_TNO[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_ordinal_num_NO_TNO = ordinal_num_NO_TNO[selected_columns_train]
UNI_test_ordinal_num_NO_TNO = ordinal_num_test_NO_TNO[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_ordinal_num_NO_TNO = (
    UNI_train_ordinal_num_NO_TNO
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_ordinal_num_NO_TNO = (
    UNI_test_ordinal_num_NO_TNO
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_ordinal_num_NO_TNO.to_csv(path_or_buf="UNI_train_ordinal_num_NO_TNO.csv")
UNI_test_ordinal_num_NO_TNO.to_csv(path_or_buf="UNI_test_ordinal_num_NO_TNO.csv")

10. ordinal_log

In [None]:
# Load the ordinal-encoded + log-transformed train/test sets for section 10.
# These must be the ordinal_log files: reading "ordinal_num_NO_TNO.csv" and
# "ordinal_num_test.csv" here would make this section an exact repeat of section 9
# saved under the ordinal_log name.
ordinal_log = pd.read_csv("ordinal_log.csv", index_col="Unnamed: 0")
ordinal_log = ordinal_log.rename(columns={'target_y': 'target'})
train_id = ordinal_log[["id", "target"]]
# NOTE(review): ordinal_log.csv is written elsewhere in the notebook, so its exact
# leftover columns are not visible here. Drop whichever of the usual bookkeeping
# columns it carries and ignore the ones it does not have.
ordinal_log = ordinal_log.drop(columns=["Unnamed: 0.1", "target_x", "id"], errors="ignore")

# Test frame: "ordinal_log_test.csv" is the ordinal categoricals merged with the
# log-transformed test numericals.
ordinal_log_test = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = pd.DataFrame(ordinal_log_test["id"])
ordinal_log_test = ordinal_log_test.drop(["Unnamed: 0_x", "Unnamed: 0_y", "id"], axis=1)

In [None]:
# Univariate selection for ordinal_log: keep the 5 best int64 (encoded categorical)
# columns by mutual information and the 5 best float64 columns by F-score.
# Both selectors are fitted on the training frame and then applied to the test frame.

# -- categorical (int64) columns --
feature_train = ordinal_log.select_dtypes(include=['int64']).columns
feature_test = ordinal_log_test.select_dtypes(include=['int64']).columns

# NOTE(review): mutual_info_regression is randomized and no random_state is set,
# so the chosen columns may differ between runs.
selector = SelectKBest(mutual_info_regression, k=5)
# Carry the source row labels so the index-based merges below align by label;
# a default RangeIndex only lines up when the source index is exactly 0..n-1.
X_cat_train = pd.DataFrame(
    selector.fit_transform(ordinal_log[feature_train], ordinal_log['target']),
    index=ordinal_log.index,
)
X_cat_test = pd.DataFrame(
    selector.transform(ordinal_log_test[feature_test]),
    index=ordinal_log_test.index,
)

# -- numerical (float64) columns; the target is float too, so exclude it --
feature_train = ordinal_log.select_dtypes(include=['float64']).columns.drop("target")
feature_test = ordinal_log_test.select_dtypes(include=['float64']).columns

selector = SelectKBest(f_regression, k=5)
X_num_train = pd.DataFrame(
    selector.fit_transform(ordinal_log[feature_train], ordinal_log['target'])
)
X_num_test = pd.DataFrame(selector.transform(ordinal_log_test[feature_test]))

# Read the kept columns from the fitted selector's boolean mask. A "variance != 0"
# test on a zero-padded inverse_transform frame would drop any kept column that is
# constant and could pick different columns for train and test.
kept_mask = selector.get_support()
selected_columns_train = feature_train[kept_mask]
selected_columns_test = feature_test[kept_mask]
UNI_train_ordinal_log = ordinal_log[selected_columns_train]
UNI_test_ordinal_log = ordinal_log_test[selected_columns_test]

# merges: selected numericals + selected categoricals (columns 0..4) + id/target
UNI_train_ordinal_log = (
    UNI_train_ordinal_log
    .merge(X_cat_train, left_index=True, right_index=True)
    .merge(train_id, left_index=True, right_index=True)
)
UNI_test_ordinal_log = (
    UNI_test_ordinal_log
    .merge(X_cat_test, left_index=True, right_index=True)
    .merge(test_id, left_index=True, right_index=True)
)

In [None]:
# Persist the selected train/test feature sets (the index is written as the first column).
UNI_train_ordinal_log.to_csv(path_or_buf="UNI_train_ordinal_log.csv")
UNI_test_ordinal_log.to_csv(path_or_buf="UNI_test_ordinal_log.csv")

11. ordinal_log_NO

In [None]:
# Training frame: drop the leftover index column, normalise the target column name,
# keep target/id aside, then drop id.
ordinal_log_NO = (
    pd.read_csv("ordinal_log_NO.csv", index_col="Unnamed: 0")
    .drop(columns="Unnamed: 0.1")
    .rename(columns={'target_y': 'target'})
)
train_id = ordinal_log_NO.loc[:, ["target", "id"]]
ordinal_log_NO = ordinal_log_NO.drop(columns="id")

# Test frame: read from the shared "ordinal_log_test.csv" file.
ordinal_log_test_NO = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = ordinal_log_test_NO[["id"]]
ordinal_log_test_NO = ordinal_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on ordinal_log_NO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = ordinal_log_NO.select_dtypes(include=['int64']).columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(ordinal_log_NO[feature_train], ordinal_log_NO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=ordinal_log_NO.index)

feature_test = ordinal_log_test_NO.select_dtypes(include=['int64']).columns
X_cat_test = selector.transform(ordinal_log_test_NO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=ordinal_log_test_NO.index)


feature_train = ordinal_log_NO.select_dtypes(include=['float64']).columns.drop("target")

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(ordinal_log_NO[feature_train], ordinal_log_NO['target'])
X_num_train = pd.DataFrame(X_num_train, index=ordinal_log_NO.index)

feature_test = ordinal_log_test_NO.select_dtypes(include=['float64']).columns
X_num_test = selector.transform(ordinal_log_test_NO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=ordinal_log_test_NO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_ordinal_log_NO = ordinal_log_NO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_ordinal_log_NO = ordinal_log_test_NO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_ordinal_log_NO = UNI_train_ordinal_log_NO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_ordinal_log_NO = UNI_test_ordinal_log_NO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_ordinal_log_NO = pd.merge(UNI_train_ordinal_log_NO, train_id, left_index=True, right_index=True)
UNI_test_ordinal_log_NO = pd.merge(UNI_test_ordinal_log_NO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_ordinal_log_NO.to_csv(path_or_buf="UNI_train_ordinal_log_NO.csv")
UNI_test_ordinal_log_NO.to_csv(path_or_buf="UNI_test_ordinal_log_NO.csv")

12. ordinal_log_NO_TNO

In [None]:
# Training frame: drop the "Unnamed: 0.1" column and normalise the target column name.
ordinal_log_NO_TNO = (
    pd.read_csv("ordinal_log_NO_TNO.csv", index_col="Unnamed: 0")
    .drop(columns="Unnamed: 0.1")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside before id is removed from the feature frame.
train_id = ordinal_log_NO_TNO.loc[:, ["target", "id"]]
ordinal_log_NO_TNO = ordinal_log_NO_TNO.drop(columns="id")

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
ordinal_log_test_NO_TNO = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = ordinal_log_test_NO_TNO["id"].to_frame()
ordinal_log_test_NO_TNO = ordinal_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on ordinal_log_NO_TNO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = ordinal_log_NO_TNO.select_dtypes(include=['int64']).columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(ordinal_log_NO_TNO[feature_train], ordinal_log_NO_TNO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=ordinal_log_NO_TNO.index)

feature_test = ordinal_log_test_NO_TNO.select_dtypes(include=['int64']).columns
X_cat_test = selector.transform(ordinal_log_test_NO_TNO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=ordinal_log_test_NO_TNO.index)


feature_train = ordinal_log_NO_TNO.select_dtypes(include=['float64']).columns.drop("target")

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(ordinal_log_NO_TNO[feature_train], ordinal_log_NO_TNO['target'])
X_num_train = pd.DataFrame(X_num_train, index=ordinal_log_NO_TNO.index)

feature_test = ordinal_log_test_NO_TNO.select_dtypes(include=['float64']).columns
X_num_test = selector.transform(ordinal_log_test_NO_TNO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=ordinal_log_test_NO_TNO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_ordinal_log_NO_TNO = ordinal_log_NO_TNO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_ordinal_log_NO_TNO = ordinal_log_test_NO_TNO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_ordinal_log_NO_TNO = UNI_train_ordinal_log_NO_TNO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_ordinal_log_NO_TNO = UNI_test_ordinal_log_NO_TNO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_ordinal_log_NO_TNO = pd.merge(UNI_train_ordinal_log_NO_TNO, train_id, left_index=True, right_index=True)
UNI_test_ordinal_log_NO_TNO = pd.merge(UNI_test_ordinal_log_NO_TNO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_ordinal_log_NO_TNO.to_csv(path_or_buf="UNI_train_ordinal_log_NO_TNO.csv")
UNI_test_ordinal_log_NO_TNO.to_csv(path_or_buf="UNI_test_ordinal_log_NO_TNO.csv")

13. freq_num

In [None]:
# Training frame: normalise the target column name.
freq_num = (
    pd.read_csv("freq_num.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside before id is removed from the feature frame.
train_id = freq_num.loc[:, ["target", "id"]]
freq_num = freq_num.drop(columns="id")

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_num_test = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test["id"].to_frame()
freq_num_test = freq_num_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_num: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_num.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_num[feature_train], freq_num['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_num.index)

feature_test = freq_num_test.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_num_test[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_num_test.index)


feature_train = freq_num.loc[:, "cont0":"cont12"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_num[feature_train], freq_num['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_num.index)

feature_test = freq_num_test.loc[:, "cont0":"cont12"].columns
X_num_test = selector.transform(freq_num_test[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_num_test.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_num = freq_num[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_num = freq_num_test[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_num = UNI_train_freq_num.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_num = UNI_test_freq_num.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_num = pd.merge(UNI_train_freq_num, train_id, left_index=True, right_index=True)
UNI_test_freq_num = pd.merge(UNI_test_freq_num, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_num.to_csv(path_or_buf="UNI_train_freq_num.csv")
UNI_test_freq_num.to_csv(path_or_buf="UNI_test_freq_num.csv")

14. freq_num_NO

In [None]:
# Training frame: normalise the target column name.
freq_num_NO = (
    pd.read_csv("freq_num_NO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside, then remove id and the "target_x" column.
train_id = freq_num_NO.loc[:, ["target", "id"]]
freq_num_NO = freq_num_NO.drop(columns=["target_x", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_num_test_NO = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test_NO["id"].to_frame()
freq_num_test_NO = freq_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_num_NO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_num_NO.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_num_NO[feature_train], freq_num_NO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_num_NO.index)

feature_test = freq_num_test_NO.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_num_test_NO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_num_test_NO.index)


feature_train = freq_num_NO.loc[:, "cont0":"cont12"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_num_NO[feature_train], freq_num_NO['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_num_NO.index)

feature_test = freq_num_test_NO.loc[:, "cont0":"cont12"].columns
X_num_test = selector.transform(freq_num_test_NO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_num_test_NO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_num_NO = freq_num_NO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_num_NO = freq_num_test_NO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_num_NO = UNI_train_freq_num_NO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_num_NO = UNI_test_freq_num_NO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_num_NO = pd.merge(UNI_train_freq_num_NO, train_id, left_index=True, right_index=True)
UNI_test_freq_num_NO = pd.merge(UNI_test_freq_num_NO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_num_NO.to_csv(path_or_buf="UNI_train_freq_num_NO.csv")
UNI_test_freq_num_NO.to_csv(path_or_buf="UNI_test_freq_num_NO.csv")

15. freq_num_NO_TNO

In [None]:
# Training frame: normalise the target column name.
freq_num_NO_TNO = (
    pd.read_csv("freq_num_NO_TNO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside, then remove id and the "target_x" column.
train_id = freq_num_NO_TNO.loc[:, ["target", "id"]]
freq_num_NO_TNO = freq_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_num_test_NO_TNO = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test_NO_TNO["id"].to_frame()
freq_num_test_NO_TNO = freq_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_num_NO_TNO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_num_NO_TNO.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_num_NO_TNO[feature_train], freq_num_NO_TNO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_num_NO_TNO.index)

feature_test = freq_num_test_NO_TNO.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_num_test_NO_TNO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_num_test_NO_TNO.index)


feature_train = freq_num_NO_TNO.loc[:, "cont0":"cont12"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_num_NO_TNO[feature_train], freq_num_NO_TNO['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_num_NO_TNO.index)

feature_test = freq_num_test_NO_TNO.loc[:, "cont0":"cont12"].columns
X_num_test = selector.transform(freq_num_test_NO_TNO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_num_test_NO_TNO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_num_NO_TNO = freq_num_NO_TNO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_num_NO_TNO = freq_num_test_NO_TNO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_num_NO_TNO = UNI_train_freq_num_NO_TNO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_num_NO_TNO = UNI_test_freq_num_NO_TNO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_num_NO_TNO = pd.merge(UNI_train_freq_num_NO_TNO, train_id, left_index=True, right_index=True)
UNI_test_freq_num_NO_TNO = pd.merge(UNI_test_freq_num_NO_TNO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_num_NO_TNO.to_csv(path_or_buf="UNI_train_freq_num_NO_TNO.csv")
UNI_test_freq_num_NO_TNO.to_csv(path_or_buf="UNI_test_freq_num_NO_TNO.csv")

16. freq_log

In [None]:
# Training frame: normalise the target column name.
freq_log = (
    pd.read_csv("freq_log.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside, then remove id and the "Unnamed: 0.1" column.
train_id = freq_log.loc[:, ["target", "id"]]
freq_log = freq_log.drop(columns=["id", "Unnamed: 0.1"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_log_test = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test["id"].to_frame()
freq_log_test = freq_log_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_log: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_log.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_log[feature_train], freq_log['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_log.index)

feature_test = freq_log_test.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_log_test[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_log_test.index)


feature_train = freq_log.loc[:, "cont0_log":"cont12_log"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_log[feature_train], freq_log['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_log.index)

feature_test = freq_log_test.loc[:, "cont0_log":"cont12_log"].columns
X_num_test = selector.transform(freq_log_test[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_log_test.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_log = freq_log[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_log = freq_log_test[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_log = UNI_train_freq_log.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_log = UNI_test_freq_log.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_log = pd.merge(UNI_train_freq_log, train_id, left_index=True, right_index=True)
UNI_test_freq_log = pd.merge(UNI_test_freq_log, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_log.to_csv(path_or_buf="UNI_train_freq_log.csv")
UNI_test_freq_log.to_csv(path_or_buf="UNI_test_freq_log.csv")

17. freq_log_NO

In [None]:
# Training frame: normalise the target column name.
freq_log_NO = (
    pd.read_csv("freq_log_NO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside, then remove id and the "Unnamed: 0.1" column.
train_id = freq_log_NO.loc[:, ["target", "id"]]
freq_log_NO = freq_log_NO.drop(columns=["id", "Unnamed: 0.1"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_log_test_NO = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test_NO["id"].to_frame()
freq_log_test_NO = freq_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_log_NO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_log_NO.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_log_NO[feature_train], freq_log_NO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_log_NO.index)

feature_test = freq_log_test_NO.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_log_test_NO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_log_test_NO.index)


feature_train = freq_log_NO.loc[:, "cont0_log":"cont12_log"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_log_NO[feature_train], freq_log_NO['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_log_NO.index)

feature_test = freq_log_test_NO.loc[:, "cont0_log":"cont12_log"].columns
X_num_test = selector.transform(freq_log_test_NO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_log_test_NO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_log_NO = freq_log_NO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_log_NO = freq_log_test_NO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_log_NO = UNI_train_freq_log_NO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_log_NO = UNI_test_freq_log_NO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_log_NO = pd.merge(UNI_train_freq_log_NO, train_id, left_index=True, right_index=True)
UNI_test_freq_log_NO = pd.merge(UNI_test_freq_log_NO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_log_NO.to_csv(path_or_buf="UNI_train_freq_log_NO.csv")
UNI_test_freq_log_NO.to_csv(path_or_buf="UNI_test_freq_log_NO.csv")

18. freq_log_NO_TNO

In [None]:
# Training frame: normalise the target column name.
freq_log_NO_TNO = (
    pd.read_csv("freq_log_NO_TNO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set target/id aside, then remove the "Unnamed: 0.1" column and id.
train_id = freq_log_NO_TNO.loc[:, ["target", "id"]]
freq_log_NO_TNO = freq_log_NO_TNO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
freq_log_test_NO_TNO = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test_NO_TNO["id"].to_frame()
freq_log_test_NO_TNO = freq_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Univariate feature selection on freq_log_NO_TNO: keep the 5 best categorical
# columns (mutual information) and the 5 best numerical columns (F-test).
# Both selectors are fitted on the training frame and reused on the test frame.

feature_train = freq_log_NO_TNO.loc[:, "cat0":"cat9"].columns

# Keep 5 categorical features.
# mutual_info_regression is randomised, so random_state is pinned to make the
# selected columns reproducible between runs.
selector = SelectKBest(lambda X, y: mutual_info_regression(X, y, random_state=0), k=5)
X_cat_train = selector.fit_transform(freq_log_NO_TNO[feature_train], freq_log_NO_TNO['target'])
# Carry the source index over: a default RangeIndex would make the index-based
# merges below drop or misalign rows whenever the source index is not 0..n-1.
# Column labels stay 0..4 so the saved CSV layout is unchanged.
X_cat_train = pd.DataFrame(X_cat_train, index=freq_log_NO_TNO.index)

feature_test = freq_log_test_NO_TNO.loc[:, "cat0":"cat9"].columns
X_cat_test = selector.transform(freq_log_test_NO_TNO[feature_test])
X_cat_test = pd.DataFrame(X_cat_test, index=freq_log_test_NO_TNO.index)


feature_train = freq_log_NO_TNO.loc[:, "cont0_log":"cont12_log"].columns

# Keep 5 numerical features
selector = SelectKBest(f_regression, k=5)
X_num_train = selector.fit_transform(freq_log_NO_TNO[feature_train], freq_log_NO_TNO['target'])
X_num_train = pd.DataFrame(X_num_train, index=freq_log_NO_TNO.index)

feature_test = freq_log_test_NO_TNO.loc[:, "cont0_log":"cont12_log"].columns
X_num_test = selector.transform(freq_log_test_NO_TNO[feature_test])
X_num_test = pd.DataFrame(X_num_test, index=freq_log_test_NO_TNO.index)


# Read the kept columns from the fitted selector's boolean mask instead of
# inverse-transforming and testing for zero variance.
selected_columns_train = feature_train[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_train_freq_log_NO_TNO = freq_log_NO_TNO[selected_columns_train]

selected_columns_test = feature_test[selector.get_support()]
# Get the valid dataset with the selected features.
UNI_test_freq_log_NO_TNO = freq_log_test_NO_TNO[selected_columns_test]

# merges (index-aligned): numerical + categorical selections, then target/id
UNI_train_freq_log_NO_TNO = UNI_train_freq_log_NO_TNO.merge(X_cat_train, left_index=True, right_index=True)
UNI_test_freq_log_NO_TNO = UNI_test_freq_log_NO_TNO.merge(X_cat_test, left_index=True, right_index=True)
UNI_train_freq_log_NO_TNO = pd.merge(UNI_train_freq_log_NO_TNO, train_id, left_index=True, right_index=True)
UNI_test_freq_log_NO_TNO = pd.merge(UNI_test_freq_log_NO_TNO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the selected train/test features (the index is written as the first CSV column).
UNI_train_freq_log_NO_TNO.to_csv(path_or_buf="UNI_train_freq_log_NO_TNO.csv")
UNI_test_freq_log_NO_TNO.to_csv(path_or_buf="UNI_test_freq_log_NO_TNO.csv")

# Principal Component Analysis (PCA) Feature Selection

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

1. binary_num

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the binary-encoded training frame, using the "Unnamed: 0" column as index.
binary_num = pd.read_csv(filepath_or_buffer="binary_num.csv", index_col="Unnamed: 0")
binary_num

In [None]:
# Load the binary-encoded test frame and drop the "Unnamed: 0_*" columns.
binary_num_test = (
    pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
    .drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"])
)
binary_num_test

In [None]:
# Feature matrices: every column from "cat0_0" through "cont12" (label slice, inclusive).
X_train, X_test = (
    frame.loc[:, "cat0_0":"cont12"] for frame in (binary_num, binary_num_test)
)

In [None]:
# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a 10-component PCA on the scaled training features.
# random_state pins the randomized SVD solver that PCA may choose for large
# inputs, so the fitted components are reproducible between runs.
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

In [None]:
# Project the training features onto the fitted components.
PCA_train_binary_num = pca.transform(X_train)
# Re-attach the source index: pca.transform returns a bare array, and a default
# RangeIndex would make a later index-based merge with binary_num columns drop
# or misalign rows whenever binary_num's index is not 0..n-1.
PCA_train_binary_num = pd.DataFrame(PCA_train_binary_num, index=binary_num.index)
PCA_train_binary_num

In [None]:
# Project the test features onto the components fitted on the training data.
PCA_test_binary_num = pca.transform(X_test)
# Re-attach the source index: pca.transform returns a bare array, and a default
# RangeIndex would make a later index-based merge with binary_num_test columns
# drop or misalign rows whenever binary_num_test's index is not 0..n-1.
PCA_test_binary_num = pd.DataFrame(PCA_test_binary_num, index=binary_num_test.index)
PCA_test_binary_num

In [None]:
# Columns to re-attach after PCA: id + target for train, id only for test.
target = binary_num.loc[:, ["id", "target"]]
i_d = binary_num_test["id"].to_frame()

In [None]:
# Attach id/target to the training components by joining on the row index.
PCA_train_binary_num = PCA_train_binary_num.merge(target, left_index=True, right_index=True)
PCA_train_binary_num

In [None]:
# Attach id to the test components by joining on the row index.
PCA_test_binary_num = PCA_test_binary_num.merge(i_d, left_index=True, right_index=True)
PCA_test_binary_num

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_num.to_csv(path_or_buf="PCA_train_binary_num.csv")
PCA_test_binary_num.to_csv(path_or_buf="PCA_test_binary_num.csv")

2. binary_num_NO

In [None]:
# Training frame: normalise the target column name.
binary_num_NO = (
    pd.read_csv("binary_num_NO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set id/target aside, then remove id and the "target_x" column.
train_id = binary_num_NO.loc[:, ["id", "target"]]
binary_num_NO = binary_num_NO.drop(columns=["target_x", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
binary_num_test_NO = pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
test_id = binary_num_test_NO["id"].to_frame()
binary_num_test_NO = binary_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# PCA on binary_num_NO: standardise the feature columns, project them onto 10
# principal components fitted on the training data, then re-attach id/target.
X_train = binary_num_NO.loc[:, "cat0_0":"cont12"]
X_test = binary_num_test_NO.loc[:, "cat0_0":"cont12"]

# scale data (statistics come from the training frame only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply PCA; random_state pins the randomized SVD solver that PCA may choose
# for large inputs, so the components are reproducible between runs
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

# Re-attach the source index: the transforms return bare arrays, and a default
# RangeIndex would make the index-based merges below drop or misalign rows
# whenever the source index is not 0..n-1.
PCA_train_binary_num_NO = pca.transform(X_train)
PCA_train_binary_num_NO = pd.DataFrame(PCA_train_binary_num_NO, index=binary_num_NO.index)

PCA_test_binary_num_NO = pca.transform(X_test)
PCA_test_binary_num_NO = pd.DataFrame(PCA_test_binary_num_NO, index=binary_num_test_NO.index)

# merge dataframes
PCA_train_binary_num_NO = pd.merge(PCA_train_binary_num_NO, train_id, left_index=True, right_index=True)
PCA_test_binary_num_NO = pd.merge(PCA_test_binary_num_NO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_num_NO.to_csv(path_or_buf="PCA_train_binary_num_NO.csv")
PCA_test_binary_num_NO.to_csv(path_or_buf="PCA_test_binary_num_NO.csv")

3. binary_num_NO_TNO

In [None]:
# Training frame: normalise the target column name.
binary_num_NO_TNO = (
    pd.read_csv("binary_num_NO_TNO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set id/target aside, then remove id and the "target_x" column.
train_id = binary_num_NO_TNO.loc[:, ["id", "target"]]
binary_num_NO_TNO = binary_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
binary_num_test_NO_TNO = pd.read_csv("binary_num_test.csv", index_col="Unnamed: 0")
test_id = binary_num_test_NO_TNO["id"].to_frame()
binary_num_test_NO_TNO = binary_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# PCA on binary_num_NO_TNO: standardise the feature columns, project them onto 10
# principal components fitted on the training data, then re-attach id/target.
X_train = binary_num_NO_TNO.loc[:, "cat0_0":"cont12"]
X_test = binary_num_test_NO_TNO.loc[:, "cat0_0":"cont12"]

# scale data (statistics come from the training frame only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply PCA; random_state pins the randomized SVD solver that PCA may choose
# for large inputs, so the components are reproducible between runs
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

# Re-attach the source index: the transforms return bare arrays, and a default
# RangeIndex would make the index-based merges below drop or misalign rows
# whenever the source index is not 0..n-1.
PCA_train_binary_num_NO_TNO = pca.transform(X_train)
PCA_train_binary_num_NO_TNO = pd.DataFrame(PCA_train_binary_num_NO_TNO, index=binary_num_NO_TNO.index)

PCA_test_binary_num_NO_TNO = pca.transform(X_test)
PCA_test_binary_num_NO_TNO = pd.DataFrame(PCA_test_binary_num_NO_TNO, index=binary_num_test_NO_TNO.index)

# merge dataframes
PCA_train_binary_num_NO_TNO = pd.merge(PCA_train_binary_num_NO_TNO, train_id, left_index=True, right_index=True)
PCA_test_binary_num_NO_TNO = pd.merge(PCA_test_binary_num_NO_TNO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_num_NO_TNO.to_csv(path_or_buf="PCA_train_binary_num_NO_TNO.csv")
PCA_test_binary_num_NO_TNO.to_csv(path_or_buf="PCA_test_binary_num_NO_TNO.csv")

4. binary_log

In [None]:
# Training frame: set id/target aside, then remove the "Unnamed: 0.1" column and id.
binary_log = pd.read_csv(filepath_or_buffer="binary_log.csv", index_col="Unnamed: 0")
train_id = binary_log.loc[:, ["id", "target"]]
binary_log = binary_log.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
binary_log_test = pd.read_csv(filepath_or_buffer="binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test["id"].to_frame()
binary_log_test = binary_log_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# PCA on binary_log: standardise the feature columns, project them onto 10
# principal components fitted on the training data, then re-attach id/target.
X_train = binary_log.loc[:, "cat0_0":"cont12_log"]
X_test = binary_log_test.loc[:, "cat0_0":"cont12_log"]

# scale data (statistics come from the training frame only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply PCA; random_state pins the randomized SVD solver that PCA may choose
# for large inputs, so the components are reproducible between runs
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

# Re-attach the source index: the transforms return bare arrays, and a default
# RangeIndex would make the index-based merges below drop or misalign rows
# whenever the source index is not 0..n-1.
PCA_train_binary_log = pca.transform(X_train)
PCA_train_binary_log = pd.DataFrame(PCA_train_binary_log, index=binary_log.index)

PCA_test_binary_log = pca.transform(X_test)
PCA_test_binary_log = pd.DataFrame(PCA_test_binary_log, index=binary_log_test.index)

# merge dataframes
PCA_train_binary_log = pd.merge(PCA_train_binary_log, train_id, left_index=True, right_index=True)
PCA_test_binary_log = pd.merge(PCA_test_binary_log, test_id, left_index=True, right_index=True)

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_log.to_csv(path_or_buf="PCA_train_binary_log.csv")
PCA_test_binary_log.to_csv(path_or_buf="PCA_test_binary_log.csv")

5. binary_log_NO

In [None]:
# Training frame: normalise the target column name.
binary_log_NO = (
    pd.read_csv("binary_log_NO.csv", index_col="Unnamed: 0")
    .rename(columns={"target_y": "target"})
)
# Set id/target aside, then remove the "Unnamed: 0.1" column and id.
train_id = binary_log_NO.loc[:, ["id", "target"]]
binary_log_NO = binary_log_NO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
binary_log_test_NO = pd.read_csv("binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test_NO["id"].to_frame()
binary_log_test_NO = binary_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# PCA on binary_log_NO: standardise the feature columns, project them onto 10
# principal components fitted on the training data, then re-attach id/target.
X_train = binary_log_NO.loc[:, "cat0_0":"cont12_log"]
X_test = binary_log_test_NO.loc[:, "cat0_0":"cont12_log"]

# scale data (statistics come from the training frame only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply PCA; random_state pins the randomized SVD solver that PCA may choose
# for large inputs, so the components are reproducible between runs
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

# Re-attach the source index: the transforms return bare arrays, and a default
# RangeIndex would make the index-based merges below drop or misalign rows
# whenever the source index is not 0..n-1.
PCA_train_binary_log_NO = pca.transform(X_train)
PCA_train_binary_log_NO = pd.DataFrame(PCA_train_binary_log_NO, index=binary_log_NO.index)

PCA_test_binary_log_NO = pca.transform(X_test)
PCA_test_binary_log_NO = pd.DataFrame(PCA_test_binary_log_NO, index=binary_log_test_NO.index)

# merge dataframes
PCA_train_binary_log_NO = pd.merge(PCA_train_binary_log_NO, train_id, left_index=True, right_index=True)
PCA_test_binary_log_NO = pd.merge(PCA_test_binary_log_NO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_log_NO.to_csv(path_or_buf="PCA_train_binary_log_NO.csv")
PCA_test_binary_log_NO.to_csv(path_or_buf="PCA_test_binary_log_NO.csv")

6. binary_log_NO_TNO

In [None]:
# Training frame: set id/target aside, then remove the "Unnamed: 0.1" column and id.
binary_log_NO_TNO = pd.read_csv(filepath_or_buffer="binary_log_NO_TNO.csv", index_col="Unnamed: 0")
train_id = binary_log_NO_TNO.loc[:, ["id", "target"]]
binary_log_NO_TNO = binary_log_NO_TNO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids separately, then drop id and the "Unnamed: 0_*" columns.
binary_log_test_NO_TNO = pd.read_csv(filepath_or_buffer="binary_log_test.csv", index_col="Unnamed: 0")
test_id = binary_log_test_NO_TNO["id"].to_frame()
binary_log_test_NO_TNO = binary_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# PCA on binary_log_NO_TNO: standardise the feature columns, project them onto 10
# principal components fitted on the training data, then re-attach id/target.
X_train = binary_log_NO_TNO.loc[:, "cat0_0":"cont12_log"]
X_test = binary_log_test_NO_TNO.loc[:, "cat0_0":"cont12_log"]

# scale data (statistics come from the training frame only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply PCA; random_state pins the randomized SVD solver that PCA may choose
# for large inputs, so the components are reproducible between runs
pca = PCA(n_components=10, random_state=0)
pca.fit(X_train)

# Re-attach the source index: the transforms return bare arrays, and a default
# RangeIndex would make the index-based merges below drop or misalign rows
# whenever the source index is not 0..n-1.
PCA_train_binary_log_NO_TNO = pca.transform(X_train)
PCA_train_binary_log_NO_TNO = pd.DataFrame(PCA_train_binary_log_NO_TNO, index=binary_log_NO_TNO.index)

PCA_test_binary_log_NO_TNO = pca.transform(X_test)
PCA_test_binary_log_NO_TNO = pd.DataFrame(PCA_test_binary_log_NO_TNO, index=binary_log_test_NO_TNO.index)

# merge dataframes
PCA_train_binary_log_NO_TNO = pd.merge(PCA_train_binary_log_NO_TNO, train_id, left_index=True, right_index=True)
PCA_test_binary_log_NO_TNO = pd.merge(PCA_test_binary_log_NO_TNO, test_id, left_index=True, right_index=True)

In [None]:
# Persist the PCA train/test frames (the index is written as the first CSV column).
PCA_train_binary_log_NO_TNO.to_csv(path_or_buf="PCA_train_binary_log_NO_TNO.csv")
PCA_test_binary_log_NO_TNO.to_csv(path_or_buf="PCA_test_binary_log_NO_TNO.csv")

7. ordinal_num

In [None]:
# Training frame: normalise the label column name, keep id/target aside so
# they can be re-attached after PCA, then drop the id from the working frame.
ordinal_num = pd.read_csv("ordinal_num.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = ordinal_num.loc[:, ["id", "target"]]
ordinal_num = ordinal_num.drop(columns="id")

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_num_test = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test[["id"]]
ordinal_num_test = ordinal_num_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
X_train = ordinal_num.loc[:, "cat0":"cont12"]
X_test = ordinal_num_test.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_num = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_num = pd.DataFrame(pca.transform(X_test))

# Re-attach id/target by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_num = pd.concat(
    [PCA_train_ordinal_num, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_num = pd.concat(
    [PCA_test_ordinal_num, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_num variant.
PCA_train_ordinal_num.to_csv(path_or_buf="PCA_train_ordinal_num.csv")
PCA_test_ordinal_num.to_csv(path_or_buf="PCA_test_ordinal_num.csv")

8. ordinal_num_NO

In [None]:
# Training frame: normalise the label column name, keep id/target aside so
# they can be re-attached after PCA, then drop the duplicate label column
# ("target_x") and the id from the working frame.
ordinal_num_NO = pd.read_csv("ordinal_num_NO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = ordinal_num_NO.loc[:, ["id", "target"]]
ordinal_num_NO = ordinal_num_NO.drop(columns=["target_x", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_num_test_NO = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test_NO[["id"]]
ordinal_num_test_NO = ordinal_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
X_train = ordinal_num_NO.loc[:, "cat0":"cont12"]
X_test = ordinal_num_test_NO.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_num_NO = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_num_NO = pd.DataFrame(pca.transform(X_test))

# Re-attach id/target by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_num_NO = pd.concat(
    [PCA_train_ordinal_num_NO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_num_NO = pd.concat(
    [PCA_test_ordinal_num_NO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_num_NO variant.
PCA_train_ordinal_num_NO.to_csv(path_or_buf="PCA_train_ordinal_num_NO.csv")
PCA_test_ordinal_num_NO.to_csv(path_or_buf="PCA_test_ordinal_num_NO.csv")

9. ordinal_num_NO_TNO

In [None]:
# Training frame: normalise the label column name, keep id/target aside so
# they can be re-attached after PCA, then drop the duplicate label column
# ("target_x") and the id from the working frame.
ordinal_num_NO_TNO = pd.read_csv("ordinal_num_NO_TNO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = ordinal_num_NO_TNO.loc[:, ["id", "target"]]
ordinal_num_NO_TNO = ordinal_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_num_test_NO_TNO = pd.read_csv("ordinal_num_test.csv", index_col="Unnamed: 0")
test_id = ordinal_num_test_NO_TNO[["id"]]
ordinal_num_test_NO_TNO = ordinal_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
X_train = ordinal_num_NO_TNO.loc[:, "cat0":"cont12"]
X_test = ordinal_num_test_NO_TNO.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_num_NO_TNO = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_num_NO_TNO = pd.DataFrame(pca.transform(X_test))

# Re-attach id/target by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_num_NO_TNO = pd.concat(
    [PCA_train_ordinal_num_NO_TNO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_num_NO_TNO = pd.concat(
    [PCA_test_ordinal_num_NO_TNO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_num_NO_TNO variant.
PCA_train_ordinal_num_NO_TNO.to_csv(path_or_buf="PCA_train_ordinal_num_NO_TNO.csv")
PCA_test_ordinal_num_NO_TNO.to_csv(path_or_buf="PCA_test_ordinal_num_NO_TNO.csv")

10. ordinal_log

In [None]:
# Training frame: normalise the label column name, keep id/target aside so
# they can be re-attached after PCA, then drop the leftover index column and
# the id from the working frame.
ordinal_log = pd.read_csv("ordinal_log.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = ordinal_log.loc[:, ["id", "target"]]
ordinal_log = ordinal_log.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_log_test = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = ordinal_log_test[["id"]]
ordinal_log_test = ordinal_log_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = ordinal_log.loc[:, "cat0":"cont12_log"]
X_test = ordinal_log_test.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_log = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_log = pd.DataFrame(pca.transform(X_test))

# Re-attach id/target by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_log = pd.concat(
    [PCA_train_ordinal_log, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_log = pd.concat(
    [PCA_test_ordinal_log, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_log variant.
PCA_train_ordinal_log.to_csv(path_or_buf="PCA_train_ordinal_log.csv")
PCA_test_ordinal_log.to_csv(path_or_buf="PCA_test_ordinal_log.csv")

11. ordinal_log_NO

In [None]:
# Training frame: drop the leftover index column, normalise the label column
# name, keep target/id aside so they can be re-attached after PCA, then drop
# the id from the working frame.
ordinal_log_NO = (
    pd.read_csv("ordinal_log_NO.csv", index_col="Unnamed: 0")
    .drop(columns="Unnamed: 0.1")
    .rename(columns={"target_y": "target"})
)
train_id = ordinal_log_NO.loc[:, ["target", "id"]]
ordinal_log_NO = ordinal_log_NO.drop(columns="id")

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_log_test_NO = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = ordinal_log_test_NO[["id"]]
ordinal_log_test_NO = ordinal_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = ordinal_log_NO.loc[:, "cat0":"cont12_log"]
X_test = ordinal_log_test_NO.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_log_NO = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_log_NO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_log_NO = pd.concat(
    [PCA_train_ordinal_log_NO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_log_NO = pd.concat(
    [PCA_test_ordinal_log_NO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_log_NO variant.
PCA_train_ordinal_log_NO.to_csv(path_or_buf="PCA_train_ordinal_log_NO.csv")
PCA_test_ordinal_log_NO.to_csv(path_or_buf="PCA_test_ordinal_log_NO.csv")

12. ordinal_log_NO_TNO

In [None]:
# Training frame: drop the leftover index column, normalise the label column
# name, keep target/id aside so they can be re-attached after PCA, then drop
# the id from the working frame.
ordinal_log_NO_TNO = (
    pd.read_csv("ordinal_log_NO_TNO.csv", index_col="Unnamed: 0")
    .drop(columns="Unnamed: 0.1")
    .rename(columns={"target_y": "target"})
)
train_id = ordinal_log_NO_TNO.loc[:, ["target", "id"]]
ordinal_log_NO_TNO = ordinal_log_NO_TNO.drop(columns="id")

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
ordinal_log_test_NO_TNO = pd.read_csv("ordinal_log_test.csv", index_col="Unnamed: 0")
test_id = ordinal_log_test_NO_TNO[["id"]]
ordinal_log_test_NO_TNO = ordinal_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = ordinal_log_NO_TNO.loc[:, "cat0":"cont12_log"]
X_test = ordinal_log_test_NO_TNO.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_ordinal_log_NO_TNO = pd.DataFrame(pca.transform(X_train))
PCA_test_ordinal_log_NO_TNO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_ordinal_log_NO_TNO = pd.concat(
    [PCA_train_ordinal_log_NO_TNO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_ordinal_log_NO_TNO = pd.concat(
    [PCA_test_ordinal_log_NO_TNO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the ordinal_log_NO_TNO variant.
PCA_train_ordinal_log_NO_TNO.to_csv(path_or_buf="PCA_train_ordinal_log_NO_TNO.csv")
PCA_test_ordinal_log_NO_TNO.to_csv(path_or_buf="PCA_test_ordinal_log_NO_TNO.csv")

13. freq_num

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the id from the working frame.
freq_num = pd.read_csv("freq_num.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_num.loc[:, ["target", "id"]]
freq_num = freq_num.drop(columns="id")

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_num_test = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test[["id"]]
freq_num_test = freq_num_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
X_train = freq_num.loc[:, "cat0":"cont12"]
X_test = freq_num_test.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_num = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_num = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_num = pd.concat(
    [PCA_train_freq_num, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_num = pd.concat(
    [PCA_test_freq_num, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_num variant.
PCA_train_freq_num.to_csv(path_or_buf="PCA_train_freq_num.csv")
PCA_test_freq_num.to_csv(path_or_buf="PCA_test_freq_num.csv")

14. freq_num_NO

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the duplicate label column
# ("target_x") and the id from the working frame.
freq_num_NO = pd.read_csv("freq_num_NO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_num_NO.loc[:, ["target", "id"]]
freq_num_NO = freq_num_NO.drop(columns=["target_x", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_num_test_NO = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test_NO[["id"]]
freq_num_test_NO = freq_num_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
# X_test must come from the TEST frame; slicing it from freq_num_NO (the
# training frame) would write training rows out as the "test" components.
X_train = freq_num_NO.loc[:, "cat0":"cont12"]
X_test = freq_num_test_NO.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_num_NO = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_num_NO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_num_NO = pd.concat(
    [PCA_train_freq_num_NO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_num_NO = pd.concat(
    [PCA_test_freq_num_NO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_num_NO variant.
PCA_train_freq_num_NO.to_csv(path_or_buf="PCA_train_freq_num_NO.csv")
PCA_test_freq_num_NO.to_csv(path_or_buf="PCA_test_freq_num_NO.csv")

15. freq_num_NO_TNO

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the duplicate label column
# ("target_x") and the id from the working frame.
freq_num_NO_TNO = pd.read_csv("freq_num_NO_TNO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_num_NO_TNO.loc[:, ["target", "id"]]
freq_num_NO_TNO = freq_num_NO_TNO.drop(columns=["target_x", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_num_test_NO_TNO = pd.read_csv("freq_num_test.csv", index_col="Unnamed: 0")
test_id = freq_num_test_NO_TNO[["id"]]
freq_num_test_NO_TNO = freq_num_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12" (inclusive label slice).
# X_test must come from the TEST frame; slicing it from freq_num_NO_TNO (the
# training frame) would write training rows out as the "test" components.
X_train = freq_num_NO_TNO.loc[:, "cat0":"cont12"]
X_test = freq_num_test_NO_TNO.loc[:, "cat0":"cont12"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_num_NO_TNO = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_num_NO_TNO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_num_NO_TNO = pd.concat(
    [PCA_train_freq_num_NO_TNO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_num_NO_TNO = pd.concat(
    [PCA_test_freq_num_NO_TNO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_num_NO_TNO variant.
PCA_train_freq_num_NO_TNO.to_csv(path_or_buf="PCA_train_freq_num_NO_TNO.csv")
PCA_test_freq_num_NO_TNO.to_csv(path_or_buf="PCA_test_freq_num_NO_TNO.csv")

16. freq_log

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the id and the leftover index
# column from the working frame.
freq_log = pd.read_csv("freq_log.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_log.loc[:, ["target", "id"]]
freq_log = freq_log.drop(columns=["id", "Unnamed: 0.1"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_log_test = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test[["id"]]
freq_log_test = freq_log_test.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = freq_log.loc[:, "cat0":"cont12_log"]
X_test = freq_log_test.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_log = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_log = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_log = pd.concat(
    [PCA_train_freq_log, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_log = pd.concat(
    [PCA_test_freq_log, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_log variant.
PCA_train_freq_log.to_csv(path_or_buf="PCA_train_freq_log.csv")
PCA_test_freq_log.to_csv(path_or_buf="PCA_test_freq_log.csv")

17. freq_log_NO

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the id and the leftover index
# column from the working frame.
freq_log_NO = pd.read_csv("freq_log_NO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_log_NO.loc[:, ["target", "id"]]
freq_log_NO = freq_log_NO.drop(columns=["id", "Unnamed: 0.1"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_log_test_NO = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test_NO[["id"]]
freq_log_test_NO = freq_log_test_NO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = freq_log_NO.loc[:, "cat0":"cont12_log"]
X_test = freq_log_test_NO.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_log_NO = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_log_NO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_log_NO = pd.concat(
    [PCA_train_freq_log_NO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_log_NO = pd.concat(
    [PCA_test_freq_log_NO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_log_NO variant.
PCA_train_freq_log_NO.to_csv(path_or_buf="PCA_train_freq_log_NO.csv")
PCA_test_freq_log_NO.to_csv(path_or_buf="PCA_test_freq_log_NO.csv")

18. freq_log_NO_TNO

In [None]:
# Training frame: normalise the label column name, keep target/id aside so
# they can be re-attached after PCA, then drop the leftover index column and
# the id from the working frame.
freq_log_NO_TNO = pd.read_csv("freq_log_NO_TNO.csv", index_col="Unnamed: 0").rename(
    columns={"target_y": "target"}
)
train_id = freq_log_NO_TNO.loc[:, ["target", "id"]]
freq_log_NO_TNO = freq_log_NO_TNO.drop(columns=["Unnamed: 0.1", "id"])

# Test frame: keep the ids aside as a one-column DataFrame and drop the
# leftover index columns together with the id.
freq_log_test_NO_TNO = pd.read_csv("freq_log_test.csv", index_col="Unnamed: 0")
test_id = freq_log_test_NO_TNO[["id"]]
freq_log_test_NO_TNO = freq_log_test_NO_TNO.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "id"])

In [None]:
# Feature matrices: every column from "cat0" through "cont12_log" (inclusive label slice).
X_train = freq_log_NO_TNO.loc[:, "cat0":"cont12_log"]
X_test = freq_log_test_NO_TNO.loc[:, "cat0":"cont12_log"]

# Scale with statistics learned on the training rows only, then reuse them for test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 10-component PCA on the scaled training data and project both sets.
pca = PCA(n_components=10)
pca.fit(X_train)

PCA_train_freq_log_NO_TNO = pd.DataFrame(pca.transform(X_train))
PCA_test_freq_log_NO_TNO = pd.DataFrame(pca.transform(X_test))

# Re-attach target/id by row position. The projected frames carry a fresh
# 0..n-1 index while train_id/test_id keep the index read from the CSV, so an
# index-based merge would silently drop or mis-pair rows whenever that CSV
# index is not exactly 0..n-1.
PCA_train_freq_log_NO_TNO = pd.concat(
    [PCA_train_freq_log_NO_TNO, train_id.reset_index(drop=True)], axis=1
)
PCA_test_freq_log_NO_TNO = pd.concat(
    [PCA_test_freq_log_NO_TNO, test_id.reset_index(drop=True)], axis=1
)

In [None]:
# Persist the PCA-projected train/test frames for the freq_log_NO_TNO variant.
PCA_train_freq_log_NO_TNO.to_csv(path_or_buf="PCA_train_freq_log_NO_TNO.csv")
PCA_test_freq_log_NO_TNO.to_csv(path_or_buf="PCA_test_freq_log_NO_TNO.csv")

***

# Modelling

In [None]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor                    
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Lars
from sklearn.linear_model import LarsCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsIC
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import OrthogonalMatchingPursuitCV                        
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge                       
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.linear_model import GammaRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import enet_path
from sklearn.linear_model import lars_path
from sklearn.linear_model import lars_path_gram
from sklearn.linear_model import lasso_path
from sklearn.linear_model import orthogonal_mp
from sklearn.linear_model import orthogonal_mp_gram
from sklearn.linear_model import ridge_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.svm import SVR


In [None]:
from sklearn import pipeline 
from sklearn.metrics import mean_squared_error
from sklearn import model_selection # train_test_split
from IPython.display import clear_output

In [None]:
# Candidate regressors, keyed by display name. All use library defaults
# except XGBoost, which is limited to 10 trees with a fixed seed.
regressors = {
    # "reg:squarederror" is the current name of the squared-error objective;
    # the former alias "reg:linear" is deprecated.
    "xgboost": xgb.XGBRegressor(objective='reg:squarederror',
                  n_estimators=10, seed=123),
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "RidgeCV": RidgeCV(),
    "SGDRegressor": SGDRegressor(),
    "ElasticNet": ElasticNet(),
    "ElasticNetCV": ElasticNetCV(),
    "Lars": Lars(),
    "LarsCV": LarsCV(),
    "Lasso": Lasso(),
    "LassoCV": LassoCV(),
    "LassoLars": LassoLars(),
    "LassoLarsCV": LassoLarsCV(),
    "LassoLarsIC": LassoLarsIC(),
    "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
    "OrthogonalMatchingPursuitCV": OrthogonalMatchingPursuitCV(),
    "ARDRegression": ARDRegression(),
    "BayesianRidge": BayesianRidge(),
    "HuberRegressor": HuberRegressor(),
    "RANSACRegressor": RANSACRegressor(),
    "TheilSenRegressor": TheilSenRegressor(),
    "TweedieRegressor": TweedieRegressor(),
    "GammaRegressor": GammaRegressor(),
    "PassiveAggressiveRegressor": PassiveAggressiveRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "RadiusNeighborsRegressor": RadiusNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "ExtraTreeRegressor": ExtraTreeRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "BaggingRegressor": BaggingRegressor(),
    "ExtraTreesRegressor": ExtraTreesRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    # NOTE(review): IsotonicRegression is a single-feature model; confirm it
    # can be fitted on the multi-column feature matrix used below.
    "IsotonicRegression": IsotonicRegression(),
    "KernelRidge": KernelRidge(),
    "LinearSVR": LinearSVR(),
    "NuSVR": NuSVR(),
    "SVR": SVR(),
}

In [None]:
# Wrap every estimator in its own single-step sklearn Pipeline, keeping the
# same display-name keys. Re-running this cell wraps the pipelines again.
regressors = dict(
    (model_name, pipeline.make_pipeline(estimator))
    for model_name, estimator in regressors.items()
)

1. UNI_binary_num

In [None]:
# Features for train + validation: everything except the row identifier and
# the label. "target" has to be removed here as well; otherwise the label
# itself is fed to the models as a feature (target leakage) and the feature
# set no longer matches x_test.
x = UNI_train_binary_num.drop(columns=["id", "target"])
y = UNI_train_binary_num["target"]

# Features of the unseen test data; only the identifier is removed.
x_test = UNI_test_binary_num.drop(columns=['id'])

In [None]:
# Hold out 20% of the labelled rows for validation.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x, y,
    test_size=0.2,
    random_state=4  # Recommended for reproducibility
)

# Scores are collected as plain records and turned into a DataFrame in one
# go: DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
# frame on every iteration.
records = []
results = pd.DataFrame({'Model': [], 'RMSE': []})

for model_name, model in regressors.items():

    # Fits the pipeline stored in `regressors` in place.
    model.fit(x_train, y_train)

    pred = model.predict(x_val)

    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases deprecate and remove.
    records.append({"Model": model_name,
                    "RMSE": np.sqrt(mean_squared_error(y_val, pred))})
    results = pd.DataFrame(records, columns=["Model", "RMSE"])

    # Live leaderboard, refreshed after every model. Sorted by descending
    # RMSE, so the worst model is row 1 and the best model is the last row.
    results_ord = results.sort_values(by=['RMSE'], ascending=False, ignore_index=True)
    results_ord.index += 1

    clear_output(wait=True)
    display(results_ord.style.bar(subset=['RMSE'], vmin=0, vmax=100, color='#5fba7d'))

# Each of the full CSV combinations should be put through the same pipeline as shown above. This Kaggle notebook does not have enough memory to do that, so the work is continued in another notebook.

# **Best result with train validation: 0.854**
# **Best result with Kaggle test validation: 0.84186**