# Import Files and Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Input
from tensorflow.keras.optimizers import  SGD

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import RepeatedKFold, KFold
from keras.models import Sequential
from keras.layers import Dense, Input
from tensorflow.keras.initializers import  GlorotUniform, Constant, Zeros, GlorotNormal
from scikeras.wrappers import KerasClassifier, KerasRegressor
from tensorflow.keras.regularizers import l2
from sklearn.metrics import make_scorer
from tensorflow.keras.callbacks import EarlyStopping
from matplotlib import pyplot as plt

from sklearn.datasets import make_classification


# Locate the project root: split the current working directory into its
# components and keep everything up to (and including) the folder named
# 'Machine_Learning_project'.
dir_path = os.getcwd().split(os.path.sep)
# .index raises ValueError when the notebook is started outside the project tree.
root_index = dir_path.index('Machine_Learning_project')
root_path = os.path.sep.join(dir_path[:root_index + 1])

# Make the modules in <root>/code/data_loaders importable. The path is built
# with os.path.join so the separator matches the one used for root_path.
data_loaders_path = os.path.join(root_path, 'code', 'data_loaders')
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if data_loaders_path not in sys.path:
    sys.path.append(data_loaders_path)



In [2]:
from data_cup import *

In [3]:
# Load the 'Cup_tr' dataset through the project's data loader.
cup = CupDataset('Cup_tr')
df = cup.data

# The last three columns are the target variables; all columns before them
# are the inputs.
y = df.iloc[:, -3:]

# Remove the 'id' column from the inputs. drop() without inplace returns a new
# frame, so the slice taken from df is never mutated (mutating a slice in place
# is what triggers pandas' SettingWithCopyWarning) and the cell is safe to re-run.
x = df.iloc[:, :-3].drop(columns=['id'])



# Data analysis and visualization

In [None]:
# Switch from DataFrames to plain NumPy arrays: inputs first, then targets.
X_values = x.values
y_values = y.values

# Report how many rows the input array has and how many entries the first
# target row has (i.e. the number of target columns).
print(f"{len(X_values)} number of samples in input.")
print(f"{len(y_values[0])} numbers of target variables")


 #### Histograms of the frequencies of the input variables are displayed

In [None]:
import matplotlib.pyplot as plt

# Count the features (columns) available in X_values.
num_features = X_values.shape[1]

# Compute the subplot grid: at most 4 histograms per row.
num_rows = (num_features + 3) // 4
num_cols = min(num_features, 4)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row or column, so the flattened indexing below always works.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(16, 12), squeeze=False)
fig.subplots_adjust(top=0.9, hspace=0.4)  # leave room above the subplots and between rows
n_bins = 20  # number of bins for each histogram

# Row-major view of the grid: position i holds the histogram of feature i.
all_axes = axs.ravel()

for i in range(num_features):
    ax = all_axes[i]
    # Use a dedicated name here: `x` is the input DataFrame created when the
    # data was loaded, and rebinding it would break re-running earlier cells.
    feature_values = X_values[:, i]
    ax.hist(feature_values, bins=n_bins, density=True, edgecolor='black', alpha=0.7)
    ax.set_title(f"Feature {i}", fontsize=12)
    ax.set_xlabel("Value", fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.tick_params(labelsize=8)
    ax.grid(True, linestyle='--', alpha=0.7)

# Remove the trailing grid cells that did not receive a histogram.
for unused_ax in all_axes[num_features:]:
    fig.delaxes(unused_ax)

plt.suptitle("Distribution of Features", fontsize=16, y=0.98)
plt.tight_layout(rect=[0, 0, 1, 1])
plt.show()


##### The input features lie in the range -1 to 1, with peaks near the extremes of the range. Our intuition is that a nonlinear transformation such as the inverse hyperbolic tangent (arctanh), which maps the open interval (-1, 1) onto (-inf, +inf), could spread out these peaks and improve the feature distributions.



In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Apply the inverse hyperbolic tangent element-wise to the raw inputs.
# NOTE(review): arctanh is infinite at exactly +/-1 and NaN outside [-1, 1];
# this assumes every input lies strictly inside (-1, 1) - confirm on new data.
X_transform = np.arctanh(X_values)

# The figure uses a fixed 3x4 grid, so it can show at most 12 features.
n_transformed = X_transform.shape[1]
assert n_transformed <= 12, "the 3x4 grid can display at most 12 features"

fig, axs = plt.subplots(3, 4, figsize=(16, 12), tight_layout=True)
fig.subplots_adjust(top=0.9, hspace=0.4)
n_bins = 40

used_axes = []
for i in range(n_transformed):
    # Layout: features 0-7 alternate between the first two rows (even index ->
    # row 0, odd index -> row 1, column i // 2); features 8+ fill the third row.
    row, col = (i % 2, i // 2) if i < 8 else (2, i - 8)
    ax = axs[row, col]
    # Use a dedicated name here: `x` is the input DataFrame created when the
    # data was loaded, and rebinding it would break re-running earlier cells.
    feature_values = X_transform[:, i]
    ax.hist(feature_values, bins=n_bins, density=True, edgecolor='black', alpha=0.7)
    ax.set_title(f"Feature {i}", fontsize=12)
    ax.set_xlabel("Value", fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.tick_params(labelsize=8)
    ax.grid(True, linestyle='--', alpha=0.7)
    used_axes.append(ax)

# Give every histogram the same y-range so the panels are directly comparable.
# Only axes that actually hold a histogram are considered: an empty axes
# reports matplotlib's default (0, 1) limits, which would otherwise force the
# shared upper limit to be at least 1 regardless of the data.
min_ylim = min(ax.get_ylim()[0] for ax in used_axes)
max_ylim = max(ax.get_ylim()[1] for ax in used_axes)
for ax in used_axes:
    ax.set_ylim(min_ylim, max_ylim)

# Remove the grid cells that did not receive a histogram.
for ax in axs.flat:
    if ax not in used_axes:
        fig.delaxes(ax)

plt.suptitle("Distribution of Transformed Features", fontsize=16, y=0.98)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


Histograms of the raw data after a degree-2 `PolynomialFeatures` expansion.

In [None]:
# Degree-2 polynomial expansion of the raw inputs. include_bias=False already
# omits the constant column, so every returned column is a real feature and
# none must be sliced away (slicing [:, 1:] here would drop the first input).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_values)

n_features = X_train_poly.shape[1]

# Subplot grid: at most 4 histograms per row.
n_rows = (n_features + 3) // 4
n_cols = min(n_features, 4)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row or column, so the flattened indexing below always works.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 4*n_rows), tight_layout=True, squeeze=False)
fig.subplots_adjust(top=0.95, hspace=0.4)
n_bins = 40

# Row-major view of the grid: position i holds the histogram of column i.
all_axes = axs.ravel()
used_axes = all_axes[:n_features]

for i, ax in enumerate(used_axes):
    ax.hist(X_train_poly[:, i], bins=n_bins, density=True, edgecolor='black', alpha=0.7)
    ax.set_title(f"Feature {i+1}", fontsize=12)  # panels are numbered from 1
    ax.set_xlabel("Value", fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.tick_params(labelsize=8)
    ax.grid(True, linestyle='--', alpha=0.7)

# Remove the trailing grid cells that did not receive a histogram.
for unused_ax in all_axes[n_features:]:
    fig.delaxes(unused_ax)

# Give every histogram the same y-range so the panels are directly comparable.
# Only the populated axes are considered: an empty axes reports matplotlib's
# default (0, 1) limits, which would otherwise force the shared upper limit to
# be at least 1 regardless of the data.
min_ylim = min(ax.get_ylim()[0] for ax in used_axes)
max_ylim = max(ax.get_ylim()[1] for ax in used_axes)
for ax in used_axes:
    ax.set_ylim(min_ylim, max_ylim)

plt.suptitle("Distribution of Polynomial Features", fontsize=16, y=1)

plt.show()


The feature distributions after the PolynomialFeatures (PF) transformation show a concentration of frequencies near the limits of the [-1, 1] range. This behavior is attributable to the nature of the original features: the PF transformation generates new features by combining the original ones in polynomial terms, so it retains the distributional characteristics of the initial data.

Finally, we plot the result of applying the polynomial expansion and the arctanh transformation together, so that the effect of the combined transformation can be compared visually with that of the polynomial expansion alone.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the raw inputs followed by an element-wise
# inverse hyperbolic tangent. include_bias=False leaves out the constant
# column directly, which yields the same columns as expanding with the bias
# and then slicing it off with [:, 1:].
# NOTE(review): arctanh is infinite at exactly +/-1 and NaN outside [-1, 1];
# this assumes every expanded value lies strictly inside (-1, 1) - confirm on new data.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = np.arctanh(poly.fit_transform(X_values))

n_features = X_train_poly.shape[1]

# Subplot grid: at most 4 histograms per row.
n_rows = (n_features + 3) // 4
n_cols = min(n_features, 4)

# squeeze=False keeps `axs` two-dimensional even when the grid has a single
# row or column, so the flattened indexing below always works.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 4*n_rows), tight_layout=True, squeeze=False)
fig.subplots_adjust(top=0.95, hspace=0.4)
n_bins = 40

# Row-major view of the grid: position i holds the histogram of column i.
all_axes = axs.ravel()
used_axes = all_axes[:n_features]

for i, ax in enumerate(used_axes):
    ax.hist(X_train_poly[:, i], bins=n_bins, density=True, edgecolor='black', alpha=0.7)
    ax.set_title(f"Feature {i+1}", fontsize=12)  # panels are numbered from 1
    ax.set_xlabel("Value", fontsize=10)
    ax.set_ylabel("Density", fontsize=10)
    ax.tick_params(labelsize=8)
    ax.grid(True, linestyle='--', alpha=0.7)

# Remove the trailing grid cells that did not receive a histogram.
for unused_ax in all_axes[n_features:]:
    fig.delaxes(unused_ax)

# Give every histogram the same y-range so the panels are directly comparable.
# Only the populated axes are considered: an empty axes reports matplotlib's
# default (0, 1) limits, which would otherwise force the shared upper limit to
# be at least 1 regardless of the data.
min_ylim = min(ax.get_ylim()[0] for ax in used_axes)
max_ylim = max(ax.get_ylim()[1] for ax in used_axes)
for ax in used_axes:
    ax.set_ylim(min_ylim, max_ylim)

# The title names both transformations so this figure is not confused with
# the plain polynomial-feature figure.
plt.suptitle("Distribution of Arctanh-Transformed Polynomial Features", fontsize=16, y=1)

plt.show()


### Conclusion
Analyzing the last graphs, in which we applied the PolynomialFeatures transformation in combination with the inverse hyperbolic tangent (arctanh) transformation, some significant effects on the distribution of the variables can be observed.

In particular, the joint application of these two transformations resulted in a more effective mitigation of the high frequencies present at the extremes of the interval, compared with the use of the PolynomialFeatures transformation alone. This result suggests that the combination of the two transformations has a positive impact on the normalization of the distribution of the variables.