In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
# Use seaborn's dark grid background for every figure in this notebook.
sns.set_style('darkgrid')

In [None]:
# Location of the dielectron CSV inside the Kaggle input directory.
path = '/kaggle/input/cern-electron-collision-data/dielectron.csv'

# Read the whole file into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head()

### Exploratory Data Analysis

Check for missing values.

In [None]:
# Number of missing entries in each column.
df.isna().sum()

Since only 0.085% of the data is missing, let us drop those rows.

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

The features `[Run, Event]` are the run and event numbers, which do not contribute to the target variable and can be dropped.

In [None]:
# Remove the run/event number columns before modelling.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=['Run', 'Event'], errors='ignore')

A correlation matrix helps in understanding which features directly affect the target variable.

In [None]:
# Pairwise correlation between all remaining columns (target M included).
corr = df.corr()

# Annotated heatmap, one decimal place per cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=corr, annot=True, fmt='0.1f', ax=ax);

From the heatmap, it can be observed that the target variable `M` is most strongly correlated with `[E1, pt1, E2, pt2]`.<br>
The correlation matrix also shows that some features are correlated with each other. Let us visualize some of these correlations.

In [None]:
# 3x3 grid of scatter plots for selected feature pairs.
plt.figure(figsize=(10, 10), tight_layout=True)
r, c = 3, 3

# (x column, y column) for each panel, in row-major order.
# NOTE(review): 'px1 ' carries a trailing space — presumably matching the CSV
# header exactly; verify against the data before "fixing" it.
pairs = [
    ('E1', 'pt1'),
    ('px1 ', 'px2'),
    ('phi1', 'py1'),
    ('py1', 'py2'),
    ('pz1', 'pz2'),
    ('pt1', 'pz2'),
    ('E2', 'pt2'),
    ('pz2', 'eta2'),
    ('phi2', 'py2'),
]

for pos, (x_col, y_col) in enumerate(pairs, start=1):
    plt.subplot(r, c, pos)
    sns.scatterplot(x=df[x_col], y=df[y_col])

## Linear Regression

Since the target is a continuous (floating-point) value, this is a regression problem, so linear regression can be applied to the dataset.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Separate the feature matrix from the target column M.
target_col = 'M'
X = df.drop(columns=[target_col])
# Select as a one-column frame, then collapse it to a Series.
y = df[[target_col]].squeeze()

Before applying linear regression, let us check the data distribution

In [None]:
# Summary statistics per feature column, to compare their value ranges.
X.describe()

Scaling the data helps in increasing the performance of the regression model. Let us first split the data into train and test sets, so that the scaler is fit on the training data only.

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
xTr, xTs, yTr, yTs = train_test_split(X, y, test_size=0.25, random_state=42)

Applying minmax scaling

In [None]:
# Learn the per-feature min/max from the training set only, then apply the
# same transformation to both the training and the test features.
scaler = MinMaxScaler().fit(xTr)
xTr, xTs = scaler.transform(xTr), scaler.transform(xTs)

Fit a linear model and check the MSE and R2 for test data

In [None]:
# Fit ordinary linear regression on the scaled training data.
reg = LinearRegression().fit(xTr, yTr)

# Evaluate on the held-out test set.
yPred = reg.predict(xTs)
mse, r2 = mean_squared_error(yTs, yPred), r2_score(yTs, yPred)
print(f'MSE: {mse:0.2f}, R2: {r2:0.4f}')

A plot of actual vs. predicted values shows how well the model fits the data.

In [None]:
# Actual (x) vs. predicted (y) target values on the test set.
fig, ax = plt.subplots(figsize=(10, 8))

# Red diagonal = perfect prediction, spanning the observed target range.
xyMin, xyMax = yTs.min(), yTs.max()
diagonal = [xyMin, xyMax]
sns.lineplot(x=diagonal, y=diagonal, color='red', ax=ax)
sns.scatterplot(x=yTs, y=yPred, ax=ax);

The relationship between the features and the target is not well captured by a linear model: the MSE is large and R2 is small.

## Random forest

Since the data is not linear, to design a more complex model, random forest of decision trees can be applied.

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyper-parameters. random_state is fixed so that
# the bootstrapped trees, and therefore the reported metrics and feature
# importances, are reproducible across runs.
reg = RandomForestRegressor(random_state=42)
reg.fit(xTr, yTr)

# Evaluate on the held-out test set.
yPred = reg.predict(xTs)
print(f'MSE: {mean_squared_error(yTs, yPred):0.2f}, R2: {r2_score(yTs, yPred):0.4f}')

In [None]:
# Actual vs. predicted target on the test set for the current model.
plt.figure(figsize=(10, 8))
ax = plt.gca()

# Reference line y = x across the range of the true target values.
xyMin, xyMax = yTs.min(), yTs.max()
sns.lineplot(x=[xyMin, xyMax], y=[xyMin, xyMax], color='red', ax=ax)
sns.scatterplot(x=yTs, y=yPred, ax=ax);

Using random forest, the model performs better on test data with relatively small MSE and higher R2. Since the model performed well, let us understand which features are important in predicting the target variable and their ranking.

In [None]:
# Importance of each input feature according to the fitted forest, sorted
# from most to least important.
feature_importance = reg.feature_importances_
idx = np.argsort(-feature_importance)

# Label the bars with the feature columns the model was trained on (X), not
# with df.columns: df still contains the target M, so indexing df.columns only
# yields the right labels while M happens to be the last column.
feature_names = X.columns.to_numpy()
assert len(feature_names) == len(feature_importance), \
    'feature names and importances are out of sync'

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=feature_names[idx], y=feature_importance[idx], ax=ax)
ax.set(xlabel='Feature', ylabel='Importance');

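`[pt2, pt1, E2, E1, pz2, eta1, eta2, px2, py1, py2, px1, Event, phi2, phi1, Run, Q2, Q1]` is the order of importance of the features in predicting the target `M`.

**Note:** this list includes `Run` and `Event`, which were dropped earlier in the notebook, and omits `pz1`, so it appears to come from an earlier run. Re-run the cell above to obtain the current ranking.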

## Fully Connected Neural Network

Even though the random forest reduced the MSE significantly and predicts with relatively high accuracy, a neural network can also be built, since about 100k samples are available with only 16 features.

In [None]:
import tensorflow as tf

Building a fully connected NN model with 2 hidden layers: one with 32 units and another with 8 units. The model has 817 trainable parameters.

In [None]:
# Start from a clean Keras state so that re-running this cell does not
# accumulate layers/names from a previous run.
tf.keras.backend.clear_session()

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are reproducible across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: features -> 32 -> 8 -> 1 (linear output).
# The input width is declared with an explicit Input instead of the
# `input_shape=` layer argument, which newer Keras versions warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(xTr.shape[1],)),
    tf.keras.layers.Dense(units=32, activation='relu', name='FC1'),
    tf.keras.layers.Dense(units=8, activation='relu', name='FC2'),
    tf.keras.layers.Dense(units=1, name='Output'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.002),
    loss=tf.keras.losses.mean_squared_error,
)
# NOTE(review): `init` is not used anywhere in the visible notebook — confirm
# before removing it.
init = 0
model.summary()

Writing a custom callback function to log MSE of validation data every 10 epochs.

In [None]:
class LogMetrics(tf.keras.callbacks.Callback):
    """Print the validation loss once every ``every`` epochs.

    The value is read from the ``val_loss`` entry that Keras places in
    ``logs`` when ``fit`` is given validation data. Unlike a manual
    ``predict`` on the test set, this reports the loss of the actual
    validation split, keeps the test set untouched during training, and
    avoids an extra forward pass (and its progress-bar output) per report.

    Parameters
    ----------
    every : int, default 10
        Reporting interval in epochs; must be at least 1.

    Raises
    ------
    ValueError
        If ``every`` is smaller than 1.
    """

    def __init__(self, every=10):
        super().__init__()
        if every < 1:
            raise ValueError('every must be >= 1')
        self.every = every

    def on_epoch_end(self, epoch, logs=None):
        """Report ``val_loss`` at the end of every ``every``-th epoch."""
        # `epoch` is zero-based, so epoch + 1 is the human-readable number.
        if (epoch + 1) % self.every != 0:
            return
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            # No validation data was passed to fit(); nothing to report.
            return
        print(f'Epoch: {epoch+1}/{self.params["epochs"]} - val_loss: {val_loss:0.4f} ')

Passing early stopping callback to avoid overfitting.

In [None]:
# Carve a 10% validation split out of the training data; random_state is fixed
# so the same rows end up in the validation set on every run.
xtr, xval, ytr, yval = train_test_split(xTr, yTr, test_size=0.1, random_state=42)
epochs = 500

# Stop once val_loss has not improved by at least 0.1 for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch instead of
# keeping the weights from the last (already worse) one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    min_delta=0.1, patience=5, restore_best_weights=True)

hist = model.fit(xtr, ytr, validation_data=(xval, yval), epochs=epochs, verbose=0, batch_size=128,
                 callbacks=[LogMetrics(), early_stopping])

A graph of training loss vs. validation loss gives insight into whether the model is overfitting or underfitting.

In [None]:
# Training vs. validation loss per epoch.
# Epochs are indexed from 1 rather than 0: x = 0 has no position on a
# logarithmic axis, so the first epoch would otherwise not be drawn correctly.
history_df = pd.DataFrame(hist.history)
history_df.index = pd.RangeIndex(start=1, stop=len(history_df) + 1, name='epoch')

fig, ax = plt.subplots(figsize=(8, 4))
splot = sns.lineplot(data=history_df, ax=ax)
splot.set(xscale="log", xlabel="Epoch", ylabel="Loss (MSE)");

In [None]:
# Score the trained network on the held-out test set.
yPred = model.predict(xTs)
mse, r2 = mean_squared_error(yTs, yPred), r2_score(yTs, yPred)
print(f'MSE: {mse:0.2f}, R2: {r2:0.4f}')

In [None]:
# Actual vs. predicted target on the test set for the neural network.
plt.figure(figsize=(10, 8))
sns.set_style(style='darkgrid')
ax = plt.gca()

# Red diagonal marks perfect predictions over the observed target range.
xyMin, xyMax = yTs.min(), yTs.max()
bounds = [xyMin, xyMax]
sns.lineplot(x=bounds, y=bounds, color='red', ax=ax)
# yPred has a trailing singleton dimension; flatten it before plotting.
sns.scatterplot(x=yTs, y=yPred.squeeze(), ax=ax);