# Predicting the invariant mass of electron pairs

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from tensorflow.keras import models, layers


def RMSE(mse):
    """Convert a mean squared error into a root mean squared error.

    Parameters
    ----------
    mse : float or array-like
        Mean squared error value(s), e.g. the result of
        ``sklearn.metrics.mean_squared_error``.

    Returns
    -------
    float or numpy.ndarray
        The element-wise square root of ``mse``.
    """
    # RMSE is the square *root* of the MSE. The previous implementation
    # squared the value, so it reported MSE**2 instead of the RMSE.
    return np.sqrt(mse)

## Data

In [None]:
# Read the dielectron CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/cern-electron-collision-data/dielectron.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the loaded frame (column dtypes and non-null counts) to spot missing values.
df.info()

In [None]:
# Keep the rows whose mass column M is missing; they are predicted at the end of the notebook.
missing_mass = df['M'].isna()
df_to_predict = df.loc[missing_mass]

## Visualization

In [None]:
# One scatter panel per pair of per-electron quantities, coloured by M.
# The colour normalisation is shared so all four panels use the same scale.
mass_range = (df.M.min(), df.M.max())
panels = [
    ('pt1', 'pt2', 'Mass by linear momentum'),
    ('eta1', 'eta2', 'Mass by pseudorapidity'),
    ('E1', 'E2', 'Mass by energy'),
    ('phi1', 'phi2', 'Mass by phi angle'),
]

fig, ax = plt.subplots(nrows=4, ncols=1, figsize=(16, 20))

for panel_ax, (x_col, y_col, panel_title) in zip(ax, panels):
    sns.scatterplot(ax=panel_ax, data=df, x=x_col, y=y_col, hue='M', hue_norm=mass_range)
    panel_ax.set_title(panel_title)

plt.show()

### Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
sns.set_style(style='dark')
plt.figure(figsize=(16, 9))

corr_matrix = df.corr()
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='viridis', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)

plt.show()

Strong correlation between the invariant mass (M) and:
* Energies: [E1, E2]
* Linear momenta: [pt1, pt2]

## Model prediction

### Data selection

In [None]:
# Columns that are never used as model inputs: the target and the run/event identifiers.
non_feature_cols = ['M', 'Run', 'Event']

# Rows with a known M form the supervised set; rows with missing M are set aside.
df_traintest = df.drop(index=df_to_predict.index)
X = df_traintest.drop(columns=non_feature_cols)
y = df_traintest.M

# 70/30 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Same feature columns for the rows whose M has to be predicted.
X_topred = df_to_predict.drop(columns=non_feature_cols)

### Random Forest Regressor

In [None]:
# Fit a random forest on the training split and score it on the held-out split.
RFR = RandomForestRegressor(random_state=42, n_jobs=3)
RFR.fit(X_train, y_train)

Y_pred = RFR.predict(X_test)

mse_rfr = mean_squared_error(y_test, Y_pred)
RMSE_RFR = RMSE(mse_rfr)

In [None]:
# NOTE: both curves are sorted independently, so this figure compares the
# distribution of predicted values with the distribution of actual values,
# not the per-sample error.
fig = plt.figure(figsize=(10, 7))

rank = np.arange(0, len(y_test))
title_template = 'Random Forest Regressor Prediction.\nRMSE = {}'

plt.plot(rank, np.sort(y_test), c='b', label='Actual')
plt.plot(rank, np.sort(Y_pred), c='r', label='Predicted')

plt.title(title_template.format(RMSE_RFR))
plt.ylabel('Mass Invariant (GeV)')
plt.legend(loc='best')

plt.show()

### Gradient Boosting Regressor


In [None]:
# Fit gradient boosting on the training split and score it on the held-out split.
# fit() returns the estimator itself, so construction and fitting can be chained.
GBR = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

Y_pred = GBR.predict(X_test)

mse_gbr = mean_squared_error(y_test, Y_pred)
RMSE_GBR = RMSE(mse_gbr)

In [None]:
# NOTE: both curves are sorted independently, so this figure compares the
# distribution of predicted values with the distribution of actual values,
# not the per-sample error.
fig = plt.figure(figsize=(10, 7))

rank = np.arange(0, len(y_test))
title_template = 'Gradient Boosting Regressor Prediction.\nRMSE = {}'

plt.plot(rank, np.sort(y_test), c='b', label='Actual')
plt.plot(rank, np.sort(Y_pred), c='r', label='Predicted')

plt.title(title_template.format(RMSE_GBR))
plt.ylabel('Mass Invariant (GeV)')
plt.legend(loc='best')

plt.show()

### Neural Network Regressor

In [None]:
# Small fully connected regressor: two sigmoid hidden layers (20 and 4 units)
# followed by a single ReLU output unit.
n_features = X.shape[1]
NNR = models.Sequential([
    layers.Dense(20, activation='sigmoid', input_shape=(n_features,)),
    layers.Dense(4, activation='sigmoid'),
    layers.Dense(1, activation='relu'),
])

NNR.compile(optimizer='adam', loss='mse', metrics=['mse'])

# The held-out split is passed as validation data, so its loss is reported per epoch.
NNR.fit(
    X_train, y_train,
    epochs=500,
    batch_size=500,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the neural network on the held-out split.
Y_pred_NNR = NNR.predict(X_test)

mse_nnr = mean_squared_error(y_test, Y_pred_NNR)
RMSE_NNR = RMSE(mse_nnr)

In [None]:
# NOTE: both curves are sorted independently, so this figure compares the
# distribution of predicted values with the distribution of actual values,
# not the per-sample error.
fig = plt.figure(figsize=(10, 7))

rank = np.arange(0, len(y_test))

# The network ends in a single-unit Dense layer, so its predictions come back
# with one column per row. Calling sorted() on that 2-D array compares
# 1-element arrays one by one in Python and yields a list of arrays; flatten
# first and sort with numpy so a plain 1-D series is plotted.
nn_sorted = np.sort(np.ravel(Y_pred_NNR))

plt.plot(rank, np.sort(y_test), c='b', label='Actual')
plt.plot(rank, nn_sorted, c='r', label='Predicted')

plt.title('Neural Network Regressor Prediction.\nRMSE = {}'.format(RMSE_NNR))
plt.ylabel('Mass Invariant (GeV)')
plt.legend(loc='best')

plt.show()

## Prediction of NaN values

In [None]:
# Predict M for the rows where it is missing, once with each fitted model.
Y_nan_RFR = RFR.predict(X_topred)
Y_nan_GBR = GBR.predict(X_topred)
Y_nan_NNR = NNR.predict(X_topred)

fig = plt.figure(figsize=(10,7))

rank = np.arange(0, len(X_topred))
curves = [
    (Y_nan_RFR, 'b', 'Random Forest Regressor, RMSE: {:.2f}'.format(RMSE_RFR)),
    (Y_nan_GBR, 'r', 'Gradient Boosting Regressor, RMSE: {:.2f}'.format(RMSE_GBR)),
    (Y_nan_NNR, 'g', 'Neural Network Regressor, RMSE: {:.2f}'.format(RMSE_NNR)),
]

for predictions, colour, curve_label in curves:
    # np.ravel is a no-op for the 1-D scikit-learn outputs and flattens the
    # single-column network output. Previously sorted() was applied to that
    # 2-D array directly, which sorts 1-element arrays in Python and plots a
    # list of arrays. Each curve is sorted independently of the others.
    plt.plot(rank, np.sort(np.ravel(predictions)), c=colour, label=curve_label)

plt.title('Predictions with each regressor for NaN values of M')
plt.ylabel('Mass Invariant (GeV)')
plt.legend(loc='best')

plt.show()

In [None]:
# One column per model, aligned on the index of the rows that were missing M.
nan_predictions = np.column_stack([Y_nan_RFR, Y_nan_GBR, Y_nan_NNR.ravel()])
df_pred = pd.DataFrame(
    nan_predictions,
    index=X_topred.index,
    columns=['M_RFR', 'M_GBR', 'M_NNR'],
)

# Replace the empty M column with the three predicted-mass columns.
df_nan = pd.concat([df_to_predict.drop(columns='M'), df_pred], axis=1)

In [None]:
# Display the rows that had no M value, with the three model predictions appended as columns.
df_nan