# PACKAGES AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import scale
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

### Ignoring Warnings

In [None]:
# Silence the noisy warning categories raised by the plotting / ML libraries.
for _category in (DeprecationWarning, FutureWarning, UserWarning):
    filterwarnings("ignore", category=_category)

# DATA SOURCE

In [None]:
# Load the dielectron collision table from the Kaggle input directory.
CERN = pd.read_csv(
    filepath_or_buffer="../input/cern-electron-collision-data/dielectron.csv"
)

In [None]:
# Run and Event are identifiers, not measurements, so they are removed to keep
# only the numerical columns. Skip this cell if you want to predict Run or Event.
CERN = CERN.drop(columns=["Run", "Event"])

In [None]:
# The column is named 'px1 ' (with a trailing space); rename it to plain 'px1'.
CERN = CERN.rename(columns={'px1 ': 'px1'})

In [None]:
# Work on a copy so the loaded CERN frame stays untouched by later steps.
data = CERN.copy()

In [None]:
# Separate copy used for visualization. Q1 and Q2 are the electron charges,
# so they are treated as categorical variables here.
dataV = data.copy()
for charge_column in ("Q1", "Q2"):
    dataV[charge_column] = pd.Categorical(dataV[charge_column])

In [None]:
# Numeric-only view for the mathematical summaries (drops the categorical
# Q1/Q2 columns of dataV).
numeric_dtypes = ["float64", "int64", "int32"]
df = dataV.select_dtypes(include=numeric_dtypes)

### Exploratory Data Analysis

In [None]:
# Preview the first rows of the working frame.
print(data.head())

In [None]:
# (number of rows, number of columns)
print(data.shape)

In [None]:
# Column labels remaining after the drop/rename steps.
print(data.columns)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
data.info()

In [None]:
# Summary statistics, transposed so each column of the data becomes one row.
summary_table = data.describe().transpose()
print(summary_table)

In [None]:
# Pairwise Pearson correlation (the pandas default) of the numeric columns.
print(df.corr(method="pearson"))

In [None]:
# Pairwise covariance of the numeric columns.
print(df.cov())

In [None]:
# Mean of M for each (E1, E2) pair. M is selected before aggregating so that
# only this one column is averaged instead of every column in the frame.
print(data.groupby(["E1","E2"])["M"].mean())

In [None]:
# Mean of M for each (px1, px2) pair. M is selected before aggregating so that
# only this one column is averaged instead of every column in the frame.
print(data.groupby(["px1","px2"])["M"].mean())

In [None]:
# Mean of M for each (py1, py2) pair. M is selected before aggregating so that
# only this one column is averaged instead of every column in the frame.
print(data.groupby(["py1","py2"])["M"].mean())

In [None]:
# Mean of M for each (pz1, pz2) pair. M is selected before aggregating so that
# only this one column is averaged instead of every column in the frame.
print(data.groupby(["pz1","pz2"])["M"].mean())

In [None]:
# Mean of M for each (pt1, pt2) pair. M is selected before aggregating so that
# only this one column is averaged instead of every column in the frame.
print(data.groupby(["pt1","pt2"])["M"].mean())

In [None]:
# Mean of M for each (eta1, eta2) pair. M is selected before aggregating so
# that only this one column is averaged instead of every column in the frame.
print(data.groupby(["eta1","eta2"])["M"].mean())

In [None]:
# Mean of M for each (phi1, phi2) pair. M is selected before aggregating so
# that only this one column is averaged instead of every column in the frame.
print(data.groupby(["phi1","phi2"])["M"].mean())

In [None]:
# Number of rows with E1 > 50; rows containing any missing value are not counted.
selected_rows = data[data["E1"] > 50]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with E2 > 50; rows containing any missing value are not counted.
selected_rows = data[data["E2"] > 50]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with px1 above its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["px1"] > data["px1"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with px2 above its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["px2"] > data["px2"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with py1 < 0; rows containing any missing value are not counted.
selected_rows = data[data["py1"] < 0]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with py2 < 1; rows containing any missing value are not counted.
selected_rows = data[data["py2"] < 1]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with pz1 < 1; rows containing any missing value are not counted.
selected_rows = data[data["pz1"] < 1]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with pz2 below its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["pz2"] < data["pz2"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with pt1 below its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["pt1"] < data["pt1"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with pt2 below its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["pt2"] < data["pt2"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with eta1 above its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["eta1"] > data["eta1"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with eta2 above its mean; rows containing any missing value
# are not counted.
selected_rows = data[data["eta2"] > data["eta2"].mean()]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with phi1 > 1; rows containing any missing value are not counted.
selected_rows = data[data["phi1"] > 1]
print(len(selected_rows.dropna()))

In [None]:
# Number of rows with phi2 > 1; rows containing any missing value are not counted.
selected_rows = data[data["phi2"] > 1]
print(len(selected_rows.dropna()))

In [None]:
# How many rows are exact duplicates of an earlier row (True) versus unique
# (False). The duplicates are only reported here, not removed.
duplicate_flags = data.duplicated()
print(duplicate_flags.value_counts())

In [None]:
# Missing-value count per column (M contains NaN values).
print(data.isna().sum())

In [None]:
# True for a column only if every one of its values is missing.
print(data.isna().all())

Let's visualize the missing data and how missingness correlates between columns

In [None]:
# Nullity matrix: shows where in the frame the missing entries sit.
matrix_size = (8, 5)
msno.matrix(data, figsize=matrix_size)
plt.show()

In [None]:
# Bar chart of the non-missing value count per column.
bar_size = (8, 5)
msno.bar(data, figsize=bar_size)
plt.show()

In [None]:
# Nullity correlation heatmap between columns.
heatmap_size = (8, 5)
msno.heatmap(data, figsize=heatmap_size)
plt.show()

The amount of missing data looks acceptable, so we will fill the missing values with the median

In [None]:
# Replace the missing M values with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data["M"] operates on an intermediate Series and is not guaranteed to update
# `data` itself (it is a no-op under pandas Copy-on-Write).
data["M"] = data["M"].fillna(data["M"].median())
print(data.isnull().sum()) # checking again

# OUTLIER VALUES

In [None]:
# Separate copy for the outlier analysis so `data` itself is not modified.
DataForA = data.copy()

In [None]:
# Fit a Local Outlier Factor detector with default settings; fitting also
# populates the per-row scores read in the next step.
clf = LocalOutlierFactor()
clf.fit_predict(X=DataForA)

In [None]:
# Per-row LOF scores from the fitted detector; the sorted values are inspected
# below, lowest (most outlying) first.
score = clf.negative_outlier_factor_

In [None]:
# Sort the scores ascending and inspect the 20 lowest ones: the threshold is
# chosen by eye where the biggest jump occurs. For this data that is index 4
# (value -2.74668789).
scoresorted = np.sort(score)
lowest_scores = scoresorted[:20]
print(lowest_scores)

In [None]:
# Threshold score picked from the sorted list, and the row that carries it.
point = scoresorted[4]
threshold_row = DataForA.loc[score == point]
print(f"Outlier Point -- > {point} ")
print("---" * 20)
print("Outlier Row -- >\n", threshold_row)

In [None]:
# Boolean mask of the rows whose LOF score is below the threshold.
# The mask must be built from `score`: comparing the DataFrame cells themselves
# to `point` compared physical measurements with an LOF score.
totaloutlier = score < point
print(f"Total outliers -- > {totaloutlier.sum()}")

In [None]:
# Rows flagged as outliers (score below the threshold).
outlier_rows = data.loc[score < point]
print(outlier_rows)

In [None]:
# Boolean mask of the rows whose LOF score is above the threshold (inliers).
# The mask is built from `score`; comparing the DataFrame cells to `point`
# produced a cell-wise mask unrelated to the outlier scores.
CleanData = score > point
print(data[CleanData])

# CORRELATION - COVARIANCE - NORMALITY - HOMOGENEITY

### Correlation

In [None]:
# Linear (Pearson) and rank-based (Spearman) correlation matrices.
corrPearson, corrSpearman = (
    data.corr(method=method_name) for method_name in ("pearson", "spearman")
)

In [None]:
# Annotated heatmap of the Pearson correlations on a fixed [-1, 1] colour scale.
figure = plt.figure(figsize=(20, 8))
pearson_axes = sns.heatmap(corrPearson, annot=True, vmin=-1, center=0, vmax=1)
pearson_axes.set_title("PEARSON")
plt.show()

In [None]:
# Annotated heatmap of the Spearman correlations on a fixed [-1, 1] colour scale.
figure = plt.figure(figsize=(20, 8))
spearman_axes = sns.heatmap(corrSpearman, annot=True, vmin=-1, center=0, vmax=1)
spearman_axes.set_title("SPEARMAN")
plt.show()

### Covariance

In [None]:
# Pairwise covariance matrix of all columns.
covv = data.cov()

In [None]:
# Annotated heatmap of the covariance matrix.
# Unlike correlation, covariance is not bounded to [-1, 1], so no vmin/vmax is
# forced: the colour scale follows the data and stays centred on zero.
figure = plt.figure(figsize=(20,8))
sns.heatmap(covv,annot=True,center=0)
plt.title("COVARIANCE")
plt.show()

### Normality

In [None]:
# Shapiro-Wilk normality test for every column: prints "statistic - p-value".
separator = "---" * 30
for column_name in data.columns:
    statistic, p_value = shapiro(data[column_name])
    print(separator)
    print(column_name)
    print(f"{statistic:.4f} - {p_value:.4f}")

### Homogeneity

In [None]:
# Levene test for equal variances between the matching quantity of the first
# and the second electron: prints "statistic - p-value" per pair.
# The phi pair compares phi1 with phi2 (it previously tested phi1 against itself).
column_pairs = [
    ("E1", "E2"),
    ("px1", "px2"),
    ("py1", "py2"),
    ("pt1", "pt2"),
    ("pz1", "pz2"),
    ("eta1", "eta2"),
    ("phi1", "phi2"),
    ("Q1", "Q2"),
]
for first_column, second_column in column_pairs:
    print("%.3f - %.3f" % levene(data[first_column], data[second_column]))

# VISUALIZATION

### BASIC HIST

In [None]:
# One histogram per column for the first 16 columns of the frame.
first_sixteen_columns = data.iloc[:, :16]
first_sixteen_columns.hist(figsize=(20, 20))
plt.show()

### BOX

In [None]:
# Distribution of px1 for each charge value Q1 (categorical in dataV).
figure = plt.figure(figsize=(20, 8))
sns.boxplot(data=dataV, x="px1", y="Q1")
plt.show()

In [None]:
# Distribution of px2 for each charge value Q2 (categorical in dataV).
figure = plt.figure(figsize=(20, 8))
sns.boxplot(data=dataV, x="px2", y="Q2")
plt.show()

### SCATTER

In [None]:
# M plotted against E1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="E1", y="M")
plt.show()

In [None]:
# M plotted against px1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="px1", y="M")
plt.show()

In [None]:
# M plotted against py1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="py1", y="M")
plt.show()

In [None]:
# M plotted against pz1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="pz1", y="M")
plt.show()

In [None]:
# M plotted against pt1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="pt1", y="M")
plt.show()

In [None]:
# M plotted against eta1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="eta1", y="M")
plt.show()

In [None]:
# M plotted against phi1.
figure = plt.figure(figsize=(20, 8))
sns.scatterplot(data=data, x="phi1", y="M")
plt.show()

### HISTPLOT

In [None]:
# Histogram of E2 on a logarithmic x-axis.
figure = plt.figure(figsize=(20, 5))
e2_hist_options = dict(multiple="stack", edgecolor=".3", linewidth=.5, log_scale=True)
sns.histplot(data, x="E2", **e2_hist_options)
plt.show()

In [None]:
# Histogram of px2 on a linear x-axis.
# px2 is a momentum component and is expected to contain negative values
# (confirm against the data); a log-scaled axis cannot represent zero or
# negative values, so log_scale is not used here.
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="px2",
    multiple="stack",
    edgecolor=".3",
    linewidth=.5,
)
plt.show()

In [None]:
# Histogram of py2 on a linear x-axis.
# py2 is a momentum component and is expected to contain negative values
# (confirm against the data); a log-scaled axis cannot represent zero or
# negative values, so log_scale is not used here.
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="py2",
    multiple="stack",
    edgecolor=".3",
    linewidth=.5,
)
plt.show()

In [None]:
# Histogram of pt2 on a logarithmic x-axis.
figure = plt.figure(figsize=(20, 5))
pt2_hist_options = dict(multiple="stack", edgecolor=".3", linewidth=.5, log_scale=True)
sns.histplot(data, x="pt2", **pt2_hist_options)
plt.show()

In [None]:
# Histogram of pz2 on a linear x-axis.
# pz2 is a momentum component and is expected to contain negative values
# (confirm against the data); a log-scaled axis cannot represent zero or
# negative values, so log_scale is not used here.
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="pz2",
    multiple="stack",
    edgecolor=".3",
    linewidth=.5,
)
plt.show()

In [None]:
# Histogram of eta2 on a linear x-axis.
# eta2 is expected to contain negative values (confirm against the data); a
# log-scaled axis cannot represent zero or negative values, so log_scale is
# not used here.
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="eta2",
    multiple="stack",
    edgecolor=".3",
    linewidth=.5,
)
plt.show()

In [None]:
# Histogram of phi2 on a linear x-axis.
# phi2 is an angle and is expected to contain negative values (confirm against
# the data); a log-scaled axis cannot represent zero or negative values, so
# log_scale is not used here.
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="phi2",
    multiple="stack",
    edgecolor=".3",
    linewidth=.5,
)
plt.show()

### JOINTPLOT

In [None]:
# Joint distribution of E1 and px1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="px1", color="#4CB391",data=data,height=8)
plt.show()

In [None]:
# Joint distribution of E1 and py1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="py1", color="#4CB391",data=data,height=8)
plt.show()

In [None]:
# Joint distribution of E1 and pt1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="pt1", color="#4CB391",data=data,height=8)
plt.show()

In [None]:
# Joint distribution of E1 and pz1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="pz1", color="#4CB391",data=data,height=8)
plt.show()

In [None]:
# Joint distribution of E1 and eta1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="eta1",color="#4CB391",data=data,height=8)
plt.show()

In [None]:
# Joint distribution of E1 and phi1 with marginal histograms.
# jointplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only produced an extra empty figure; the size is set
# through `height` instead.
sns.jointplot(x="E1",y="phi1", color="#4CB391",data=data,height=8)
plt.show()

### DISTPLOT

In [None]:
# Distribution of M for events with E1 above vs. below the mean E1.
# Labels now match the filters: "> mean" is UPPER, "< mean" is LESS.
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
e1_mean = data['E1'].mean()
sns.distplot(data[data['E1'] > e1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['E1'] < e1_mean]["M"], color='red',label='LESS')
plt.title('E1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with px1 above vs. below the mean px1.
# Labels now match the filters: "> mean" is UPPER, "< mean" is LESS.
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
px1_mean = data['px1'].mean()
sns.distplot(data[data['px1'] > px1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['px1'] < px1_mean]["M"], color='red',label='LESS')
plt.title('px1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with py1 above vs. below the mean py1.
# Labels now match the filters: "> mean" is UPPER, "< mean" is LESS.
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
py1_mean = data['py1'].mean()
sns.distplot(data[data['py1'] > py1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['py1'] < py1_mean]["M"], color='red',label='LESS')
plt.title('py1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with pt1 above vs. below the mean pt1.
# Labels now match the filters: "> mean" is UPPER, "< mean" is LESS.
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
pt1_mean = data['pt1'].mean()
sns.distplot(data[data['pt1'] > pt1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['pt1'] < pt1_mean]["M"], color='red',label='LESS')
plt.title('pt1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with pz1 above vs. below the mean pz1.
# Labels now match the filters: "> mean" is UPPER, "< mean" is LESS.
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
pz1_mean = data['pz1'].mean()
sns.distplot(data[data['pz1'] > pz1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['pz1'] < pz1_mean]["M"], color='red',label='LESS')
plt.title('pz1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with eta1 above vs. below the mean eta1.
# Labels now match the filters ("> mean" is UPPER, "< mean" is LESS) and the
# title names the plotted variable (it previously read 'pz1').
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
eta1_mean = data['eta1'].mean()
sns.distplot(data[data['eta1'] > eta1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['eta1'] < eta1_mean]["M"], color='red',label='LESS')
plt.title('eta1', fontsize=10)
plt.legend()
plt.show()

In [None]:
# Distribution of M for events with phi1 above vs. below the mean phi1.
# Labels now match the filters ("> mean" is UPPER, "< mean" is LESS) and the
# title names the plotted variable (it previously read 'pz1').
# NOTE: sns.distplot is deprecated in recent seaborn releases.
figure = plt.figure(figsize=(20,8))
phi1_mean = data['phi1'].mean()
sns.distplot(data[data['phi1'] > phi1_mean]["M"], color='black',label='UPPER')
sns.distplot(data[data['phi1'] < phi1_mean]["M"], color='red',label='LESS')
plt.title('phi1', fontsize=10)
plt.legend()
plt.show()

### 3D

In [None]:
# 3D scatter of the first electron's momentum components (px1, py1, pz1).
# The axes are created through fig.add_subplot: constructing Axes3D(fig)
# directly no longer attaches the axes to the figure in current matplotlib,
# which leaves the plot empty.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["px1"], data["py1"], data["pz1"], c="red", s=20, alpha=0.2)
plt.show()

In [None]:
# 3D scatter of E1, px1 and eta1.
# The axes are created through fig.add_subplot: constructing Axes3D(fig)
# directly no longer attaches the axes to the figure in current matplotlib,
# which leaves the plot empty.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["E1"], data["px1"], data["eta1"], c="black", s=20, alpha=0.2)
plt.show()

In [None]:
# 3D scatter of E1, E2 and M.
# The axes are created through fig.add_subplot: constructing Axes3D(fig)
# directly no longer attaches the axes to the figure in current matplotlib,
# which leaves the plot empty.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["E1"], data["E2"], data["M"], c="green", s=20, alpha=0.2)
plt.show()

In [None]:
# 3D scatter of phi1, phi2 and M.
# The axes are created through fig.add_subplot: constructing Axes3D(fig)
# directly no longer attaches the axes to the figure in current matplotlib,
# which leaves the plot empty.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(projection="3d")
ax.scatter(data["phi1"], data["phi2"], data["M"], c="blue", s=20, alpha=0.2)
plt.show()

### PAIRGRID

In [None]:
# Point plots of M for each charge value of Q1 and Q2.
# PairGrid builds its own figure (sized by height/aspect), so the separate
# plt.figure() call was dropped: it only produced an extra empty figure.
charge_grid = sns.PairGrid(dataV, y_vars="M",
                 x_vars=["Q1", "Q2"],height=10,aspect=.5)
charge_grid.map(sns.pointplot, scale=1.3, errwidth=2, color="black")
plt.show()

# PREDICTION MODELS

### X & Y 

In [None]:
# Target is M; every other column is a feature. 80/20 split with a fixed seed.
y = data["M"]
x = data.drop(columns="M")

xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

### Models

In [None]:
# Build every candidate regressor first, then fit them all on the training
# split in the same order as they are declared.
lm = LinearRegression()
pls = PLSRegression()
ridge = Ridge()
lasso = Lasso()
elasticnet = ElasticNet()
knnr = KNeighborsRegressor()
cartr = DecisionTreeRegressor(random_state=42)
baggr = BaggingRegressor(random_state=42, bootstrap_features=True, verbose=False)
rfr = RandomForestRegressor(random_state=42, verbose=False)
gbmr = GradientBoostingRegressor(verbose=False)
xgbr = XGBRegressor()
lgbmr = LGBMRegressor()
catbr = CatBoostRegressor(verbose=False)

for _estimator in (lm, pls, ridge, lasso, elasticnet, knnr, cartr,
                   baggr, rfr, gbmr, xgbr, lgbmr, catbr):
    _estimator.fit(xTrain, yTrain)

In [None]:
# All fitted regressors, in the order they are compared below.
models = [
    lm, pls, ridge, lasso, elasticnet,
    knnr, cartr, baggr, rfr, gbmr,
    xgbr, lgbmr, catbr,
]

In [None]:
# Score every fitted model on the held-out test split: prints R2 and RMSE.
# The models are evaluated through predict(): running cross_val_score on
# (xTest, yTest) clones and refits each estimator on folds of the test data,
# so the models trained on xTrain were never the ones being scored.
for model in models:
    name = model.__class__.__name__
    # Flatten because some regressors (e.g. PLS) may return a column vector.
    predictions = np.ravel(model.predict(xTest))
    R2CV = r2_score(yTest, predictions)
    error = mean_squared_error(yTest, predictions)
    print(name + ": ")
    print("-" * 10)
    print(R2CV)
    print(np.sqrt(error))
    print("-" * 30)

BEST IS CATBOOST --> 0.9898453825158777

### TUNING BEST MODEL CATBOOST

In [None]:
# Hyper-parameter grid explored by the grid search.
params = dict(
    depth=list(range(2, 7)),
    learning_rate=[0.1, 0.01, 0.5],
)

In [None]:
# Exhaustive 10-fold grid search over `params`, run on the training split.
cv = GridSearchCV(estimator=catbr, param_grid=params, cv=10, verbose=False)
cv.fit(xTrain, yTrain)
print(cv.best_params_)

In [None]:
# Tuned CatBoost model: the estimator that the grid search refitted on the
# training split with its best parameters. This replaces the hard-coded
# depth / learning_rate, so the model always follows cv.best_params_.
catbrtuned = cv.best_estimator_

# Score on the held-out test split with predict(); cross_val_score on
# (xTest, yTest) would refit clones on test folds instead of evaluating the
# model trained on xTrain.
tuned_predictions = np.ravel(catbrtuned.predict(xTest))
R2CVtuned = r2_score(yTest, tuned_predictions)
print(R2CVtuned)
errortuned = mean_squared_error(yTest, tuned_predictions)
print(np.sqrt(errortuned))

BEST PARAMETERS ARE DEFAULT PARAMETERS

### NEURAL NETWORK (MLP) - ADDITIONAL

Scaling

In [None]:
# Standardize the features using statistics learned from the training split
# only, then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(xTrain)
xRTrain, xRTest = scaler.transform(xTrain), scaler.transform(xTest)

In [None]:
# Multi-layer perceptron regressor trained on the *scaled* training features.
# It was previously fitted on the unscaled xTrain while being scored on the
# scaled xRTest, so training and evaluation used different feature scales.
mlpr = MLPRegressor().fit(xRTrain,yTrain)

# Score the fitted network on the held-out (scaled) test split.
mlp_predictions = np.ravel(mlpr.predict(xRTest))
R2CV = r2_score(yTest, mlp_predictions)
error = mean_squared_error(yTest, mlp_predictions)

print(R2CV)
print(np.sqrt(error))

The neural network (MLP) comes very close to CatBoost, but CatBoost is still the best

### OLS - ADDITIONAL

In [None]:
# Ordinary least squares on the training split.
# statsmodels' OLS does not add an intercept on its own, so a constant column
# is appended; without it the regression is forced through the origin.
ols = sm.OLS(yTrain, sm.add_constant(xTrain)).fit()
print(ols.summary())

Best model is still CatBoost

### FINAL DECISION ---> CATBOOST / 0.9898453825158777