In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plot the dataset
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file beneath the Kaggle input mount.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Loading the Dataset

In [None]:
# Read the dielectron CSV from the Kaggle input directory.
dataset_path = '/kaggle/input/cern-electron-collision-data/dielectron.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Print column names, non-null counts and dtypes to spot missing values.
df.info()

Here we can see that 85 values are missing from the target column (`M`).

In [None]:
# Summary statistics, transposed so that each column of `df` becomes a row.
summary_stats = df.describe()
summary_stats.transpose()

Looking at the minimum and maximum values across the columns, we are quite sure that this dataset needs to be scaled.

In [None]:
# Draw a 50-bin histogram for each numeric column of `df` on one large figure.
df.hist(bins = 50, figsize = (20,15))
plt.show()

In [None]:
# Coarser 10-bin histogram of the E1 column on its own.
df['E1'].hist(bins = 10, figsize = (5,5))
plt.show()

In [None]:
# Count the missing values in each column.
df.isna().sum()

Since only 85 values are missing, and all of them are in the target column, we opted to drop those rows.

In [None]:
# Keep only the rows where the target column M is present.
target_column = 'M'
df2 = df.dropna(subset=[target_column])

# Re-count the missing values per column after the drop.
df2.isna().sum()

In [None]:
# Re-check row count, non-null counts and dtypes on the frame with missing targets removed.
df2.info()

In [None]:
# Pairwise correlation between all columns of the cleaned frame.
corr_matrix = df2.corr()

# Rank the columns by their correlation with the target M, largest value first.
target_corr = corr_matrix['M']
target_corr.sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the target M and four selected feature columns.
attributes = ['M','pt1','pt2','E1','E2']
scatter_matrix(df2[attributes],figsize=(12,8))
plt.show()

In [None]:
# Set the default figure size to 15x15 for the plot drawn below.
sns.set(rc={'figure.figsize':(15,15)})
# Heatmap of the full correlation matrix, with each coefficient printed to two decimals.
g = sns.heatmap(df2.corr(),annot=True, fmt = ".2f", cmap = "coolwarm")

The `Run` and `Event` columns only identify the run number and event number, so it is safe to drop them.

In [None]:
# Drop the bookkeeping columns by rebinding `df2` to a new frame instead of
# mutating it in place: `df2` was derived from `df`, and in-place edits on a
# derived frame can trigger SettingWithCopyWarning. `errors="ignore"` keeps
# the cell idempotent, so re-running it after the columns are already gone
# does not raise a KeyError.
df2 = df2.drop(columns=["Run", "Event"], errors="ignore")
df2.head()

## Train Test Split

Now we split the data into a training set and a test set in an 80:20 ratio.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train_set,test_set = train_test_split(df2,test_size = 0.2, random_state = 42)

In [None]:
# Separate the training target (column M) from the training features.
dataframe_labels = train_set['M'].copy()
dataframe = train_set.drop(columns='M')

### Scaling
The minimum and maximum values differ widely between columns, so we scale the features using `StandardScaler`.
`StandardScaler` first subtracts the mean value (so standardized
values always have a zero mean), and then divides by the standard deviation so that
the resulting distribution has unit variance. In other words, it centers and rescales (standardizes) each feature.

In [None]:
# Standardizer for the feature columns; it is fitted on the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and transform them in one step.
dataframe_scaled = scaler.fit_transform(dataframe)

In [None]:
# Inspect the scaled training features.
dataframe_scaled

# **Models** 

First we try a rough implementation of several modeling techniques.

### ***LINEAR REGRESSION***

In [None]:
# Baseline model: linear regression fitted on the scaled training features.
lin_reg = LinearRegression()
lin_reg.fit(dataframe_scaled, dataframe_labels)

In [None]:
# RMSE of the linear model, measured on the same data it was trained on.
dataframe_predictions = lin_reg.predict(dataframe_scaled)
lin_mse = mean_squared_error(
    y_true=dataframe_labels,
    y_pred=dataframe_predictions,
)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

The training-set RMSE of Linear Regression is quite high.

### ***DECISION TREE***

In [None]:

# Seed the tree so that repeated runs build the same model: the split search
# is randomized internally, so without a fixed random_state the fitted tree
# (and every score derived from it) can change between runs even though the
# train/test split itself is seeded.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(dataframe_scaled,dataframe_labels)

In [None]:
# RMSE of the decision tree, measured on the same data it was trained on.
dataframe_predictions = tree_reg.predict(dataframe_scaled)
tree_mse = mean_squared_error(
    y_true=dataframe_labels,
    y_pred=dataframe_predictions,
)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

### ***Random Forest***

In [None]:
# Seed the forest so that bootstrapping and per-split feature sampling are
# repeatable; otherwise the training RMSE and cross-validation scores below
# differ on every run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(dataframe_scaled,dataframe_labels)

In [None]:
# RMSE of the random forest, measured on the same data it was trained on.
dataframe_predictions = forest_reg.predict(dataframe_scaled)
forest_mse = mean_squared_error(
    y_true=dataframe_labels,
    y_pred=dataframe_predictions,
)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

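We can see that, without any fine-tuning, the Decision Tree is overfitting the data and the Random Forest is close to overfitting: both errors above were measured on the same data the models were trained on. Hence we will use cross-validation to get a more reliable estimate.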

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` methods (for example a
    NumPy array of per-fold RMSE values). Nothing is returned.
    """
    print('Scores', scores)
    # Each statistic is looked up and computed just before it is printed.
    for label, method_name in (('Mean', 'mean'), ('Standard Deviation', 'std')):
        print(label, getattr(scores, method_name)())

### Using Cross Validation

In [None]:
# 10-fold cross-validation of the linear model on the scaled training data.
lin_scores = cross_val_score(
    lin_reg,
    dataframe_scaled,
    dataframe_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
# The scores are negated MSE values, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

> Scikit-Learn's cross-validation features expect a utility function
(greater is better) rather than a cost function (lower is better), so
the scoring function is actually the opposite of the MSE (i.e., a
negative value), which is why the preceding code computes `-scores`
before calculating the square root.

In [None]:
# 10-fold cross-validation of the decision tree on the scaled training data.
scores = cross_val_score(
    tree_reg,
    dataframe_scaled,
    dataframe_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
# The scores are negated MSE values, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(np.negative(scores))

In [None]:
# Per-fold RMSE of the decision tree, with mean and standard deviation.
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the random forest on the scaled training data.
# The scorer returns negated MSE values (higher is better).
forest_scores = cross_val_score(forest_reg,dataframe_scaled,dataframe_labels,
                                scoring = 'neg_mean_squared_error',cv =10)

In [None]:
# Convert the negated MSE values into per-fold RMSE and summarise them.
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Of the three models, Random Forest has the lowest RMSE and is also quite stable, so we will use `GridSearchCV` to tune its hyperparameters.

### GridSearchCV

In [None]:
# Two sub-grids are searched:
#   1. 3 x 4 = 12 combinations of n_estimators / max_features, leaving
#      `bootstrap` at its default;
#   2. 2 x 3 = 6 combinations with bootstrapping switched off.
# That is 18 candidates, each evaluated with 5-fold cross-validation.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Seed the forest so that differences between candidates come from the
# hyperparameters rather than from run-to-run randomness, and so that the
# selected best_params_ are reproducible.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(dataframe_scaled, dataframe_labels)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Estimator refitted by the grid search using the best hyperparameters.
grid_search.best_estimator_

In [None]:
# Print the cross-validated RMSE for every hyperparameter combination tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    # Mean test scores are negated MSE values; negate before taking the root.
    rmse = np.sqrt(-mean_score)
    print(rmse, params)

In [None]:
# Importance of each input feature according to the tuned forest.
feature_importances = grid_search.best_estimator_.feature_importances_

# A bare array of numbers cannot be read without knowing the column order, so
# label each importance with the training feature it belongs to (`dataframe`
# holds the feature columns the model was fitted on) and show the most
# important features first.
pd.Series(feature_importances, index=dataframe.columns).sort_values(ascending=False)

In [None]:
# Evaluate the tuned model on the held-out test set.
final_model = grid_search.best_estimator_

y_test = test_set["M"].copy()
X_test = test_set.drop(columns="M")

# Only transform here: the scaler keeps the statistics fitted on the training features.
X_test_prepared = scaler.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# RMSE of the tuned model on the held-out test set.
print(final_rmse)