In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

step 1:  Reading Data 

In [None]:
# Load the dielectron CSV from the Kaggle dataset into a DataFrame.
DIELECTRON_CSV = "/kaggle/input/cern-electron-collision-data/dielectron.csv"
data = pd.read_csv(DIELECTRON_CSV)
# Bare last expression so the notebook renders the frame.
data

step 2: Information about the data

In [None]:
# Dimensions of the raw table as a (rows, columns) tuple.
data.shape

In [None]:
# Names of all columns in the raw table.
data.columns
# Other quick looks at the table, left disabled:
#data.describe()
#data.head()

Adding attribute

In [None]:
# New feature column: element-wise sum of the E1 and E2 columns.
data['E_total'] = data['E1'].add(data['E2'])

step 2 (continued): Checking the data type of each column

In [None]:
# Per-column dtype and non-null count for the raw table (including E_total).
data.info()

Checking for NaN values in the dataset

In [None]:
# Number of missing values in each column.
data.isna().sum()

Note that the "M" column has fewer non-null values than the other columns.

step 3: Plotting histograms to get to know the data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per column of the table, drawn on a single large figure.
data.hist(figsize=(20, 15), bins=50)
plt.show()

main idea: **prediction of "M"**.

step 4: Finding which features are most strongly correlated with "M"

In [None]:
# Pairwise correlation of every column, then the "M" column of that matrix
# sorted from the most positive correlation to the most negative.
corr_Matrix = data.corr()
corr_Matrix.loc[:, "M"].sort_values(ascending=False)

Of all the features, "E_total" has the strongest correlation with "M".

step 5: Splitting the data into a train set and a test set with scikit-learn's "train_test_split", without considering the distribution of the features

In [None]:
from sklearn.model_selection import train_test_split
# Purely random 80/20 split of the rows, seeded so it is repeatable.
train_set, test_set = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Close-up 50-bin histograms of the two candidate stratification features,
# each on its own figure.
for column in ('pt2', 'E_total'):
    data[column].hist(bins=50, figsize=(10, 5))
    plt.show()

Some features are more important than others, so we split the data while taking their distribution into account.
Looking carefully at the "pt2" (or "E_total") histogram, the train and test sets need to be redefined in order to avoid the "sampling bias" problem. To do this we use scikit-learn's "StratifiedShuffleSplit" class. Before splitting, we first have to bin the feature into categories with "cut" from the pandas module.

* Adding a column

In [None]:
# Bin each continuous feature into five labelled buckets (1..5) and store the
# result in a "<feature>_cat" column; these columns drive the stratified split.
bucket_labels = [1, 2, 3, 4, 5]
bucket_edges = {
    "pt2": [0., 10, 20, 30, 40, np.inf],
    "E_total": [0., 50, 100, 150, 200, np.inf],
}
for feature, edges in bucket_edges.items():
    data[feature + "_cat"] = pd.cut(data[feature], bins=edges, labels=bucket_labels)

In [None]:
# Histogram of each bucket column, to see how many rows fall in each category.
for cat_column in ("E_total_cat", "pt2_cat"):
    data[cat_column].hist(bins=50, figsize=(10, 5))
    plt.show()

In [None]:
# Stratified 80/20 splits: one that preserves the E_total_cat proportions and
# one that preserves the pt2_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so select with .iloc; .loc would
# only be correct while the frame keeps its default 0..n-1 index.
# .copy() makes each subset an independent frame that is safe to modify later.
train_index, test_index = next(split.split(data, data["E_total_cat"]))
strat_train_set = data.iloc[train_index].copy()
strat_test_set = data.iloc[test_index].copy()

train_index, test_index = next(split.split(data, data["pt2_cat"]))
strat_train_set1 = data.iloc[train_index].copy()
strat_test_set1 = data.iloc[test_index].copy()

* Deleting the column

In [None]:
# Every subset was sliced from `data`, which carries BOTH helper bucket
# columns, so both must be removed from each subset. Dropping only one would
# leave the other in the frame, where it would later be used as a feature.
# errors="ignore" keeps the cell safe to re-run after the columns are gone.
helper_columns = ["E_total_cat", "pt2_cat"]

strat_train_set = strat_train_set.drop(columns=helper_columns, errors="ignore")
strat_test_set = strat_test_set.drop(columns=helper_columns, errors="ignore")

strat_train_set1 = strat_train_set1.drop(columns=helper_columns, errors="ignore")
strat_test_set1 = strat_test_set1.drop(columns=helper_columns, errors="ignore")

In [None]:
# Display the training subset produced by the E_total-stratified split.
strat_train_set

In [None]:
# Correlation of every training-set column with "M", strongest positive first.
Corr_Matrix = strat_train_set.corr()
m_correlations = Corr_Matrix.loc[:, "M"].sort_values(ascending=False)
print(m_correlations)

In [None]:
# Per-column dtype and non-null count of the training subset, before rows with
# a missing "M" are removed.
strat_train_set.info()

The mass column "M" has missing values, so we drop the rows where "M" is missing.

In [None]:
# Keep only the training rows that have a value for the target "M".
strat_train_set = strat_train_set.dropna(subset=['M'])

In [None]:
# Same summary again, to confirm that "M" no longer has missing values.
strat_train_set.info()

In [None]:
from pandas.plotting import scatter_matrix as sm
# Pairwise scatter plots of the listed columns (the target "M" among them).
attributes = ['pt1', 'pt2', 'E_total', 'M']
sm(strat_train_set.loc[:, attributes], figsize=(15, 10))
plt.show()

step 6: Separating the labels from the predictors

In [None]:
# Predictors: every training column except the target.
data_prepared = strat_train_set.drop(columns="M")
# Target: an independent copy of the "M" column.
data_lable = strat_train_set["M"].copy()

Scaling the data

In [None]:
# Learn the per-column mean/std on the training predictors, then standardise
# them. `scale` is kept so the same statistics can be applied to the test set.
scale = StandardScaler().fit(data_prepared)
data_prepared_scale = scale.transform(data_prepared)

step7:  Training Model

* **The LinearRegression model**

In [None]:
# Ordinary least-squares model fitted on the scaled training predictors.
data_lin_reg = LinearRegression()
data_lin_reg.fit(X=data_prepared_scale, y=data_lable)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# RMSE of the linear model on the same rows it was trained on (training error).
data_prediction = data_lin_reg.predict(data_prepared_scale)

lin_mse = mean_squared_error(y_true=data_lable, y_pred=data_prediction)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# 20-fold cross-validated RMSE of the linear model. The scorer returns negated
# MSE values, so flip the sign before taking the square root.
lin_scores = cross_val_score(
    estimator=data_lin_reg,
    X=data_prepared_scale,
    y=data_lable,
    scoring='neg_mean_squared_error',
    cv=20,
)
lin_rmse_score = np.sqrt(np.negative(lin_scores))
lin_rmse_score

* **DecisionTreeRegressor model**

In [None]:
# Decision-tree regressor fitted on the scaled training predictors.
# The fixed seed (the same value used for the data splits) makes the fitted
# tree, and every score derived from it, repeatable across runs.
de_data = DecisionTreeRegressor(random_state=42)
de_data.fit(data_prepared_scale, data_lable)

In [None]:
# RMSE of the decision tree on the same rows it was trained on (training error).
data_p = de_data.predict(data_prepared_scale)

lin_msed = mean_squared_error(y_true=data_lable, y_pred=data_p)
lin_rmsed = np.sqrt(lin_msed)
lin_rmsed

In [None]:
# 20-fold cross-validated RMSE of the decision tree (scores come back as
# negated MSE, hence the sign flip).
Des_scoresd = cross_val_score(
    estimator=de_data,
    X=data_prepared_scale,
    y=data_lable,
    scoring='neg_mean_squared_error',
    cv=20,
)
Des_rmse_scored = np.sqrt(np.negative(Des_scoresd))
Des_rmse_scored

* **RandomForest model**

In [None]:
# Random-forest regressor fitted on the scaled training predictors.
# The fixed seed (the same value used for the data splits) makes the forest and
# its scores repeatable across runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(data_prepared_scale, data_lable)

# RMSE on the same rows the model was trained on (training error).
# Arguments follow the documented (y_true, y_pred) order.
data_prediction = forest_reg.predict(data_prepared_scale)
forest_mse = mean_squared_error(data_lable, data_prediction)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validated RMSE of the random forest (scores come back as
# negated MSE, hence the sign flip).
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=data_prepared_scale,
    y=data_lable,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
forest_rmse_scores


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled predictors with all degree-2 polynomial terms.
# `data_poly` is kept so the same expansion can be applied to the test set.
data_poly = PolynomialFeatures(degree=2)
data_poly.fit(data_prepared_scale, data_lable)
data_poly_2 = data_poly.transform(data_prepared_scale)

In [None]:
# Refit the linear model on the polynomial features, then report its 10-fold
# cross-validated RMSE (scores come back as negated MSE).
data_lin_reg.fit(X=data_poly_2, y=data_lable)

poly_scores = cross_val_score(
    estimator=data_lin_reg,
    X=data_poly_2,
    y=data_lable,
    scoring="neg_mean_squared_error",
    cv=10,
)
poly_scores_rmse = np.sqrt(np.negative(poly_scores))
print(poly_scores_rmse)

In [None]:
# Refit the decision tree on the polynomial features, then report its 10-fold
# cross-validated RMSE (scores come back as negated MSE).
de_data.fit(X=data_poly_2, y=data_lable)
tree_poly_scores = cross_val_score(
    estimator=de_data,
    X=data_poly_2,
    y=data_lable,
    scoring='neg_mean_squared_error',
    cv=10,
)
tree_poly_rmse = np.sqrt(np.negative(tree_poly_scores))
print(tree_poly_rmse)

In [None]:
# Fresh random forest trained on the polynomial features; this replaces the
# earlier `forest_reg` and is the model evaluated on the test set below.
# The fixed seed (the same value used for the data splits) makes the reported
# scores repeatable across runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(data_poly_2, data_lable)

In [None]:
# 5-fold cross-validated RMSE of the random forest on the polynomial features
# (scores come back as negated MSE, hence the sign flip).
forest_poly_scores = cross_val_score(
    estimator=forest_reg,
    X=data_poly_2,
    y=data_lable,
    scoring='neg_mean_squared_error',
    cv=5,
)
forest_poly_rmse = np.sqrt(np.negative(forest_poly_scores))
print(forest_poly_rmse)

In [None]:
# Test rows that have a value for the target "M".
strat_test_set0 = strat_test_set.dropna(subset=['M'])

# Split into predictors and target, mirroring the training-set preparation.
data_prepared_test = strat_test_set0.drop(columns="M")
data_lable_test = strat_test_set0["M"].copy()

# Reuse the scaler and polynomial expansion that were fitted on the training
# data (transform only, no refitting on test rows).
data_prepared_test_scale = scale.transform(data_prepared_test)
data_prepared_test_prepared = data_poly.transform(data_prepared_test_scale)

In [None]:
# Final evaluation: RMSE of the chosen model on the held-out test rows.
best_model = forest_reg
test_predictions = best_model.predict(data_prepared_test_prepared)
# Arguments follow the documented (y_true, y_pred) order, so the call stays
# correct if the metric is ever swapped for a non-symmetric one.
test_mse = mean_squared_error(data_lable_test, test_predictions)
test_rmse = np.sqrt(test_mse)
print(test_rmse)