# Connecting to Google Drive to access the dataset



In [8]:
# Mount Google Drive at /gdrive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount("/gdrive")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [9]:
cd "/gdrive/MyDrive/Colab Notebooks/Project"

/gdrive/MyDrive/Colab Notebooks/Project


# Import the required libraries



In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Read the data

In [22]:
# List the entries of the current working directory to see which data files are available.
import os
os.listdir(".")

['sample_submission.csv',
 'test.csv',
 'train.csv',
 'Untitled.ipynb',
 '.ipynb_checkpoints']

In [23]:
# Load the training set from the working directory into a DataFrame.
# pandas is already imported as `pd` in the imports cell, so it is not re-imported here.
train_data = pd.read_csv('train.csv')

# Exploratory Data Analysis

In [24]:
# Preview the first five rows to get a feel for the columns and their values
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_Delta_R,dPhi_r_b,cos(theta_r1),class
0,0,0.841381,1.832647,-0.689286,0.781839,0.572864,1.577097,0.398978,-0.683847,0.001826,0.651397,0.86556,0.429017,0.43984,0.0,0.796105,0.342497,0.461542,0.00571,0.0
1,1,0.663798,2.05829,0.681435,1.054036,0.575352,-1.001445,0.462154,-0.833411,0.199734,0.215158,0.949988,0.618046,0.577324,0.0,0.962927,0.3338,1.455247,0.101246,0.0
2,2,1.792225,-1.099978,0.088109,0.573157,-0.472629,1.642084,1.203374,1.506731,0.457695,-0.640507,1.157024,1.585432,1.215963,0.0,1.113292,0.645729,0.721326,0.613326,1.0
3,3,0.893018,0.297782,-1.27487,1.316164,1.593303,0.672115,0.307014,-1.189868,0.064561,0.430909,1.162625,0.548821,0.418897,0.163908,1.157707,0.298163,0.803802,0.038902,0.0
4,4,1.338997,0.350023,-1.51851,1.482963,-0.491807,0.34017,0.415071,-1.292034,0.240712,0.611775,1.307798,0.697804,0.473487,0.429977,1.287935,0.330327,0.717237,0.003147,1.0


## Understanding the data to find missing values and wrong data

In [25]:
# Count the null entries in each column (isnull is pandas' alias for isna)
train_data.isnull().sum()

Unnamed: 0                  0
lepton_1_pT                 0
lepton_1_eta                0
lepton_1_phi                0
lepton_2_pT                 0
lepton_2_eta                0
lepton_2_phi                0
missing_energy_magnitude    0
missing_energy_phi          0
MET_rel                     0
axial_MET                   0
M_R                         0
M_TR_2                      0
R                           0
MT2                         0
S_R                         0
M_Delta_R                   0
dPhi_r_b                    0
cos(theta_r1)               0
class                       0
dtype: int64

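The null count is zero for every column, so no null-handling step is needed. However, missing values could still be present in disguised form (for example as placeholder numbers or strings), and some values could simply be wrong, so the checks below look at datatypes and value ranges.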



## Checking that every column has the expected datatype

In [28]:
# Show the dtype pandas inferred for each column when reading the CSV
train_data.dtypes

Unnamed: 0                    int64
lepton_1_pT                 float64
lepton_1_eta                float64
lepton_1_phi                float64
lepton_2_pT                 float64
lepton_2_eta                float64
lepton_2_phi                float64
missing_energy_magnitude    float64
missing_energy_phi          float64
MET_rel                     float64
axial_MET                   float64
M_R                         float64
M_TR_2                      float64
R                           float64
MT2                         float64
S_R                         float64
M_Delta_R                   float64
dPhi_r_b                    float64
cos(theta_r1)               float64
class                       float64
dtype: object

<h3> Findings from dtypes </h3>

*  Every column is `float64` except `Unnamed: 0`, which is `int64`. Since no column has the `object` dtype, no values were entered as text (for example a literal "NaN" string or other non-numeric placeholders).

# Understanding the statistical distribution of the data

In [29]:
# Summary statistics for each column, rounded to two decimals for readability
train_data.describe().round(2)

Unnamed: 0.1,Unnamed: 0,lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_Delta_R,dPhi_r_b,cos(theta_r1),class
count,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0
mean,1749999.5,1.0,0.0,0.0,1.0,0.0,-0.0,1.0,-0.0,1.0,-0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.22,0.46
std,1010363.12,0.69,1.0,1.0,0.65,1.0,1.0,0.87,1.0,0.89,1.0,0.63,0.58,0.47,0.86,0.62,0.62,0.44,0.2,0.5
min,0.0,0.25,-2.1,-1.73,0.43,-2.06,-1.73,0.0,-1.73,0.0,-15.34,0.27,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0
25%,874999.75,0.56,-0.76,-0.87,0.6,-0.77,-0.87,0.48,-0.87,0.37,-0.49,0.59,0.62,0.65,0.17,0.6,0.51,0.69,0.07,0.0
50%,1749999.5,0.79,0.0,-0.0,0.8,0.0,-0.0,0.77,-0.01,0.8,-0.08,0.83,0.88,0.93,0.9,0.84,0.91,1.09,0.17,0.0
75%,2624999.25,1.2,0.76,0.87,1.16,0.77,0.87,1.21,0.87,1.37,0.35,1.21,1.22,1.28,1.61,1.21,1.38,1.37,0.33,1.0
max,3499999.0,20.55,2.1,1.73,33.04,2.06,1.73,21.07,1.74,23.39,19.59,21.08,16.17,6.73,20.69,21.15,15.61,1.59,1.0,1.0


<h3> Findings from describe table </h3>


*  From the above table, it can be seen that `Unnamed: 0` is just the row index (it runs from 0 to 3,499,999).
*  The last column, `class`, is the output (target) column and has to be separated from the features.
*  To understand how the other columns relate to each other and to the target, further analysis is needed.







## Checking
