## Mounting the drive and navigating to the folder

In [4]:
# mouting the google drive
from google.colab import drive
drive.mount("/gdrive")

Mounted at /gdrive


In [6]:
# Navigating to the folder in google drive where the data is stored
%cd '/gdrive/MyDrive/Colab Notebooks/Project'


/gdrive/MyDrive/Colab Notebooks/Project


### Loading all the libraries

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

### Reading the file

In [8]:
train_data=pd.read_csv('train.csv')

In [9]:
train_data.head(3)

Unnamed: 0.1,Unnamed: 0,lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_Delta_R,dPhi_r_b,cos(theta_r1),class
0,0,0.841381,1.832647,-0.689286,0.781839,0.572864,1.577097,0.398978,-0.683847,0.001826,0.651397,0.86556,0.429017,0.43984,0.0,0.796105,0.342497,0.461542,0.00571,0.0
1,1,0.663798,2.05829,0.681435,1.054036,0.575352,-1.001445,0.462154,-0.833411,0.199734,0.215158,0.949988,0.618046,0.577324,0.0,0.962927,0.3338,1.455247,0.101246,0.0
2,2,1.792225,-1.099978,0.088109,0.573157,-0.472629,1.642084,1.203374,1.506731,0.457695,-0.640507,1.157024,1.585432,1.215963,0.0,1.113292,0.645729,0.721326,0.613326,1.0


### Splitting the data to inputs and outputs

* It can be seen that the first column of the dataset is the index. Since this does not contain any essential information we can drop this column. 

* The last column in the dataset is the output class. This has to be seperated from the other parameters for testing and training purposes. 


In [None]:
# Reading the names of columns in the dataset
train_data.columns

Index(['Unnamed: 0', 'lepton_1_pT', 'lepton_1_eta', 'lepton_1_phi',
       'lepton_2_pT', 'lepton_2_eta', 'lepton_2_phi',
       'missing_energy_magnitude', 'missing_energy_phi', 'MET_rel',
       'axial_MET', 'M_R', 'M_TR_2', 'R', 'MT2', 'S_R', 'M_Delta_R',
       'dPhi_r_b', 'cos(theta_r1)', 'class'],
      dtype='object')

In [10]:
# Dropping the firsy column which is the index and the last column which is the output
# Output is then moved to a different variable called y_train whereas the input is stored in x_train
x_train=train_data.drop([train_data.columns[0],'class'],axis=1)
y_train=train_data['class']

## Exploratory data analysis

In [None]:
x_train.head()

Unnamed: 0,lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_Delta_R,dPhi_r_b,cos(theta_r1)
0,0.841381,1.832647,-0.689286,0.781839,0.572864,1.577097,0.398978,-0.683847,0.001826,0.651397,0.86556,0.429017,0.43984,0.0,0.796105,0.342497,0.461542,0.00571
1,0.663798,2.05829,0.681435,1.054036,0.575352,-1.001445,0.462154,-0.833411,0.199734,0.215158,0.949988,0.618046,0.577324,0.0,0.962927,0.3338,1.455247,0.101246
2,1.792225,-1.099978,0.088109,0.573157,-0.472629,1.642084,1.203374,1.506731,0.457695,-0.640507,1.157024,1.585432,1.215963,0.0,1.113292,0.645729,0.721326,0.613326
3,0.893018,0.297782,-1.27487,1.316164,1.593303,0.672115,0.307014,-1.189868,0.064561,0.430909,1.162625,0.548821,0.418897,0.163908,1.157707,0.298163,0.803802,0.038902
4,1.338997,0.350023,-1.51851,1.482963,-0.491807,0.34017,0.415071,-1.292034,0.240712,0.611775,1.307798,0.697804,0.473487,0.429977,1.287935,0.330327,0.717237,0.003147


### Finding missing or invalid entries in the dataset

In [None]:
x_train.isna().sum() # Checking if there are missing values in the dataset

lepton_1_pT                 0
lepton_1_eta                0
lepton_1_phi                0
lepton_2_pT                 0
lepton_2_eta                0
lepton_2_phi                0
missing_energy_magnitude    0
missing_energy_phi          0
MET_rel                     0
axial_MET                   0
M_R                         0
M_TR_2                      0
R                           0
MT2                         0
S_R                         0
M_Delta_R                   0
dPhi_r_b                    0
cos(theta_r1)               0
dtype: int64

It can be seen that there is no missing data in the dataset. Hence , there is no process needed to handle null values. However, we might still have missing values or wrong data



In [None]:
x_train.dtypes # Checking the datatype of all columns in the dataset

lepton_1_pT                 float64
lepton_1_eta                float64
lepton_1_phi                float64
lepton_2_pT                 float64
lepton_2_eta                float64
lepton_2_phi                float64
missing_energy_magnitude    float64
missing_energy_phi          float64
MET_rel                     float64
axial_MET                   float64
M_R                         float64
M_TR_2                      float64
R                           float64
MT2                         float64
S_R                         float64
M_Delta_R                   float64
dPhi_r_b                    float64
cos(theta_r1)               float64
dtype: object

<h3> Findings from dtypes </h3>

*  All the datatypes are float64 except the Unnamed 0, which means that there is no data entered as NAN, or as other string formats. 

## Understanding the statistical distribution of data

In [None]:
np.round(x_train.describe(),2)

Unnamed: 0,lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_Delta_R,dPhi_r_b,cos(theta_r1)
count,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0,3500000.0
mean,1.0,0.0,0.0,1.0,0.0,-0.0,1.0,-0.0,1.0,-0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.22
std,0.69,1.0,1.0,0.65,1.0,1.0,0.87,1.0,0.89,1.0,0.63,0.58,0.47,0.86,0.62,0.62,0.44,0.2
min,0.25,-2.1,-1.73,0.43,-2.06,-1.73,0.0,-1.73,0.0,-15.34,0.27,0.0,0.0,0.0,0.03,0.0,0.0,0.0
25%,0.56,-0.76,-0.87,0.6,-0.77,-0.87,0.48,-0.87,0.37,-0.49,0.59,0.62,0.65,0.17,0.6,0.51,0.69,0.07
50%,0.79,0.0,-0.0,0.8,0.0,-0.0,0.77,-0.01,0.8,-0.08,0.83,0.88,0.93,0.9,0.84,0.91,1.09,0.17
75%,1.2,0.76,0.87,1.16,0.77,0.87,1.21,0.87,1.37,0.35,1.21,1.22,1.28,1.61,1.21,1.38,1.37,0.33
max,20.55,2.1,1.73,33.04,2.06,1.73,21.07,1.74,23.39,19.59,21.08,16.17,6.73,20.69,21.15,15.61,1.59,1.0



<h3> Findings from describe table </h3>
<body>


*  From the above table, it can be understood that the Unnamed : 0 is the index
* lepton_1_pt is left is right skewed as the mean and standard deviation and inclined towards the lower value of the range of values
* missing_energy_magnitude,MET_rel,M_R,R,S_R,M_delta_R,dphi_r_b
 also has a right skewness

*  The last column is the output column and it has to be seperated. 
*  To understand the dependency of other columns we will have to try other process







### Checking for duplicate values

In [None]:
x_train.duplicated().sum() # Checking if there are any duplicate entries in the dataset

0

There are no duplicate entries in the database




### Identifying the correlation of data

In [None]:
# Set a threshold to identify pairs more than that
correlation=x_train.corr().T

# Identifying columns with high correlation

high_correlation_list=[]
threshold=0.7
# Iterating through the rows and columns of the correlation matrix to check if there are any columns or parameters that are highly correlated

for x_iter  in correlation.columns:
  for y_iter in correlation.index:
    if x_iter!=y_iter:
      if (correlation[x_iter][y_iter]>threshold) & (correlation[x_iter][y_iter]>(threshold*-1)):
        high_correlation_list.append((x_iter,y_iter,correlation[x_iter][y_iter]))

for element in high_correlation_list:
    print(element)

('lepton_1_pT', 'M_R', 0.8516937529653643)
('lepton_1_pT', 'M_TR_2', 0.7242294648547011)
('lepton_1_pT', 'S_R', 0.8116162751408211)
('lepton_2_pT', 'M_R', 0.7974893534011629)
('lepton_2_pT', 'S_R', 0.79932499694437)
('missing_energy_magnitude', 'MET_rel', 0.7058940094586493)
('missing_energy_magnitude', 'M_TR_2', 0.7217469748772937)
('MET_rel', 'missing_energy_magnitude', 0.7058940094586493)
('MET_rel', 'M_Delta_R', 0.74856879269838)
('M_R', 'lepton_1_pT', 0.8516937529653643)
('M_R', 'lepton_2_pT', 0.7974893534011629)
('M_R', 'S_R', 0.9813072099983999)
('M_TR_2', 'lepton_1_pT', 0.7242294648547011)
('M_TR_2', 'missing_energy_magnitude', 0.7217469748772937)
('MT2', 'M_Delta_R', 0.808811052489197)
('S_R', 'lepton_1_pT', 0.8116162751408211)
('S_R', 'lepton_2_pT', 0.79932499694437)
('S_R', 'M_R', 0.9813072099983999)
('M_Delta_R', 'MET_rel', 0.74856879269838)
('M_Delta_R', 'MT2', 0.808811052489197)


### Checking for skewness of the data


In [None]:
# It was visible from the datat distribution that certain parameters were skewed to the right. 
# Using the nibuily skew function to check if the parameters are skewed or not

for column in x_train.columns:
  if (train_data[column].skew()>1) | (train_data[column].skew()<-1):
    print(column,'  ',train_data[column].skew())
print( ' The above attributes are skewed ')

lepton_1_pT    2.860452539757907
lepton_2_pT    3.522050030748477
missing_energy_magnitude    3.1158771071898284
MET_rel    2.266883264480619
axial_MET    1.53082620504899
M_R    2.8762447920635874
M_TR_2    2.381011555541814
S_R    2.8965969645166187
cos(theta_r1)    1.1415481567512238
 The above attributes are skewed 


### Next steps

* Although it high correaltion and skewness are seen in the above exploration, we have'nt dropeed them as we would like to see the performance of the models with and without these corrections made. 




### Dividing the dataset into train and test.

* A random state of 7 is used which will be followed throughout in this process


In [11]:
x_train,x_test,y_train,y_test=train_test_split(x_train,y_train,test_size=0.25,random_state=7)

### Creating a subset of the dataset to understand which models perform better and which doesn't

In [13]:
x_train_subset,x_test_subset,y_train_subset,y_test_subset=train_test_split(x_train,y_train,test_size=0.40,random_state=7)

### Trying different models to see its performance
Here we have tried using
* Logistic Regression
* Decision Tree classifier
* Random Forests of Logistic Regressions
* Random Forests of Decision Tree Classifiers
* Multi Layered Perceptron using sklearn
* Multi Layered Perceptron using tensorflow

In [None]:
x_train_subset.shape

(1575000, 18)

In [None]:
#Creating lists to store results
# This is later used to create a dataframe to summarise all the models created
model_name=[]
score=[]
run_time=[]

##### Logistic regression and Hyper parameter Tuning


In [None]:
start=time.time()
lgR=LogisticRegression(random_state=7)
lgR.fit(x_train_subset,y_train_subset)
t=time.time()-start
model_name.append('Logistic Regression')
score.append(lgR.score(x_test_subset,y_test_subset))
run_time.append(t)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Decision tree classifier



In [None]:
start=time.time()
dtClassifier=DecisionTreeClassifier(random_state=7)
dtClassifier.fit(x_train_subset,y_train_subset)
t=time.time()-start
model_name.append('Decision Tree')
score.append(dtClassifier.score(x_test_subset,y_test_subset))
run_time.append(t)

#### Random Forest of Linear Regressions



In [None]:
start=time.time()
lr_random_forest=BaggingClassifier(base_estimator=lgR,n_estimators=10,random_state=17)
lr_random_forest.fit(x_train_subset,y_train_subset)
t=time.time()-start
model_name.append('Random Forest of Logistic Regressions')
score.append(lr_random_forest.score(x_test_subset,y_test_subset))
run_time.append(t)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

#### Random Forest of Decision Trees



In [None]:
start=time.time()
lr_random_forest=BaggingClassifier(base_estimator=dtClassifier,n_estimators=10,random_state=17)
lr_random_forest.fit(x_train_subset,y_train_subset)
t=time.time()-start
model_name.append('Random Forest of Decsion Tree Classifiers')
score.append(lr_random_forest.score(x_test_subset,y_test_subset))
run_time.append(t)

### Multi Layered Perceptron



In [None]:
model_name

['Logistic Regression',
 'Decision Tree',
 'Random Forest of Logistic Regressions',
 'Random Forest of Decsion Tree Classifiers']

In [15]:
start=time.time()
mlp=MLPClassifier(random_state=7,max_iter=100,solver='sgd')
mlp.fit(x_train_subset,y_train_subset)
t=time.time()-start
model_name.append('Multi Layered Perceptron')
score.append(mlp.score(x_test_subset,y_test_subset))
run_time.append(t)

In [None]:
results=pd.DataFrame()
results['Model']=model_name
results['Score']=score
results['Train time']=run_time
results

Unnamed: 0,Model,Score,Train time
0,Logistic Regression,0.787643,28.858265
1,Decision Tree,0.71467,129.081156
2,Random Forest of Logistic Regressions,0.787667,288.906142
3,Random Forest of Decsion Tree Classifiers,0.781822,838.045714
4,Multi Layered Perceptron,0.801927,445.749412


## Findings

* Out of all the models tried, Multi Layered Perceptron seems to be having the best prediction. 
Hence we will be using the same model for further fine tuning and hypter parameter adjustments.

* It can also be seen that the MLP takes lower time than a random forest of decision trees and gives the best accuracy on the training subset data

# Using tensorflow

In [None]:
# Creating a keras model

import tensorflow as tf
import keras
from keras import Sequential,layers



model = keras.Sequential([
    layers.Dense(19, activation='sigmoid', input_shape=(x_train.shape[1],)),
    layers.Dense(100,activation='sigmoid'),
    layers.Dense(50, activation='sigmoid'),
    layers.Dense(25, activation='sigmoid'),
    layers.Dense(10, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',loss='logistic',metrics=['Accuracy'])

In [None]:
dlModel = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=100,
    epochs=50,
    verbose=1
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
df=pd.read_csv('test.csv')
out=pd.DataFrame()
out['Id']=df[df.columns[0]]
df.drop([df.columns[0]],axis=1,inplace=True)
prediction=model.predict(df)
output=[]
for pred in prediction:
  if pred<=0.80:
    output.append(float(0))
  else:
    output.append(float(1))

out['class']=output
out.to_csv('tensorflow81.csv',index=False)

Although the model was able to get good accuracy, the performance of the model on the test dataset in kaggle was pretty low.Hence excluding this model from the final submission

#Creating a multi layered perceptron

### Creating a Multi Layered Perceptron for testing the model

* Training the model with more data

In [None]:
x_train_subset.shape

(1575000, 18)

#### Since the MLP took 443 seconds to train on a dataset of size 1.5Million, taking a further subset of the dataset using stratified sampling to find the best hyper parameters. 

In [18]:
#Creating a smaller subset of data
x_train_micro,x_test_micro,y_train_micro,y_test_micro=train_test_split(x_train_subset,y_train_subset,test_size=0.8,stratify=y_train_subset)

In [None]:
x_train_micro.shape

(315000, 18)

## Using Gridsearch cv for hyper parameter tuning of the MLP model

In [16]:
hidden_layer_sizes=[10,12,15,18,20]
solver=['sgd','adam']
activation=['identify','logistic','relu','tanh']
batch_size=[100,300,600,1000]
learning_rate=['constant','invscaling','adaptive']

grid={'hidden_layer_sizes':hidden_layer_sizes,
      'solver':solver,
      'activation':activation,
      'batch_size':batch_size,
      'learning_rate':learning_rate}

random_search=RandomizedSearchCV(estimator=mlp,param_distributions=grid,
                                 n_iter=30,random_state=7,n_jobs=-1,verbose=1)

In [19]:
mlp_search=random_search.fit(x_train_micro,y_train_micro)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


20 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 752, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 384, in _fit
    self._validate_hyperparameters()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 49

In [20]:
mlp_search.best_params_

{'activation': 'tanh',
 'batch_size': 300,
 'hidden_layer_sizes': 18,
 'learning_rate': 'constant',
 'solver': 'adam'}

# Creating a model with the best params as specified by grid search

In [21]:
model=MLPClassifier(hidden_layer_sizes=18,random_state=7,max_iter=100,solver='sgd',batch_size=100)

In [22]:
model.fit(x_train_micro,y_train_micro)

MLPClassifier(batch_size=100, hidden_layer_sizes=18, max_iter=100,
              random_state=7, solver='sgd')

In [23]:
model.score(x_test_micro,y_test_micro)

0.8005539682539683

## Tweaking he hyper parameters more, as the Gridsearch CV Results was not performing well on the test data

In [None]:

model_full_data_tuned=MLPClassifier(hidden_layer_sizes=18,random_state=7,max_iter=100,solver='sgd')
model_full_data_tuned.fit(x_train,y_train)
model_full_data_tuned.score(x_test,y_test)

0.8019097142857143

In [None]:

model_full_data_tuned1=MLPClassifier(hidden_layer_sizes=15,random_state=7,max_iter=200,solver='sgd')
model_full_data_tuned1.fit(x_train,y_train)
model_full_data_tuned1.score(x_test,y_test)

0.8018697142857143

In [None]:

model_full_data_tuned2=MLPClassifier(hidden_layer_sizes=18,random_state=7,max_iter=100,solver='sgd',activation='tanh')
model_full_data_tuned2.fit(x_train,y_train)
model_full_data_tuned2.score(x_test,y_test)

0.80184

In [None]:

model_full_data_tuned3=MLPClassifier(hidden_layer_sizes=18,random_state=7,max_iter=100,solver='sgd',activation='tanh',learning_rate='constant')
model_full_data_tuned3.fit(x_train,y_train)
model_full_data_tuned3.score(x_test,y_test)

0.80184

In [None]:

model_full_data4=MLPClassifier(random_state=7,max_iter=100,solver='sgd')
model_full_data4.fit(x_train,y_train)
model_full_data4.score(x_test,y_test)

0.8028171428571429

In [None]:

model_full_data5=MLPClassifier(random_state=7,max_iter=50,solver='sgd')
model_full_data5.fit(x_train,y_train)
model_full_data5.score(x_test,y_test)

0.8028171428571429

In [None]:
model_full_data6=MLPClassifier(random_state=7,max_iter=100,solver='sgd')
model_full_data6.fit(x_train,y_train)
model_full_data6.score(x_test,y_test)

0.8028171428571429

In [None]:
model_full_data_7=MLPClassifier(random_state=7,max_iter=100,solver='adam')
model_full_data_7.fit(x_train,y_train)
model_full_data_7.score(x_test,y_test)

0.8035085714285715

In [None]:
model_full_data_8=MLPClassifier(random_state=7,max_iter=100,solver='adam',activation='logistic')
model_full_data_8.fit(x_train,y_train)
model_full_data_8.score(x_test,y_test)

0.8037977142857143

In [None]:
model_full_data_9=MLPClassifier(random_state=7,max_iter=100,solver='adam',activation='logistic',learning_rate='adaptive')
model_full_data_9.fit(x_train,y_train)
model_full_data_9.score(x_test,y_test)

0.8037977142857143

In [None]:
# Normalising the data and applying the above model again

In [None]:
x_train=train_data.drop([train_data.columns[0],'class'],axis=1)
y_train=train_data['class']
x_train=x_train.apply(zscore)
x_train,x_test,y_train,y_test=train_test_split(x_train,y_train,test_size=0.25,random_state=7)

model_full_data_10=MLPClassifier(random_state=7,max_iter=100,solver='adam',activation='logistic')
model_full_data_10.fit(x_train,y_train)
model_full_data_10.score(x_test,y_test)

0.8039668571428571

# Submission using the result obtained in las titeration where the accuracy was the maximum so far. 
 

## submission1(1).csv


In [None]:
df=pd.read_csv('test.csv')
out=pd.DataFrame()
out['Id']=df[df.columns[0]]
df.drop([df.columns[0]],axis=1,inplace=True)
prediction=model_full_data6.predict(df)
out['class']=prediction

In [None]:
out.to_csv('submission1.csv',index=False)

## Submission2.csv

In [None]:
df=pd.read_csv('test.csv')
out=pd.DataFrame()
out['Id']=df[df.columns[0]]
df.drop([df.columns[0]],axis=1,inplace=True)
prediction=model_full_data_7.predict(df)
out['class']=prediction
out.to_csv('submission2.csv',index=False)



```
# This is formatted as code
```

## Submission3.csv

In [None]:
df=pd.read_csv('test.csv')
out=pd.DataFrame()
out['Id']=df[df.columns[0]]
df.drop([df.columns[0]],axis=1,inplace=True)
prediction=model_full_data_8.predict(df)
out['class']=prediction
out.to_csv('submission3.csv',index=False)

## Submission4.csv


In [None]:
df=pd.read_csv('test.csv')
out=pd.DataFrame()
out['Id']=df[df.columns[0]]
df.drop([df.columns[0]],axis=1,inplace=True)
prediction=model_full_data_10.predict(df)
out['class']=prediction
out.to_csv('submission4.csv',index=False)