## Data partitioning

In this notebook we will take the conditioned data that we have, the data should be in the form of a two collumn csv that has the block numvber on the left and the MEV quantity sorted in to none = 0, low = 1, medium = 2, and high =3. 

In [43]:
import numpy as np 
import pandas as pd
from numpy import genfromtxt
import multiprocessing as mp
from multiprocessing import Pool
from numba import jit
import json
import csv

In [44]:
#let pull the data in and so that we can check how big the files are. 

my_data = genfromtxt('../Ethdata/randomdatawgv.csv', delimiter=',')
print(my_data)
print(np.shape(my_data))


[[0.00000000e+00 2.00000000e+00 8.75339849e-01 8.70496361e-01]
 [1.00000000e+00 0.00000000e+00 6.64251242e-01 9.95776649e-01]
 [2.00000000e+00 2.00000000e+00 5.17810565e-01 8.59385246e-01]
 ...
 [9.99700000e+03 1.00000000e+00 3.13720353e-01 7.07850871e-01]
 [9.99800000e+03 3.00000000e+00 5.12175624e-01 5.64553907e-01]
 [9.99900000e+03 3.00000000e+00 5.54034209e-02 3.86228289e-01]]
(10000, 4)


#what this function will do is take one data point and the next two after, the next one after will be used with that one for the x part of the data and the third will be the y position, 
#the awnser that we are asking here is can you predict the next MEV quantity based on the two before. This might look a little ardious, however doing it this way all in np, is much 
#much mich faster than trying to do it any other way, a for loop over 10000000 data entries would just take way too long. 

datalets = np.zeros((np.shape(my_data)[0],3))

datalets[:,0] = my_data[:,1]

my_data2 = np.delete(my_data[:,1],0,0)

my_data2 = np.append(my_data2,[0])

datalets[:,1] = my_data2

my_data3 = np.delete(my_data[:,1],[0,1],0)

my_data3 = np.append(my_data3,[0,0])

datalets[:,2] = my_data3

#lastly we need to delete the last three rows, as I had to just input zeros there to to that the array didn't have any missing point,

datalets = np.delete(datalets,[-1,-2,-3],0)

print(datalets)


In [45]:
# Ok now lets automate doing that a bunch more times so we aren't doing this for every datalet length. 

#start by making a dictionary to hold all this data

datalets = {}

# Then we loop over, the default is set from 2-10, this is the length of hte data length, the x vector will be one less than this. 

for i in range(2,10):
    
    # Start with an all zeros np array so we can get started, it will have i columns because that is the number of 
    
    datalet = np.zeros((np.shape(my_data)[0],i*3))
    
    for j in range(i):
        data = np.delete(my_data[:,1],range(j),0)
        data = np.append(data,range(j))
        data = data.astype(int)
        datalet[:,j] = data

        data = np.delete(my_data[:,2],range(j),0)
        data = np.append(data,range(j))
        data = data.astype(float)
        datalet[:,i+j] = data        
        
        data = np.delete(my_data[:,3],range(j),0)
        data = np.append(data,range(j))
        data = data.astype(float)
        datalet[:,2*i+j] = data        
               
    datalet = datalet.astype(float)
    datalet = np.delete(datalet,range(i*(-1),0),0)
        
    datalets["datalet{}".format(i)] = datalet

print(datalets)  

{'datalet2': array([[2.        , 0.        , 0.87533985, 0.66425124, 0.87049636,
        0.99577665],
       [0.        , 2.        , 0.66425124, 0.51781056, 0.99577665,
        0.85938525],
       [2.        , 1.        , 0.51781056, 0.68443699, 0.85938525,
        0.21365897],
       ...,
       [2.        , 1.        , 0.47889569, 0.28515119, 0.1297828 ,
        0.08832347],
       [1.        , 1.        , 0.28515119, 0.31372035, 0.08832347,
        0.70785087],
       [1.        , 3.        , 0.31372035, 0.51217562, 0.70785087,
        0.56455391]]), 'datalet3': array([[2.        , 0.        , 2.        , ..., 0.87049636, 0.99577665,
        0.85938525],
       [0.        , 2.        , 1.        , ..., 0.99577665, 0.85938525,
        0.21365897],
       [2.        , 1.        , 3.        , ..., 0.85938525, 0.21365897,
        0.12736657],
       ...,
       [0.        , 2.        , 1.        , ..., 0.50430803, 0.1297828 ,
        0.08832347],
       [2.        , 1.        , 1.     

## Saving these datalets as CSVs

In [46]:
for x,y in datalets.items():
    headers = []
    for i in range(int(np.shape(y)[1]/3)):
        headers.append('Mev_period{}'.format(i))
    for i in range(int(np.shape(y)[1]/3)):
        headers.append('gas_fees_period{}'.format(i))
    for i in range(int(np.shape(y)[1]/3)):
        headers.append('price_volitility_period{}'.format(i))
    headerstr = ','.join(map(str,headers))
    np.savetxt("..\dateletswgv\{}.csv".format(x), y, delimiter=",", header= headerstr, comments = '')