# This notebook shows how each model generates the data that is already saved in the CSVs/FinalExperiment/ folder

In [None]:
%load_ext autoreload
%autoreload 2
import sys
# Add the parent directory to the system path
sys.path.insert(0, '..')
import numpy as np
import matplotlib
from matplotlib.patches import Rectangle
from matplotlib.backends.backend_pgf import FigureCanvasPgf
# Register the PGF canvas as the handler for 'pdf' output.
matplotlib.backend_bases.register_backend('pdf', FigureCanvasPgf)

# Typesetting options, applied one key at a time in the same order as before.
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['pgf.rcfonts'] = False
# Packages loaded into the LaTeX preamble used for figure text.
matplotlib.rcParams['text.latex.preamble'] = '\\usepackage[dvips]{graphicx}\\usepackage{xfrac}\\usepackage{amssymb}\\usepackage{lmodern}'

'''
Each model class should have the functions
.train(data_csv)
.generate(n=5000) 
.gen_to_csv()
'''


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'\nEach model class should have the functions\n.train(data_csv)\n.generate(n=5000) \n.gen_to_csv()\n'

In [None]:
from PY.Plotting import FieldSequencesPlotter
# Settings shared by the model cells below.
N = 1000      # sample count handed to each model's generate() call
ROUNDS = 50   # inner-loop repetitions per hyper-parameter value

# Reference data that every generated set is compared against.
R = FieldSequencesPlotter('RealScapyHeaders.csv', val_set=0.2)

# model name -> list of (parameter, round, score2) tuples
score2s_dict = dict()
# model name -> (entropies, lengths) taken from the final plotter of each cell
len_ent_dict = dict()

In [None]:
from Models.GeneticModel import GeneticModel

# Sweep the `reg` argument of GeneticModel.generate(). Each round generates N
# samples, calls gen_to_csv() (expected to write GeneticGeneratedHeaders.csv),
# reloads that file and scores it against the reference data R.
score2s_dict['Genetic'] = []

# (reg, round index, score2) of the best-scoring round seen so far.
best = (0, 0, 0)
model = GeneticModel()
model.train(R)
for reg in list(range(9)) + [100000]:
    print(reg)
    # `rnd`, not `round`: a global named `round` would shadow the builtin
    # round() for every later cell in the notebook.
    for rnd in range(ROUNDS):
        model.generate(N=N, reg=reg)
        model.gen_to_csv()
        P = FieldSequencesPlotter('GeneticGeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['Genetic'].append((reg, rnd, P.score2))
        if P.score2 > best[2]:
            best = (reg, rnd, P.score2)

print(best)

# Regenerate once with the best `reg` on a freshly trained model and keep the
# entropy/length statistics of that run. The best value gets its own name so
# the loop variable is not silently overwritten.
best_reg = best[0]
model = GeneticModel()
model.train(R)
model.generate(N=N, reg=best_reg)
model.gen_to_csv()
P = FieldSequencesPlotter('GeneticGeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['Genetic'] = (P.entropies, P.lengths)

0
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, self.T=472, self.Gn=659, self.G=1000, self.score2=2.3684017094017094
self.TnG=97, s

In [None]:
from Models.GeneticModel2 import GeneticModel2

# Sweep the `reg` argument of GeneticModel2.generate(). Each round generates N
# samples, calls gen_to_csv() (expected to write Genetic2GeneratedHeaders.csv),
# reloads that file and scores it against the reference data R.
score2s_dict['Genetic2'] = []

# (reg, round index, score2) of the best-scoring round seen so far.
best = (0, 0, 0)
model = GeneticModel2()
model.train(R)
for reg in list(range(9)) + [100000]:
    print(reg)
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(ROUNDS):
        model.generate(N=N, reg=reg)
        model.gen_to_csv()
        P = FieldSequencesPlotter('Genetic2GeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['Genetic2'].append((reg, rnd, P.score2))
        if P.score2 > best[2]:
            best = (reg, rnd, P.score2)

print(best)

# Regenerate once with the best `reg`. This must be a GeneticModel2: the
# previous code built a GeneticModel here, so the statistics stored under
# 'Genetic2' did not come from a fresh Genetic2 run.
best_reg = best[0]
model = GeneticModel2()
model.train(R)
model.generate(N=N, reg=best_reg)
model.gen_to_csv()
P = FieldSequencesPlotter('Genetic2GeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['Genetic2'] = (P.entropies, P.lengths)

In [None]:
from Models.MarkovModel import MarkovModel

# Sweep the `reg` constructor argument of MarkovModel. A new model is built
# and trained for every value; each round generates N samples, calls
# gen_to_csv() (expected to write MarkovGeneratedHeaders.csv), reloads that
# file and scores it against the reference data R.
score2s_dict['1stOrderMarkov'] = []

# (reg, round index, score2) of the best-scoring round seen so far.
best = (0, 0, 0)
for reg in list(range(9)) + [100000]:
    print(reg)
    model = MarkovModel(reg=reg)
    model.train(R)
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(ROUNDS):
        model.generate(N=N)
        model.gen_to_csv()
        P = FieldSequencesPlotter('MarkovGeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['1stOrderMarkov'].append((reg, rnd, P.score2))
        if P.score2 > best[2]:
            best = (reg, rnd, P.score2)

print(best)

# Rebuild the model with the best `reg` and keep the entropy/length
# statistics of one more generated set.
best_reg = best[0]
model = MarkovModel(reg=best_reg)
model.train(R)
model.generate(N=N)
model.gen_to_csv()
P = FieldSequencesPlotter('MarkovGeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['1stOrderMarkov'] = (P.entropies, P.lengths)

In [None]:
from Models.MarkovModel2 import MarkovModel2

# Sweep the `reg` constructor argument of MarkovModel2. A new model is built
# and trained for every value; each round generates N samples, calls
# gen_to_csv() (expected to write Markov2GeneratedHeaders.csv), reloads that
# file and scores it against the reference data R.
score2s_dict['2ndOrderMarkov'] = []

# Reset the tracker for this model. Without this the comparison started from
# whatever `best` an earlier cell left behind.
best = (0, 0, 0)
for reg in list(range(9)) + [100000]:
    print(reg)
    model2 = MarkovModel2(reg=reg)
    model2.train(R)
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(ROUNDS):
        model2.generate(N=N)
        model2.gen_to_csv()
        P2 = FieldSequencesPlotter('Markov2GeneratedHeaders.csv')
        P2.comparative_statistics(R)
        score2s_dict['2ndOrderMarkov'].append((reg, rnd, P2.score2))
        if P2.score2 > best[2]:
            # Record this model's score (P2), not the plotter of another cell.
            best = (reg, rnd, P2.score2)

print(best)

# Rebuild the model with the best `reg` and keep the entropy/length
# statistics of one more generated set.
best_reg = best[0]
model2 = MarkovModel2(reg=best_reg)
model2.train(R)
model2.generate(N=N)
model2.gen_to_csv()
P2 = FieldSequencesPlotter('Markov2GeneratedHeaders.csv')
P2.self_statistics()
# Run the comparison on P2 itself; it was previously run on `P`.
P2.comparative_statistics(R)
len_ent_dict['2ndOrderMarkov'] = (P2.entropies, P2.lengths)

In [None]:
from Models.GANModel import GANModel

# Sweep the sampling temperature of one trained GANModel. Each round generates
# N samples, calls gen_to_csv() (expected to write GANGeneratedHeaders.csv),
# reloads that file and scores it against the reference data R.
score2s_dict['GANModel'] = []

model = GANModel()
model.train(R)

# (temperature, round index, score2) of the best round for THIS model.
# Starting at -inf guarantees the first round sets a real temperature from the
# grid instead of inheriting `best` from an earlier cell.
best = (None, None, float('-inf'))
for temp in np.linspace(0.5, 2, 10):
    print(temp)
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(ROUNDS):
        model.generate(N=N, temperature=temp)
        model.gen_to_csv()
        P = FieldSequencesPlotter('GANGeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['GANModel'].append((temp, rnd, P.score2))
        if P.score2 > best[2]:
            # Store the temperature being swept; this used to store a stale
            # `reg` left over from a previous cell.
            best = (temp, rnd, P.score2)

print(best)

# Regenerate with the best temperature. The previous code used the last value
# of the loop variable here.
best_temp = best[0]
model.generate(N=N, temperature=best_temp)
model.gen_to_csv()
P = FieldSequencesPlotter('GANGeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['GANModel'] = (P.entropies, P.lengths)

In [None]:
from Models.TransformerModel import TransformerModel

# Sweep the sampling temperature of one trained TransformerModel. Each round
# generates N samples, calls gen_to_csv() (expected to write
# TransformerGeneratedHeaders.csv), reloads that file and scores it against
# the reference data R.
score2s_dict['TransformerModel'] = []

model = TransformerModel()
model.train(R)

# (temperature, round index, score2) of the best round for THIS model.
# Reset here: a higher score left in `best` by an earlier cell would otherwise
# stop this sweep from ever selecting its own temperature.
best = (None, None, float('-inf'))
for temp in np.linspace(0.5, 2, 10):
    print(temp)
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(ROUNDS):
        model.generate(n=N, temperature=temp)
        model.gen_to_csv()
        P = FieldSequencesPlotter('TransformerGeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['TransformerModel'].append((temp, rnd, P.score2))
        if P.score2 > best[2]:
            best = (temp, rnd, P.score2)

print(best)

# Regenerate with the best temperature and keep the entropy/length statistics.
best_temp = best[0]
model.generate(n=N, temperature=best_temp)
model.gen_to_csv()
P = FieldSequencesPlotter('TransformerGeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['TransformerModel'] = (P.entropies, P.lengths)

In [None]:
from Models.LlamaModel import LlamaModel

# Sweep the sampling temperature of one trained LlamaModel. Each round
# generates N samples, calls gen_to_csv() (expected to write
# LlamaGeneratedHeaders.csv), reloads that file and scores it against the
# reference data R.
score2s_dict['LlamaModel'] = []

model = LlamaModel()
model.train(R)

# This cell runs a single round per temperature, unlike the other sweeps that
# use ROUNDS (presumably because generation is slow here - TODO confirm).
LLAMA_ROUNDS = 1

# (temperature, round index, score2) of the best round for THIS model.
# Previously `best` was never updated in this cell, so the final generation
# used a temperature selected for a different model.
best = (None, None, float('-inf'))
for temp in np.linspace(0.5, 2, 10):
    # `rnd`, not `round`: a global named `round` would shadow the builtin.
    for rnd in range(LLAMA_ROUNDS):
        model.generate(n=N, temperature=temp)
        model.gen_to_csv()
        P = FieldSequencesPlotter('LlamaGeneratedHeaders.csv')
        P.comparative_statistics(R)
        score2s_dict['LlamaModel'].append((temp, rnd, P.score2))
        # Report every run; the old print sat outside the loops and only
        # showed the last one.
        print(temp, rnd, P.score2)
        if P.score2 > best[2]:
            best = (temp, rnd, P.score2)

print(best)

# Regenerate with the best temperature and keep the entropy/length statistics.
best_temp = best[0]
model.generate(n=N, temperature=best_temp)
model.gen_to_csv()
P = FieldSequencesPlotter('LlamaGeneratedHeaders.csv')
P.self_statistics()
P.comparative_statistics(R)
len_ent_dict['LlamaModel'] = (P.entropies, P.lengths)

In [None]:
# Flatten the collected results into two CSV files.
from pathlib import Path

import pandas as pd

# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before writing.
out_dir = Path('CSVs')
out_dir.mkdir(parents=True, exist_ok=True)

# One row per (model, parameter, round). Comprehension variables stay local,
# so the notebook-level `model` and `reg` are no longer overwritten.
# Note: "Regularisation" holds the temperature for the models that were swept
# over temperature.
score_columns = ["Model", "Regularisation", "Round", "score2"]
score_rows = [
    {
        "Model": model_name,
        "Regularisation": param,
        "Round": rnd,
        "score2": score2,
    }
    for model_name, runs in score2s_dict.items()
    for param, rnd, score2 in runs
]

# Passing the columns explicitly keeps the header even if no rows exist.
scores_df = pd.DataFrame(score_rows, columns=score_columns)

print(scores_df)
scores_df.to_csv(out_dir / 'model_score2s.csv', index=False)

# One row per (entropy, length) pair from each model's final generated set.
entropy_length_rows = [
    [model_name, entropy, length]
    for model_name, (entropies, lengths) in len_ent_dict.items()
    for entropy, length in zip(entropies, lengths)
]

entropy_lengths_df = pd.DataFrame(
    entropy_length_rows, columns=['Model', 'Entropy', 'Length']
)

# Save to CSV
entropy_lengths_df.to_csv(out_dir / "model_entropy_lengths.csv", index=False)