In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error
import scienceplots
plt.style.use(["science","nature"])
plt.rcParams.update({"font.size": 12,
                    "xtick.labelsize": 12,
                    "ytick.labelsize": 12,
                    "axes.labelsize": 12,
                    'legend.fontsize': 12})
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("datas/dataforF10_all.csv",index_col=0)
data=data[data.index>="2008-12"] # Only cycl2 24 and 25
data=data.rename(columns={"f10.7_index":"f107",
                  "Scalar B, nT":"B",
                  "SW Plasma Temperature, K":"swT",
                  "SW Proton Density, N/cm^3":"swN",
                  "SW Plasma Speed, km/s":"swV",
                  "Alpha/Prot. ratio":"APr"})
data.head(7)

Unnamed: 0,Shannon Entropy,Sample Entropy,Permutation Entropy,Spectral Entropy,Approximate Entropy,Higuchi Fractal Dim.,Katz Fractal Dim.,Petrosian Fractal Dim.,Lempel-Ziv Complexity,Hurst Exponent,f107,B,swT,swN,swV,APr
2008-12-01,1.926398,1.676377,0.992373,0.846315,0.715731,1.822438,2.483865,1.048228,30.0,1.15356,66.2,2.7,18747.0,3.2,320.0,0.008
2008-12-02,1.917908,1.65999,0.991421,0.853494,0.720873,1.822833,2.277169,1.047455,30.0,1.105687,67.0,2.8,18158.0,5.9,293.0,0.006
2008-12-03,1.892595,1.637609,0.988603,0.86249,0.717824,1.836808,2.401924,1.046681,30.0,1.244803,67.2,5.7,49807.0,11.7,341.0,0.019
2008-12-04,1.992342,1.644529,0.988046,0.870467,0.744367,1.846679,2.280404,1.046681,30.0,0.907205,67.6,6.5,93434.0,5.0,443.0,0.017
2008-12-05,1.957088,1.649984,0.988046,0.877329,0.744225,1.857465,2.115726,1.046681,29.0,1.093554,66.8,7.4,79760.0,4.9,406.0,0.014
2008-12-06,1.925257,1.656585,0.991136,0.883274,0.766831,1.866816,2.530336,1.047455,29.0,1.216796,67.1,5.7,175936.0,4.3,506.0,0.019
2008-12-07,1.989369,1.810109,0.991136,0.887961,0.441724,1.878538,2.911458,1.047455,29.0,1.250991,67.0,3.4,111586.0,2.7,548.0,0.016


In [3]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#scaler = StandardScaler()
#dataq = pd.DataFrame(quantile_transformer.fit_transform(data),index=data.index,columns=data.columns)
#dataq = pd.DataFrame(scaler.fit_transform(data),index=data.index,columns=data.columns)
#data=dataq.copy()

In [4]:
N=-1
X = data.drop(["f107"], axis=1)[1:N]
X1 = data.drop(["f107","B","swT","swN","swV","APr"], axis=1)[1:N]
X2 = data.drop(["f107","Shannon Entropy","Sample Entropy","Permutation Entropy",
                "Spectral Entropy","Approximate Entropy","Higuchi Fractal Dim.",
                "Katz Fractal Dim.","Petrosian Fractal Dim.","Lempel-Ziv Complexity","Hurst Exponent"], axis=1)[1:N]
y = data['f107'][1:N]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.35, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.35, random_state=42)

#Model 0: Model with all variables: heliophysis + Topological
#Model 1: Model with only Topological variables 
#Model 1: Model with only heliophysis variables 

In [5]:
X.head()

Unnamed: 0,Shannon Entropy,Sample Entropy,Permutation Entropy,Spectral Entropy,Approximate Entropy,Higuchi Fractal Dim.,Katz Fractal Dim.,Petrosian Fractal Dim.,Lempel-Ziv Complexity,Hurst Exponent,B,swT,swN,swV,APr
2008-12-02,1.917908,1.65999,0.991421,0.853494,0.720873,1.822833,2.277169,1.047455,30.0,1.105687,2.8,18158.0,5.9,293.0,0.006
2008-12-03,1.892595,1.637609,0.988603,0.86249,0.717824,1.836808,2.401924,1.046681,30.0,1.244803,5.7,49807.0,11.7,341.0,0.019
2008-12-04,1.992342,1.644529,0.988046,0.870467,0.744367,1.846679,2.280404,1.046681,30.0,0.907205,6.5,93434.0,5.0,443.0,0.017
2008-12-05,1.957088,1.649984,0.988046,0.877329,0.744225,1.857465,2.115726,1.046681,29.0,1.093554,7.4,79760.0,4.9,406.0,0.014
2008-12-06,1.925257,1.656585,0.991136,0.883274,0.766831,1.866816,2.530336,1.047455,29.0,1.216796,5.7,175936.0,4.3,506.0,0.019


In [6]:
import tpot
from tpot import TPOTRegressor

Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.


In [7]:
print('tpot: %s' % tpot.__version__)

tpot: 1.0.0


In [8]:
#########################
GENERATIONS = 5
POPULATION = 10
CROSSVALIDATION_SPLIT = 5
#########################
tpot = TPOTRegressor(verbose=4, 
                     #max_time_mins=10, 
                     n_jobs=4, 
                     generations=GENERATIONS, 
                     cv=CROSSVALIDATION_SPLIT)
tpot.fit(X_train, y_train)

Generation:   0%|          | 0/5 [00:00<?, ?it/s]Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Generation:  20%|██        | 1/5 [33:18<2:13:13, 1998.30s/it]

Generation:  1
Best mean_squared_error score: -221.37032846306752


Generation:  40%|████      | 2/5 [54:08<1:17:55, 1558.45s/it]

Generation:  2
Best mean_squared_error score: -201.82592501557792
 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7a8b1836a890>, 'time': 1751807231.359239} 

 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7a8b12863e00>, 'time': 1751807285.4303267} 

 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7a8b1ab35a20>, 'time': 1751807351.5174356} 



Generation:  60%|██████    | 3/5 [1:00:07<40:04, 1202.37s/it]

Generation:  3
Best mean_squared_error score: -194.72237697564395



2025-07-06 08:09:27,606 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:34897' caused the cluster to lose scattered data, which can't be recovered: {'DataFrame-b2de192ab9da4851cec2103f6c00d56a', 'Series-a0b2dc40dcbdb95016847247a81753f7'} (stimulus_id='handle-worker-cleanup-1751807367.6058147')


 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca56ad5a660> 
 Found array with 0 feature(s) (shape=(3006, 0)) while a minimum of 1 is required by PCA. 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca56ae57a80> 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca56ae57490> 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca56abda9f0> 
 X contains negative values. 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca592ebecf0> 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7ca56996d190> 





In [9]:
print(tpot.fitted_pipeline_)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('variancethreshold',
                 VarianceThreshold(threshold=0.0129262095647)),
                ('featureunion-1',
                 FeatureUnion(transformer_list=[('featureunion',
                                                 FeatureUnion(transformer_list=[('zerocount',
                                                                                 ZeroCount())])),
                                                ('passthrough',
                                                 Passthrough())])),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('featureunion',
                                                 FeatureU...
                                                                                                                              num_leaves=66,
                                                                                                                      

In [10]:
exctracted_best_model=tpot.fitted_pipeline_.steps[-1][1]

In [11]:
exctracted_best_model.fit(X_train, y_train)

Saving model...

In [12]:
import pickle
# save the model to disk
filename = 'datas/finalized_model_tpot_cycle_24_25_f107.sav'
pickle.dump(exctracted_best_model, open(filename, 'wb'))
 
# some time later...
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8084919346262394


In [13]:
exctracted_best_model

In [14]:
loaded_model