In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error
import scienceplots
plt.style.use(["science","nature"])
plt.rcParams.update({"font.size": 12,
                    "xtick.labelsize": 12,
                    "ytick.labelsize": 12,
                    "axes.labelsize": 12,
                    'legend.fontsize': 12})
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("datas/dataforDst_all.csv",index_col=0)
data=data[data.index>="2008-12"] # Only cycl2 24 and 25
data=data.rename(columns={"Dst-index":"Dst",
                  "Scalar B, nT":"B",
                  "SW Plasma Temperature, K":"swT",
                  "SW Proton Density, N/cm^3":"swN",
                  "SW Plasma Speed, km/s":"swV",
                  "Alpha/Prot. ratio":"APr"})
data.head(7)

Unnamed: 0,Shannon Entropy,Sample Entropy,Permutation Entropy,Spectral Entropy,Approximate Entropy,Higuchi Fractal Dim.,Katz Fractal Dim.,Petrosian Fractal Dim.,Lempel-Ziv Complexity,Hurst Exponent,Dst,B,swT,swN,swV,APr
2008-12-01,2.023154,1.441557,0.905802,0.820046,0.625788,1.846397,2.125888,1.038862,55.0,1.088886,0.0,2.7,18747.0,3.2,320.0,0.008
2008-12-02,2.004551,1.439217,0.912831,0.821028,0.632647,1.853645,2.432188,1.038862,55.0,1.162938,7.0,2.8,18158.0,5.9,293.0,0.006
2008-12-03,1.988424,1.459626,0.914263,0.82179,0.641902,1.860431,2.681439,1.038862,55.0,1.068092,7.0,5.7,49807.0,11.7,341.0,0.019
2008-12-04,1.973072,1.469676,0.906715,0.822571,0.669129,1.860308,2.335766,1.038072,55.0,1.198862,-6.0,6.5,93434.0,5.0,443.0,0.017
2008-12-05,1.883707,1.518466,0.906715,0.823104,0.680226,1.864978,2.60469,1.038072,54.0,1.112025,-14.0,7.4,79760.0,4.9,406.0,0.014
2008-12-06,1.969049,1.475907,0.906715,0.823503,0.670827,1.871677,3.133362,1.038072,55.0,1.262962,-15.0,5.7,175936.0,4.3,506.0,0.019
2008-12-07,1.97722,1.562918,0.898455,0.823876,0.704712,1.871115,3.232116,1.037281,54.0,1.108806,-9.0,3.4,111586.0,2.7,548.0,0.016


In [3]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#scaler = StandardScaler()
#dataq = pd.DataFrame(quantile_transformer.fit_transform(data),index=data.index,columns=data.columns)
#dataq = pd.DataFrame(scaler.fit_transform(data),index=data.index,columns=data.columns)
#data=dataq.copy()

In [4]:
N=-1
X = data.drop(["Dst"], axis=1)[1:N]
X1 = data.drop(["Dst","B","swT","swN","swV","APr"], axis=1)[1:N]
X2 = data.drop(["Dst","Shannon Entropy","Sample Entropy","Permutation Entropy",
                "Spectral Entropy","Approximate Entropy","Higuchi Fractal Dim.",
                "Katz Fractal Dim.","Petrosian Fractal Dim.","Lempel-Ziv Complexity","Hurst Exponent"], axis=1)[1:N]
y = data['Dst'][1:N]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.35, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.35, random_state=42)

#Model 0: Model with all variables: heliophysis + Topological
#Model 1: Model with only Topological variables 
#Model 1: Model with only heliophysis variables 

In [5]:
X.head()

Unnamed: 0,Shannon Entropy,Sample Entropy,Permutation Entropy,Spectral Entropy,Approximate Entropy,Higuchi Fractal Dim.,Katz Fractal Dim.,Petrosian Fractal Dim.,Lempel-Ziv Complexity,Hurst Exponent,B,swT,swN,swV,APr
2008-12-02,2.004551,1.439217,0.912831,0.821028,0.632647,1.853645,2.432188,1.038862,55.0,1.162938,2.8,18158.0,5.9,293.0,0.006
2008-12-03,1.988424,1.459626,0.914263,0.82179,0.641902,1.860431,2.681439,1.038862,55.0,1.068092,5.7,49807.0,11.7,341.0,0.019
2008-12-04,1.973072,1.469676,0.906715,0.822571,0.669129,1.860308,2.335766,1.038072,55.0,1.198862,6.5,93434.0,5.0,443.0,0.017
2008-12-05,1.883707,1.518466,0.906715,0.823104,0.680226,1.864978,2.60469,1.038072,54.0,1.112025,7.4,79760.0,4.9,406.0,0.014
2008-12-06,1.969049,1.475907,0.906715,0.823503,0.670827,1.871677,3.133362,1.038072,55.0,1.262962,5.7,175936.0,4.3,506.0,0.019


In [6]:
import tpot
from tpot import TPOTRegressor

Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.


In [7]:
print('tpot: %s' % tpot.__version__)

tpot: 1.0.0


In [8]:
#########################
GENERATIONS = 5
POPULATION = 10
CROSSVALIDATION_SPLIT = 5
#########################
tpot = TPOTRegressor(verbose=4, 
                     #max_time_mins=10, 
                     n_jobs=4, 
                     generations=GENERATIONS, 
                     cv=CROSSVALIDATION_SPLIT)
tpot.fit(X_train, y_train)

Generation:   0%|          | 0/5 [00:00<?, ?it/s]Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Version 1.0.0 of tpot is outdated. Version 1.1.0 was released 2 days ago.
Generation:  20%|██        | 1/5 [14:35<58:20, 875.09s/it]

Generation:  1
Best mean_squared_error score: -120.40018898113951


Generation:  40%|████      | 2/5 [26:01<38:11, 763.93s/it]

Generation:  2
Best mean_squared_error score: -119.98879723669775


Generation:  60%|██████    | 3/5 [41:36<28:04, 842.29s/it]

Generation:  3
Best mean_squared_error score: -119.98879723669775


Generation:  80%|████████  | 4/5 [57:28<14:45, 885.50s/it]

Generation:  4
Best mean_squared_error score: -119.98879723669775
 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x7158328cb5b0>, 'time': 1751803496.8547153} 

 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x71582bb03d90>, 'time': 1751803588.512044} 

 {'individual': <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x71582bc96040>, 'time': 1751803588.8134475} 



Generation: 100%|██████████| 5/5 [1:00:08<00:00, 721.61s/it]

Generation:  5
Best mean_squared_error score: -119.98879723669775



2025-07-06 07:06:39,243 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:34523' caused the cluster to lose scattered data, which can't be recovered: {'DataFrame-e0f5dd9e6ecc2f6a54d5ba9b5ed734ea', 'Series-dcf8655d5f972bd7be1f3d42e654bc66'} (stimulus_id='handle-worker-cleanup-1751803599.2433422')


 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a6e7de7b10> 
 No feature in X meets the variance threshold 0.01503 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a6e7d4deb0> 
 Some value(s) of y are negative which is not allowed for Poisson regression. 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a71fcafd50> 
 Some value(s) of y are negative which is not allowed for Poisson regression. 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a6e7ef0e60> 
 Found array with 1 feature(s) (shape=(3006, 1)) while a minimum of 2 is required by FeatureAgglomeration. 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a6e7b3bd40> 
 No feature in X meets the variance threshold 0.13392 

 <tpot.search_spaces.pipelines.sequential.SequentialPipelineIndividual object at 0x72a6e7dd3770> 
 Some value(s) of y are negativ



In [9]:
print(tpot.fitted_pipeline_)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('selectpercentile',
                 SelectPercentile(percentile=61.8928700278173)),
                ('featureunion-1',
                 FeatureUnion(transformer_list=[('featureunion',
                                                 FeatureUnion(transformer_list=[('nystroem',
                                                                                 Nystroem(gamma=0.5528087550866,
                                                                                          kernel='poly',
                                                                                          n_components=12))])),
                                                ('passthrough',
                                                 Passthrough())])),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                     

In [10]:
exctracted_best_model=tpot.fitted_pipeline_.steps[-1][1]

In [11]:
exctracted_best_model.fit(X_train, y_train)

Best models and feature importances...

Saving model...

In [12]:
import pickle
# save the model to disk
filename = 'datas/finalized_model_tpot_cycle_24_25_dst.sav'
pickle.dump(exctracted_best_model, open(filename, 'wb'))
 
# some time later...
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.207648734968907


In [13]:
exctracted_best_model

In [14]:
loaded_model