Wind Speed Prediction Dataset
Predict wind speed using weather variables

Kaggle link: https://www.kaggle.com/datasets/fedesoriano/wind-speed-prediction-dataset

In [7]:
import pandas as pd
import plotly.express as px
import numpy as np
import cut_the_tails as ct
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('../data_sets\\wind_dataset.csv')
target = 'WIND'

In [9]:
features = [
 'IND',
 'RAIN',
 'IND.1',
 'T.MAX',
 'IND.2',
 'T.MIN',
 'T.MIN.G'
 ]

In [10]:
fig = px.histogram(df, x=target, nbins=40)
fig.show()

In [11]:
cdf = ct.split_by_quantile_class(df,target,[0.1,0.55])

cdf.dropna(inplace=True)

# Cauda Ótima

In [12]:
df.dropna(inplace=True)
df = df.reset_index(drop=True)

model = DecisionTreeRegressor(random_state=0, max_depth=5) 
classifier = DecisionTreeClassifier(random_state=0, max_depth=5)

x,fval = ct.get_cuts_direct_optimization(df, target, features, classifier, model, 'brute')

print("Final Result: ")
print(x, fval)

[0. 0.] 0.3760708899549622
[0.   0.05] 0.363317571387716
[0.  0.1] 0.3601919317144785
[0.   0.15] 0.3788865521366673
[0.  0.2] 0.37477455315659836
[0.   0.25] 0.37883828384343154
[0.  0.3] 0.39095171387240957
[0.   0.35] 0.4425281273417821
[0.  0.4] 0.4573462106320835
[0.   0.45] 0.4491823943383042
[0.  0.5] 0.4622253123605513
[0.   0.55] 0.48295373517845164
[0.  0.6] 0.49320768440913376
[0.   0.65] 0.49579766859850305
[0.  0.7] 0.47236121931668906
[0.   0.75] 0.46523425463842033
[0.  0.8] 0.43795639319233903
[0.   0.85] 0.4451620100715495
[0.  0.9] 0.41446851789951883
[0.   0.95] 0.3991675016176393
[0.   0.05] 0.363317571387716
[0.05 0.05] 0.36316433908784407
[0.05 0.1 ] 0.36803454359569765
[0.05 0.15] 0.36078387313151433
[0.05 0.2 ] 0.38134181441399656
[0.05 0.25] 0.36531521344166273
[0.05 0.3 ] 0.39335826392867573
[0.05 0.35] 0.38851409261733133
[0.05 0.4 ] 0.3857948465392217
[0.05 0.45] 0.4167722458205677
[0.05 0.5 ] 0.4209922907708774
[0.05 0.55] 0.44258586488941065
[0.05 0.6 ] 0.

ValueError: Quantiles must be in the range [0, 1]

In [None]:
X = cdf[features].to_numpy()
y_tail = cdf['tail_class'].to_numpy()
y = cdf[target].to_numpy()

###Using split###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_aux, X_test_aux, y_train_tail, y_test_tail = train_test_split(X, y_tail, test_size=0.2, random_state=0)

In [None]:
#baseline = RandomForestRegressor(max_depth=5, random_state=0)
#baseline = XGBRegressor(n_estimators=100, max_leaves=0, random_state = 0)
baseline = DecisionTreeRegressor(random_state=0, max_depth=5) 

In [None]:
#tail_classifier = ct.fit_tail_classifier(X,y_tail,RandomForestClassifier(max_depth=5, random_state=0))
#tail_classifier = ct.fit_tail_classifier(X,y_tail,XGBClassifier(n_estimators=100, max_leaves=0, random_state = 0))
#tail_classifier = ct.fit_tail_classifier(X,y_tail,DecisionTreeClassifier(random_state=0, max_depth=5)) 

###Using split###
#tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,RandomForestClassifier(max_depth=5, random_state=0))
#tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,XGBClassifier(n_estimators=100, max_leaves=0, random_state = 0))
tail_classifier = ct.fit_tail_classifier(X_train,y_train_tail,DecisionTreeClassifier(random_state=0, max_depth=5)) 

In [None]:
#models = ct.fit_tail_models(X,y,y_tail,baseline)

###Using split###
models = ct.fit_tail_models(X_train,y_train,y_train_tail,baseline)

In [None]:
#baseline.fit(X,y)
#y_base = baseline.predict(X)
#y_tail = ct.batch_tail_predict(X,tail_classifier,models)

###Using split###
baseline.fit(X_train,y_train)
y_base = baseline.predict(X_test)
y_tail = ct.batch_tail_predict(X_test,tail_classifier,models)

In [None]:
#print(mean_absolute_error(y_base,y))
#print(mean_absolute_error(y_tail,y))
#print(mean_absolute_error(y,y))

###Using split###
print(mean_absolute_error(y_base,y_test))
print(mean_absolute_error(y_tail,y_test))
print(mean_absolute_error(y_test,y_test))

3.5085051155012508
3.955739158750839
0.0


In [None]:
#print(mean_absolute_percentage_error(y_base,y))
#print(mean_absolute_percentage_error(y_tail,y))
#print(mean_absolute_percentage_error(y,y))

###Using split###
print(mean_absolute_percentage_error(y_base,y_test))
print(mean_absolute_percentage_error(y_tail,y_test))
print(mean_absolute_percentage_error(y_test,y_test))

0.37616900301164596
0.4196770338416045
0.0
