In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("henriupton/wind-solar-electricity-production")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/henriupton/wind-solar-electricity-production?dataset_version_number=1...


100%|██████████| 574k/574k [00:00<00:00, 84.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/henriupton/wind-solar-electricity-production/versions/1





In [4]:
data = pd.read_csv('intermittent-renewables-production-france.csv')

In [24]:
data.columns

Index(['Date and Hour', 'Date', 'StartHour', 'EndHour', 'Source', 'Production',
       'dayOfYear', 'dayName', 'monthName'],
      dtype='object')

In [None]:
data.head()

Unnamed: 0,Date and Hour,Date,StartHour,EndHour,Source,Production,dayOfYear,dayName,monthName
0,2020-07-22 20:00:00+02:00,2020-07-22,20:00:00,21:00:00,Solar,244.0,204,Wednesday,July
1,2020-07-23 07:00:00+02:00,2020-07-23,07:00:00,08:00:00,Solar,223.0,205,Thursday,July
2,2020-07-23 16:00:00+02:00,2020-07-23,16:00:00,17:00:00,Solar,2517.0,205,Thursday,July
3,2020-07-23 19:00:00+02:00,2020-07-23,19:00:00,20:00:00,Solar,658.0,205,Thursday,July
4,2020-07-23 23:00:00+02:00,2020-07-23,23:00:00,24:00:00,Solar,0.0,205,Thursday,July


In [5]:
data.describe()

Unnamed: 0,Production,dayOfYear
count,24560.0,24560.0
mean,2665.385546,157.416572
std,3067.157069,102.909601
min,0.0,1.0
25%,38.0,68.0
50%,1721.0,144.0
75%,3803.25,239.0
max,14668.0,366.0


In [6]:
data.tail()

Unnamed: 0,Date and Hour,Date,StartHour,EndHour,Source,Production,dayOfYear,dayName,monthName
24556,2021-06-05 08:00:00+02:00,2021-06-05,08:00:00,09:00:00,Solar,1185.0,156.0,Saturday,June
24557,2021-06-11 06:00:00+02:00,2021-06-11,06:00:00,07:00:00,Solar,68.0,162.0,Friday,June
24558,2021-06-05 09:00:00+02:00,2021-06-05,09:00:00,10:00:00,Solar,2142.0,156.0,Saturday,June
24559,2021-06-11 07:00:00+02:00,2021-06-11,07:00:00,08:00:00,Solar,464.0,162.0,Friday,June
24560,2021-06-05 17:00:00+02:00,2021-06-0,,,,,,,


In [7]:
data.isnull().sum()

Unnamed: 0,0
Date and Hour,0
Date,0
StartHour,1
EndHour,1
Source,1
Production,1
dayOfYear,1
dayName,1
monthName,1


In [8]:
data.columns

Index(['Date and Hour', 'Date', 'StartHour', 'EndHour', 'Source', 'Production',
       'dayOfYear', 'dayName', 'monthName'],
      dtype='object')

In [9]:
# Check missing values in Production
data['Production'].isnull().sum()
data = data.dropna(subset=['Production'])



In [10]:
# Convert to numeric (integers)
data['StartHour'] = pd.to_numeric(data['StartHour'], errors='coerce')
data['EndHour'] = pd.to_numeric(data['EndHour'], errors='coerce')

# Fill any NaN values (if '24' was coerced to NaN)
data['StartHour'].fillna(0, inplace=True)
data['EndHour'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['StartHour'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['EndHour'].fillna(0, inplace=True)


In [28]:

data_encoded = pd.get_dummies(data, columns=['Source', 'monthName'], drop_first=True)



# Select features and target
feature_columns = ['StartHour', 'EndHour', 'dayOfYear'] + \
                  [col for col in data_encoded.columns if 'Source_' in col or 'monthName_' in col]

X = data_encoded[feature_columns]
y = data_encoded['Production']

X


Unnamed: 0,StartHour,EndHour,dayOfYear,Source_Wind,monthName_August,monthName_December,monthName_February,monthName_January,monthName_July,monthName_June,monthName_March,monthName_May,monthName_November,monthName_October,monthName_September
0,0.0,0.0,204.0,False,False,False,False,False,True,False,False,False,False,False,False
1,0.0,0.0,205.0,False,False,False,False,False,True,False,False,False,False,False,False
2,0.0,0.0,205.0,False,False,False,False,False,True,False,False,False,False,False,False
3,0.0,0.0,205.0,False,False,False,False,False,True,False,False,False,False,False,False
4,0.0,0.0,205.0,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24555,0.0,0.0,162.0,False,False,False,False,False,False,True,False,False,False,False,False
24556,0.0,0.0,156.0,False,False,False,False,False,False,True,False,False,False,False,False
24557,0.0,0.0,162.0,False,False,False,False,False,False,True,False,False,False,False,False
24558,0.0,0.0,156.0,False,False,False,False,False,False,True,False,False,False,False,False


In [27]:
X

Unnamed: 0,StartHour,EndHour,dayOfYear
0,0.0,0.0,204.0
1,0.0,0.0,205.0
2,0.0,0.0,205.0
3,0.0,0.0,205.0
4,0.0,0.0,205.0
...,...,...,...
24555,0.0,0.0,162.0
24556,0.0,0.0,156.0
24557,0.0,0.0,162.0
24558,0.0,0.0,156.0


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (19648, 15)
X_test shape: (4912, 15)
y_train shape: (19648,)
y_test shape: (4912,)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LogisticRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
model =LinearRegression()
model.fit(X_train,y_train)
y_pred= model.predict(X_test)

# Evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared Score:", r2_score(y_test, y_pred)) # training the model with object

Mean Squared Error: 5713958.47090903
R-squared Score: 0.40121974360190915


In [20]:
model.fit(X_train,y_train)

In [21]:
y_pred= model.predict(X_test)

In [22]:

# Evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared Score:", r2_score(y_test, y_pred))

Mean Squared Error: 5713958.47090903
R-squared Score: 0.40121974360190915


In [34]:
# Features and Target
X = data[["StartHour", "EndHour", "dayOfYear"]]
y = data["Production"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [35]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred)
    }

# Show results
for model_name, metrics in results.items():
    print(f"{model_name} -> MSE: {metrics['MSE']:.2f}, R²: {metrics['R²']:.2f}")


Linear Regression -> MSE: 9523107.03, R²: 0.00
Ridge Regression -> MSE: 9523107.03, R²: 0.00
Lasso Regression -> MSE: 9523105.97, R²: 0.00
Decision Tree -> MSE: 8127541.70, R²: 0.15
Random Forest -> MSE: 8123481.05, R²: 0.15


In [36]:
best_model = RandomForestRegressor()   # Example: choose best performing one
best_model.fit(X_train, y_train)

# Custom input
input_data = [[6, 12, 200]]  # 2D array
predicted_power = best_model.predict(input_data)



print("Predicted Power Generation:", predicted_power[0], "kWh")


Predicted Power Generation: 1171.054125511326 kWh




In [39]:
import pickle

with open("model1.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("✅ Model saved as model1.pkl")


✅ Model saved as model1.pkl
