*Importing libraries*

In [43]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np 
import pvlib
import os
from pvlib import clearsky, atmosphere, solarposition
from pvlib.location import Location
from pvlib.iotools import read_tmy3

import warnings
warnings.filterwarnings("ignore")

*Importing data*

In [44]:
# Common prefix of the per-year raw GHI CSV files; the loading loop appends
# "<year>.csv" to it.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs unchanged on a machine with this exact folder layout.
file_path = "C:\\Users\\shari\\Desktop\\MNRE\\raw_data\\GHI_raw_"

In [45]:
# Calendar years to load; both endpoints are included.
FIRST_YEAR, LAST_YEAR = 2020, 2024
years = range(FIRST_YEAR, LAST_YEAR + 1)

In [46]:
# Read one CSV per year. A year without a file is still skipped (best effort),
# but it is recorded and reported so that gaps in the data set are visible
# instead of silently shrinking the analysis period.
dfs = []
missing_years = []
for year in years:
    file_path_temp = f"{file_path}{year}.csv"
    if not os.path.exists(file_path_temp):
        missing_years.append(year)
        continue
    df_temp = pd.read_csv(file_path_temp)
    dfs.append(df_temp)

if missing_years:
    print(f"No raw GHI file found for year(s): {missing_years}")

In [47]:
# Stack the yearly frames row-wise into a single table; the per-file row
# labels are discarded in favour of one fresh running index.
GHI_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [48]:
# Parse the day-first timestamps (format carries minutes but no seconds).
# errors='coerce' converts every value that does not match the format into NaT
# instead of raising.
# NOTE(review): rows that have no 'datetime' value at all (for example if one
# yearly file names this column differently) also end up as NaT -- check the
# NaT count before using the column as index.
GHI_df['datetime'] = pd.to_datetime(
    GHI_df['datetime'],
    format="%d-%m-%Y %H:%M",
    errors='coerce',
)


In [49]:
# Use the parsed timestamps as the row index (the 'datetime' column is moved
# into the index).
GHI_df = GHI_df.set_index('datetime')

In [50]:
# Quick sanity report: row count, missing GHI readings, first rows.
n_rows = GHI_df.shape[0]
n_missing = GHI_df['GHI'].isnull().sum()

print(f"Number of GHI_df measurements: {n_rows}")
print(f"Number of NA: {n_missing}")

GHI_df.head(10)

Number of GHI_df measurements: 8770365
Number of NA: 402012


Unnamed: 0_level_0,GHI,dateandtime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,3.046,
2020-01-01 00:00:00,3.118,
2020-01-01 00:00:00,3.027,
2020-01-01 00:00:00,2.961,
2020-01-01 00:00:00,3.018,
2020-01-01 00:00:00,2.973,
2020-01-01 00:01:00,2.996,
2020-01-01 00:01:00,3.007,
2020-01-01 00:01:00,3.018,
2020-01-01 00:01:00,2.975,


*Step 1- Removal of missing values*

In [53]:
# Longest run of consecutive missing GHI values that is tolerated: runs of NaN
# longer than this are removed by remove_consecutive_nan, shorter ones are kept
# so they can be interpolated afterwards.
threshold = 1

# Identify and remove high-density consecutive missing values
def remove_consecutive_nan(df, column, threshold):
    """Drop rows that belong to long runs of consecutive missing values.

    A "run" is a maximal stretch of adjacent rows in which ``column`` is
    missing. Rows of a run are removed when the run is longer than
    ``threshold``; shorter runs are left in place (still NaN) so the caller
    can fill them, e.g. by interpolation. Rows with a value are always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    column : str
        Name of the column whose missing values are inspected.
    threshold : int
        Maximum run length of missing values that is kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and the offending rows
        removed; the remaining rows keep their order and index labels.
    """
    is_missing = df[column].isnull()

    # A new run starts wherever the missing/present state flips. The first row
    # has no predecessor (diff is NaN), which also counts as a run start.
    run_start = is_missing.astype('int8').diff().ne(0)
    run_id = run_start.cumsum()

    # Length of the run each row belongs to. Grouping by the raw values keeps
    # this purely positional, so duplicate index labels are harmless.
    run_length = run_id.groupby(run_id.to_numpy()).transform('size')

    drop_mask = is_missing & (run_length > threshold)

    # Work on standalone Series instead of a temporary 'block' column so the
    # caller's frame (and any column it may already call 'block') is untouched.
    return df[(~drop_mask).to_numpy()].copy()

# Drop the long gaps; whatever NaNs remain are short enough to interpolate.
GHI_df = remove_consecutive_nan(df=GHI_df, column='GHI', threshold=threshold)

n_to_interpolate = GHI_df['GHI'].isnull().sum()
print(f'Number of GHI measurements to interpolate: {n_to_interpolate}')

Number of GHI measurements to interpolate: 1086


In [54]:
# Fill the remaining gaps using pandas' default interpolation settings.
ghi_filled = GHI_df['GHI'].interpolate()
GHI_df['GHI'] = ghi_filled

# Report what is left after filling.
print(f"Number of NaNs: {GHI_df['GHI'].isnull().sum()}")
print(f"Number of GHI measurements: {GHI_df.shape[0]}")

Number of NaNs: 0
Number of GHI measurements: 8369439


*Step 2- Identification and removal of outliers*

In [55]:
# Accepted GHI range. Both bounds are exclusive, so readings exactly equal to
# GHI_min or GHI_max are removed as well.
GHI_max = 1000
GHI_min = 0

below_max = GHI_df["GHI"].lt(GHI_max)
above_min = GHI_df["GHI"].gt(GHI_min)
GHI_df = GHI_df[below_max & above_min]
print(f"Number of GHI measurements: {GHI_df.shape[0]}")

Number of GHI measurements: 8262567


*Step 3- Clear sky global horizontal irradiance (GHIcs)*

In [56]:
# Site description handed to pvlib.
latitude = 46.518
longitude = 6.565
time_zone = 'Europe/Zurich'
altitude = 400
place = 'Ecublens'
frequency = '10S'  # not used in this cell

tus = Location(latitude, longitude, tz=time_zone, altitude=altitude, name=place)

# Clear-sky estimate evaluated at the measurement timestamps.
# NOTE(review): GHI_df.index is parsed without any timezone information in this
# notebook -- confirm how pvlib interprets tz-naive times and whether the
# logged timestamps are local time or UTC.
cs = tus.get_clearsky(GHI_df.index)
GHI_df['GHIcs'] = cs['ghi']
