In [6]:
import pandas as pd
import multiprocessing
import os
import glob
import concurrent.futures
import pickle

In [7]:
#פונקציות העוזרות לתמוך בפורמטים שונים
#קריאה מקובץ והוצאה לקובץ

def read_file(file_path):
    # זיהוי סוג הקובץ לפי הסיומת
    file_extension = os.path.splitext(file_path)[1][1:].lower()  # חותכים את ה- "." מהסיומת
    read_func = getattr(pd, f"read_{file_extension}")
    return read_func(file_path)
    
def save_file(df, file_path):
    # זיהוי סוג הקובץ לפי הסיומת
    file_extension = os.path.splitext(file_path)[1][1:].lower()  # חותכים את ה- "." מהסיומת
    save_func = getattr(df, f"to_{file_extension}")
    # שמירת הקובץ
    save_func(file_path)

In [10]:
df = read_file('./time_series.csv')
df.replace("not_a_number", pd.NA, inplace=True)
df = df.dropna()
df = df.dropna(subset=["value"])

In [4]:
# פיצול הדאטה לפי ימים ושמירה לכל יום בקובץ נפרד
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M')

for day, group in df.groupby(df["timestamp"].dt.day):
    filename = f"data_day_{day:02d}.csv"  # יצירת שם קובץ לפי היום בחודש
    group.to_csv(filename, index=False)
    print(f"Saved {filename}")


Saved data_day_01.csv
Saved data_day_02.csv
Saved data_day_03.csv
Saved data_day_04.csv
Saved data_day_05.csv
Saved data_day_06.csv
Saved data_day_07.csv
Saved data_day_08.csv
Saved data_day_09.csv
Saved data_day_10.csv
Saved data_day_11.csv
Saved data_day_12.csv
Saved data_day_13.csv
Saved data_day_14.csv
Saved data_day_15.csv
Saved data_day_16.csv
Saved data_day_17.csv
Saved data_day_18.csv
Saved data_day_19.csv
Saved data_day_20.csv
Saved data_day_21.csv
Saved data_day_22.csv
Saved data_day_23.csv
Saved data_day_24.csv
Saved data_day_25.csv
Saved data_day_26.csv
Saved data_day_27.csv
Saved data_day_28.csv
Saved data_day_29.csv
Saved data_day_30.csv


In [5]:
import pandas as pd

# פונקציה לחישוב ממוצע שעתי לכל קובץ יום
def process_hourly_average(day):
    try:
        filename = f"data_day_{day:02d}.csv"
        df_day = read_file(filename)

        # המרת timestamp לפורמט datetime
        df_day["timestamp"] = pd.to_datetime(df_day["timestamp"],errors='coerce')

        # יצירת עמודת שעה
        df_day["hour"] = df_day["timestamp"].dt.strftime("%Y-%m-%d %H:00")  # פורמט YYYY-MM-DD HH:00

        # חישוב ממוצע שעתי
        hourly_avg = df_day.groupby("hour")["value"].mean().reset_index()

        # יצירת טווח של 24 שעות לכל יום
        all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
        all_hours_df = pd.DataFrame(all_hours, columns=["hour"])

        # מיזוג עם שעות חסרות
        hourly_avg = pd.merge(all_hours_df, hourly_avg, on="hour", how="left")
        hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)

        # שמירת הממוצע השעתי לקובץ נפרד
        avg_filename = f"hourly_avg_day_{day:02d}.csv"
        hourly_avg.to_csv(avg_filename, index=False)
        print(f"Processed hourly averages for {filename} -> {avg_filename}")
    except Exception as e:
        print(f"Error processing {day}: {e}")

       
# df = pd.read_csv("data.csv")  # יש לוודא שהקובץ נטען לפני
df["timestamp"] = pd.to_datetime(df["timestamp"])
unique_days = df["timestamp"].dt.day.unique()
print(unique_days)
print(pickle.dumps(process_hourly_average))  # אם יש שגיאה כאן, זו הבעיה

# הרצה במקביל
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(process_hourly_average, unique_days, chunksize=1)

# שלב 2: איחוד כל קובצי הממוצעים
def load_csv(filename):
    return read_file(filename)

hourly_avg_files = [f"hourly_avg_day_{day:02d}.csv" for day in unique_days]
df_combined = pd.concat(map(load_csv, hourly_avg_files), ignore_index=True)

df_combined["hour"] = pd.to_datetime(df_combined["hour"], errors="coerce")
df_combined["hour_only"] = df_combined["hour"].dt.hour

df_grouped = df_combined.groupby("hour_only").agg({
    "hour":"min",
    "value": "mean"  # מחשב ממוצע לערכים אחרים
}).reset_index()

df_combined["hour"] = df_combined["hour"].dt.strftime("%Y-%m-%d %H:00")  # שמירה על פורמט אחיד של תאריך ושעה
df_grouped = df_grouped.drop(columns=["hour_only"])

save_file(df_grouped, "final_hourly_averages.csv")
print("Merged all hourly averages into final_hourly_averages.csv")

[28  1 10 23  5 26  6  7 19 13  4 14 20 16 24 15  3 27  8 12 25 29 30 22
 18 21  2 11 17  9]
b"\x80\x04\x95'\x00\x00\x00\x00\x00\x00\x00\x8c\x08__main__\x94\x8c\x16process_hourly_average\x94\x93\x94."


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Processed hourly averages for data_day_13.csv -> hourly_avg_day_13.csvProcessed hourly averages for data_day_06.csv -> hourly_avg_day_06.csv

Processed hourly averages for data_day_01.csv -> hourly_avg_day_01.csv
Processed hourly averages for data_day_26.csv -> hourly_avg_day_26.csv
Processed hourly averages for data_day_10.csv -> hourly_avg_day_10.csv
Processed hourly averages for data_day_23.csv -> hourly_avg_day_23.csv
Processed hourly averages for data_day_28.csv -> hourly_avg_day_28.csv
Processed hourly averages for data_day_19.csv -> hourly_avg_day_19.csv
Processed hourly averages for data_day_04.csv -> hourly_avg_day_04.csv
Processed hourly averages for data_day_05.csv -> hourly_avg_day_05.csv
Processed hourly averages for data_day_07.csv -> hourly_avg_day_07.csv
Processed hourly averages for data_day_14.csv -> hourly_avg_day_14.csv


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Processed hourly averages for data_day_16.csv -> hourly_avg_day_16.csv
Processed hourly averages for data_day_20.csv -> hourly_avg_day_20.csv


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Processed hourly averages for data_day_03.csv -> hourly_avg_day_03.csvProcessed hourly averages for data_day_15.csv -> hourly_avg_day_15.csv

Processed hourly averages for data_day_12.csv -> hourly_avg_day_12.csv
Processed hourly averages for data_day_27.csv -> hourly_avg_day_27.csvProcessed hourly averages for data_day_08.csv -> hourly_avg_day_08.csv

Processed hourly averages for data_day_25.csv -> hourly_avg_day_25.csv
Processed hourly averages for data_day_24.csv -> hourly_avg_day_24.csv
Processed hourly averages for data_day_29.csv -> hourly_avg_day_29.csv
Processed hourly averages for data_day_22.csv -> hourly_avg_day_22.csv


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Processed hourly averages for data_day_30.csv -> hourly_avg_day_30.csv


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Processed hourly averages for data_day_21.csv -> hourly_avg_day_21.csvProcessed hourly averages for data_day_18.csv -> hourly_avg_day_18.csv

Processed hourly averages for data_day_11.csv -> hourly_avg_day_11.csv
Processed hourly averages for data_day_02.csv -> hourly_avg_day_02.csv
Processed hourly averages for data_day_17.csv -> hourly_avg_day_17.csv
Processed hourly averages for data_day_09.csv -> hourly_avg_day_09.csv


  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_avg["value"].fillna(0, inplace=True)  # אם יש שעות חסרות, נמלא ב-0 (או ב-NaN אם צריך)
  all_hours = pd.date_range(start=f"2025-06-{day:02d} 00:00", periods=24, freq="H").strftime("%Y-%m-%d %H:00")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].m

Merged all hourly averages into final_hourly_averages.csv
