In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Configure pandas display: no limit on printed columns, at most 50 printed rows
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 50),
):
    pd.set_option(_option_name, _option_value)


In [6]:
# Read the three source CSV files from the data directory
DATA_DIR = './data'
revenue_df = pd.read_csv(DATA_DIR + '/Revenue by Greenhouse by Crop.csv')
calendar_df = pd.read_csv(DATA_DIR + '/crop calendar weeks 2025-08 to 2025-32.csv')
hydro_df = pd.read_csv(DATA_DIR + '/Hydronomics Data - GH1.csv')

# Preview each dataset: shape, column names, and the first rows.
# A 50-character rule is printed between consecutive previews (not after the last one).
_preview_rule = "\n" + "=" * 50 + "\n"
_previews = (
    ("Revenue by Greenhouse by Crop", revenue_df),
    ("Crop Calendar", calendar_df),
    ("Hydronomics Data", hydro_df),
)
for _position, (_title, _frame) in enumerate(_previews):
    if _position > 0:
        print(_preview_rule)
    print(f"=== {_title} Dataset ===")
    print("\nShape:", _frame.shape)
    print("\nColumns:", _frame.columns.tolist())
    print("\nFirst few rows:")
    display(_frame.head())


=== Revenue by Greenhouse by Crop Dataset ===

Shape: (240, 15)

Columns: ['resource_tag', 'crop_name', 'harvest_week_num', 'week_label', 'b2b_b_target_revenue', 'hotel_b_target_revenue', 'b2b_b_harvest_revenue', 'hotel_b_harvest_revenue', 'target_kg', 'kg_harvested', 'pct_of_target_kg', 'target_unit_g', 'harvest_unit_g', 'pct_of_target_unit_g', 'current_week']

First few rows:


Unnamed: 0,resource_tag,crop_name,harvest_week_num,week_label,b2b_b_target_revenue,hotel_b_target_revenue,b2b_b_harvest_revenue,hotel_b_harvest_revenue,target_kg,kg_harvested,pct_of_target_kg,target_unit_g,harvest_unit_g,pct_of_target_unit_g,current_week
0,GH1,Arugula,2025-10,Mar-03 w10,64000,76800,2300,2760,128,4.6,-96.41,29,1.24,-95.72,2025-26
1,GH1,Arugula,2025-11,Mar-10 w11,64000,76800,1650,1980,128,3.3,-97.42,29,1.11,-96.17,2025-26
2,GH1,Arugula,2025-12,Mar-17 w12,64000,76800,3400,4080,128,6.8,-94.69,29,2.15,-92.59,2025-26
3,GH1,Arugula,2025-13,Mar-24 w13,64000,76800,23950,28740,128,47.9,-62.58,29,10.3,-64.48,2025-26
4,GH1,Arugula,2025-14,Mar-31 w14,64000,76800,27300,32760,128,54.6,-57.34,29,10.87,-62.52,2025-26




=== Crop Calendar Dataset ===

Shape: (461, 10)

Columns: ['program_id', 'crop_id', 'crop_name', 'resource_tag', 'harvest_week_num', 'kg_harvested', 'units_harvested', 'seeding_date', 'transplant_date', 'harvest_date']

First few rows:


Unnamed: 0,program_id,crop_id,crop_name,resource_tag,harvest_week_num,kg_harvested,units_harvested,seeding_date,transplant_date,harvest_date
0,910,12,Arugula,PNQ-GH1,2025-08,,,2025-01-06 00:00:00+00,2025-01-27 00:00:00+00,2025-02-17 00:00:00+00
1,910,12,Arugula,PNQ-GH1,2025-09,,,2025-01-13 00:00:00+00,2025-02-03 00:00:00+00,2025-02-24 00:00:00+00
2,901,14,Swiss Chard,PNQ-GH1,2025-09,,,2025-01-13 00:00:00+00,2025-02-03 00:00:00+00,2025-02-24 00:00:00+00
3,904,16,Watercress,PNQ-GH1,2025-09,,,2025-01-13 00:00:00+00,2025-02-03 00:00:00+00,2025-02-24 00:00:00+00
4,910,12,Arugula,PNQ-GH1,2025-10,4.6,3720.0,2025-01-20 00:00:00+00,2025-02-10 00:00:00+00,2025-03-03 00:00:00+00




=== Hydronomics Data Dataset ===

Shape: (896, 5)

Columns: ['metric_code', 'date_time', 'week_num', 'value', 'location_code']

First few rows:


Unnamed: 0,metric_code,date_time,week_num,value,location_code
0,sys_ec,2024-12-30,2025-01,1295.0,GH1
1,sys_ec,2024-12-31,2025-01,1259.0,GH1
2,sys_ec,2025-01-01,2025-01,1265.0,GH1
3,sys_ec,2025-01-02,2025-01,1237.0,GH1
4,sys_ec,2025-01-03,2025-01,1242.0,GH1


In [4]:
# Basic data quality checks
def check_dataset(df, name):
    """Print a short data-quality report for one dataset to stdout.

    The report contains, in order: a header line including ``name``, the
    per-column null counts (``df.isnull().sum()``), the column dtypes
    (``df.dtypes``), the result of ``df.describe()``, and a closing rule of
    50 ``=`` characters.

    Parameters
    ----------
    df : object exposing ``isnull()``, ``dtypes`` and ``describe()``
        The callers in this notebook pass pandas DataFrames.
    name : str
        Label shown in the report header.

    Returns
    -------
    None
    """
    print(f"\n=== Data Quality Check for {name} ===")

    # Each section is (heading, thunk); the thunk is evaluated only after its
    # heading has been printed, so output is emitted section by section.
    report_sections = (
        ("\nMissing values:", lambda: df.isnull().sum()),
        ("\nData types:", lambda: df.dtypes),
        ("\nSummary statistics:", lambda: df.describe()),
    )
    for heading, compute in report_sections:
        print(heading)
        print(compute())

    print("\n" + "=" * 50)

# Produce the same quality report for every loaded dataset, in load order
for _checked_df, _checked_label in (
    (revenue_df, "Revenue Dataset"),
    (calendar_df, "Calendar Dataset"),
    (hydro_df, "Hydronomics Dataset"),
):
    check_dataset(_checked_df, _checked_label)



=== Data Quality Check for Revenue Dataset ===

Missing values:
resource_tag               0
crop_name                  0
harvest_week_num           0
week_label                 0
b2b_b_target_revenue       0
hotel_b_target_revenue     0
b2b_b_harvest_revenue      0
hotel_b_harvest_revenue    0
target_kg                  0
kg_harvested               0
pct_of_target_kg           0
target_unit_g              0
harvest_unit_g             3
pct_of_target_unit_g       3
current_week               0
dtype: int64

Data types:
resource_tag                object
crop_name                   object
harvest_week_num            object
week_label                  object
b2b_b_target_revenue         int64
hotel_b_target_revenue       int64
b2b_b_harvest_revenue        int64
hotel_b_harvest_revenue      int64
target_kg                    int64
kg_harvested               float64
pct_of_target_kg           float64
target_unit_g                int64
harvest_unit_g             float64
pct_of_target_unit_