In [None]:
# ============================================================
# NOTEBOOK 4 - FEATURE PREPARATION
# ============================================================
# Bootstrap cell: mount Google Drive, make the project folder importable,
# (re)load the shared `common` module and report the key directories.

from google.colab import drive
drive.mount("/content/drive")

import sys, importlib

# Only add the project folder once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
_project_dir = "/content/drive/MyDrive/SA_CropType_SourceCoop"
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

import common
importlib.reload(common)  # pick up edits made to common.py since the last import
# NOTE(review): the star import hides where names such as BASE_DIR, META_DIR,
# S2_DIR and ensure_dirs come from; kept as-is because other cells may rely on
# further names exported by `common`.
from common import *

ensure_dirs()

# Quick sanity report of the project layout.
print("BASE_DIR:", BASE_DIR)
print("META_DIR:", META_DIR)
print("LABELS_DIR exists:", LABELS_DIR.exists())
print("S2_DIR exists:", S2_DIR.exists())
print("S1_DIR exists:", S1_DIR.exists())


Mounted at /content/drive
BASE_DIR: /content/drive/MyDrive/SA_CropType_SourceCoop
META_DIR: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta
LABELS_DIR exists: True
S2_DIR exists: True
S1_DIR exists: True


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import rasterio
import matplotlib.pyplot as plt
import os, json
from tqdm import tqdm

In [None]:
import pandas as pd

# Path of the tile / field / crop lookup table inside the metadata folder.
TFC_PATH = META_DIR.joinpath("tile_field_crop_table.csv")
tfc_file_found = TFC_PATH.exists()
print("TFC_PATH:", TFC_PATH, "| exists:", tfc_file_found)

# Load the table and preview its first rows.
tfc = pd.read_csv(TFC_PATH)
tfc.head(5)

TFC_PATH: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/tile_field_crop_table.csv | exists: True


Unnamed: 0,tile_id,field_id,crop
0,1000,10697,Weeds
1,1000,14604,Lucerne/Medics
2,1000,17138,Fallow
3,1000,22246,Lucerne/Medics
4,1000,25176,Fallow


In [None]:
# Re-read the table and strip stray whitespace from the header names as a safeguard.
tfc = pd.read_csv(TFC_PATH).rename(columns=str.strip)

print("tfc shape:", tfc.shape)
print("cols:", tfc.columns.tolist())
display(tfc.head())

# The rest of the notebook needs these three columns; stop early if any is absent.
need = {"tile_id","field_id","crop"}
missing = need.difference(tfc.columns)
if len(missing) > 0:
    # Message is in Turkish: "tfc missing column".
    raise ValueError(f"tfc eksik kolon: {missing}")


tfc shape: (16893, 3)
cols: ['tile_id', 'field_id', 'crop']


Unnamed: 0,tile_id,field_id,crop
0,1000,10697,Weeds
1,1000,14604,Lucerne/Medics
2,1000,17138,Fallow
3,1000,22246,Lucerne/Medics
4,1000,25176,Fallow


In [None]:
# Crop classes kept for this experiment; the order here is the order in which
# the classes are listed and reported.
TARGET_CROPS = ["Wine grapes", "Wheat", "Planted pastures (perennial)", "Lucerne/Medics"]
print("Target crops:", TARGET_CROPS)

Target crops: ['Wine grapes', 'Wheat', 'Planted pastures (perennial)', 'Lucerne/Medics']


In [None]:
# Draw up to SAMPLES_PER_CLASS unique fields for every target crop, reproducibly.
SAMPLES_PER_CLASS = 1200
SEED = 42
# NOTE(review): `rng` is not used in this cell; the sampling below is seeded
# through `random_state=SEED`. Kept in case other cells rely on it.
rng = np.random.default_rng(SEED)

# Keep only the rows belonging to the target crops.
sub = tfc[tfc["crop"].isin(TARGET_CROPS)].copy()
print("Target subset shape:", sub.shape)

selected_rows = []
summary = []

for crop in TARGET_CROPS:
    dfc = sub[sub["crop"] == crop].copy()

    # Each field must appear only once (no repeated field within a crop);
    # the first row found for a field is the one that is kept.
    dfc = dfc.drop_duplicates(subset=["field_id"])

    n_avail = len(dfc)
    n_take = min(SAMPLES_PER_CLASS, n_avail)

    # Make under-sampling visible instead of silently producing imbalanced classes.
    if n_take < SAMPLES_PER_CLASS:
        print(
            f"WARNING: only {n_avail} unique fields available for '{crop}' "
            f"(requested {SAMPLES_PER_CLASS}); this class will be under-represented."
        )

    picked = dfc.sample(n=n_take, random_state=SEED)  # deterministic
    selected_rows.append(picked)

    summary.append({
        "crop": crop,
        "available_fields": int(n_avail),
        "selected_fields": int(n_take),
        "unique_tiles_in_selected": int(picked["tile_id"].nunique()),
    })

selected = pd.concat(selected_rows, ignore_index=True)

# De-duplication above is per crop, so a field labelled with two different
# crops could still be selected twice; report it rather than hide it.
n_dup_fields = int(selected["field_id"].duplicated().sum())
if n_dup_fields:
    print(
        f"WARNING: {n_dup_fields} selected rows reuse a field_id that was "
        "already picked under another crop."
    )

print("âœ… Selected shape:", selected.shape)
print("âœ… Unique fields:", selected["field_id"].nunique())
print("âœ… Unique tiles:", selected["tile_id"].nunique())

summary_df = pd.DataFrame(summary)
display(summary_df)

Target subset shape: (11227, 3)
âœ… Selected shape: (4800, 3)
âœ… Unique fields: 4800
âœ… Unique tiles: 347


Unnamed: 0,crop,available_fields,selected_fields,unique_tiles_in_selected
0,Wine grapes,5024,1200,88
1,Wheat,2097,1200,190
2,Planted pastures (perennial),2473,1200,253
3,Lucerne/Medics,1633,1200,206


In [None]:
# Output files (all under META_DIR): the selected fields, plus the list of
# tiles they live on in both CSV and JSON form.
selected_path, tiles_path, tiles_json = (
    META_DIR / fname
    for fname in (
        "selected_fields_4c_1200_each.csv",
        "required_tiles_4c_1200_each.csv",
        "required_tiles_4c_1200_each.json",
    )
)

selected.to_csv(selected_path, index=False)

# Sorted list of the distinct tile ids referenced by the selection.
required_tiles = sorted(selected["tile_id"].drop_duplicates().tolist())
pd.DataFrame({"tile_id": required_tiles}).to_csv(tiles_path, index=False)

tiles_json.write_text(
    json.dumps({"tile_id": required_tiles}, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

for saved in (selected_path, tiles_path, tiles_json):
    print("Saved:", saved)

Saved: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/selected_fields_4c_1200_each.csv
Saved: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/required_tiles_4c_1200_each.csv
Saved: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/required_tiles_4c_1200_each.json


In [None]:
# =========================================================
# TILE COUNT ANALYSIS (1200 fields / crop)
# =========================================================

# Total number of distinct tiles across all crops.
print("ðŸ”¢ TOPLAM TILE SAYISI (tÃ¼m Ã¼rÃ¼nler):")
print(selected["tile_id"].nunique())

# Number of distinct tiles per crop, largest first.
print("\nðŸ“¦ HER ÃœRÃœN Ä°Ã‡Ä°N TILE SAYISI:")
tiles_grouped_by_crop = selected.groupby("crop")["tile_id"]
tile_per_crop = tiles_grouped_by_crop.nunique().sort_values(ascending=False)

display(tile_per_crop)

# Number of distinct selected fields per tile, largest first (top 15 shown).
print("\nðŸ“Š HER TILE'DA KAÃ‡ FIELD VAR (ilk 15):")
fields_grouped_by_tile = selected.groupby("tile_id")["field_id"]
tile_field_counts = fields_grouped_by_tile.nunique().sort_values(ascending=False)

display(tile_field_counts.head(15))

# Summary statistics of the fields-per-tile distribution.
print("\nðŸ“ˆ TILE BAÅžINA FIELD Ä°STATÄ°STÄ°KLERÄ°:")
display(tile_field_counts.describe())


ðŸ”¢ TOPLAM TILE SAYISI (tÃ¼m Ã¼rÃ¼nler):
347

ðŸ“¦ HER ÃœRÃœN Ä°Ã‡Ä°N TILE SAYISI:


Unnamed: 0_level_0,tile_id
crop,Unnamed: 1_level_1
Planted pastures (perennial),253
Lucerne/Medics,206
Wheat,190
Wine grapes,88



ðŸ“Š HER TILE'DA KAÃ‡ FIELD VAR (ilk 15):


Unnamed: 0_level_0,field_id
tile_id,Unnamed: 1_level_1
127,102
1049,82
1129,82
1385,80
1303,77
1079,73
1264,73
1214,71
1420,53
1424,52



ðŸ“ˆ TILE BAÅžINA FIELD Ä°STATÄ°STÄ°KLERÄ°:


Unnamed: 0,field_id
count,347.0
mean,13.832853
std,15.39671
min,1.0
25%,3.0
50%,9.0
75%,19.5
max,102.0


In [None]:
def list_s2_dates(tile_id: int, s2_dir=None) -> list:
    """Return the sorted names of the sub-directories stored for one tile.

    Each sub-directory of ``<s2_dir>/<tile_id>`` is treated as one acquisition
    date (the folder name is returned unchanged, no date parsing is done).

    Parameters
    ----------
    tile_id : int
        Tile identifier; it is converted with ``str()`` to form the folder name.
    s2_dir : path-like, optional
        Root folder that holds one sub-folder per tile. When omitted, the
        notebook-level ``S2_DIR`` is used.

    Returns
    -------
    list of str
        Sub-directory names in ascending (lexicographic) order. Empty when the
        tile folder is missing or is not a directory. Hidden directories
        (names starting with ".", e.g. ``.ipynb_checkpoints``) are skipped so
        they are not mistaken for dates.
    """
    root = S2_DIR if s2_dir is None else Path(s2_dir)
    tdir = root / str(tile_id)
    # is_dir() also covers the "path exists but is a file" case, which would
    # otherwise make iterdir() raise.
    if not tdir.is_dir():
        return []
    return sorted(
        p.name
        for p in tdir.iterdir()
        if p.is_dir() and not p.name.startswith(".")
    )

# One record per required tile: how many date folders it has and a short
# preview of the earliest (lexicographically first) ones.
date_stats = [
    {
        "tile_id": tile,
        "num_dates_s2": len(found),
        "first_5_dates": ", ".join(found[:5]),
    }
    for tile, found in (
        (int(tid), list_s2_dates(int(tid))) for tid in required_tiles
    )
]

# Sort by coverage so the sparsest tiles come first and the densest last.
date_stats_df = pd.DataFrame(date_stats).sort_values("num_dates_s2")
for preview in (date_stats_df.head(10), date_stats_df.tail(10)):
    display(preview)

date_stats_path = META_DIR / "required_tiles_s2_date_coverage.csv"
date_stats_df.to_csv(date_stats_path, index=False)
print("Saved:", date_stats_path)

Unnamed: 0,tile_id,num_dates_s2,first_5_dates
274,1343,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
281,1355,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
145,1170,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
146,1171,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
148,1174,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
149,1175,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
153,1179,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
154,1182,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
275,1346,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."
157,1186,38,"2017_04_01, 2017_04_11, 2017_04_21, 2017_05_01..."


Unnamed: 0,tile_id,num_dates_s2,first_5_dates
156,1185,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
161,1190,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
164,1195,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
167,1200,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
169,1202,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
170,1203,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
171,1204,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
173,1206,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
175,1209,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."
179,1213,76,"2017_04_01, 2017_04_04, 2017_04_11, 2017_04_14..."


Saved: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/required_tiles_s2_date_coverage.csv


In [None]:
# Map every required tile to the set of date folders available for it.
tile_dates = {}
for tid in required_tiles:
    tile_dates[int(tid)] = set(list_s2_dates(int(tid)))
all_sets = list(tile_dates.values())

# Dates present in every tile; with no tiles at all the result is an empty set.
common_dates = set.intersection(*all_sets) if all_sets else set()
common_dates_sorted = sorted(common_dates)

# Printed labels are Turkish: "number of common dates (shared by all tiles)"
# and "example common dates".
print("âœ… Required tiles:", len(required_tiles))
print("âœ… Ortak tarih sayÄ±sÄ± (tÃ¼m tile'larda ortak):", len(common_dates))
print("Ã–rnek ortak tarihler:", common_dates_sorted[:15])

common_dates_path = META_DIR / "common_dates_across_required_tiles.json"
with common_dates_path.open("w", encoding="utf-8") as f:
    json.dump({"common_dates": common_dates_sorted}, f, ensure_ascii=False, indent=2)

print("Saved:", common_dates_path)

âœ… Required tiles: 347
âœ… Ortak tarih sayÄ±sÄ± (tÃ¼m tile'larda ortak): 0
Ã–rnek ortak tarihler: []
Saved: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/common_dates_across_required_tiles.json


In [None]:
# Count the distinct selected fields for every (crop, tile) pair.
fields_per_crop_tile = selected.groupby(["crop", "tile_id"])["field_id"].nunique()
tile_summary = fields_per_crop_tile.rename("num_fields").reset_index()

tile_summary_path = META_DIR / "tile_usage_summary_4c_1200.csv"
tile_summary.to_csv(tile_summary_path, index=False)

# Printed label is Turkish for "Saved".
print("âœ… Kaydedildi:", tile_summary_path)


âœ… Kaydedildi: /content/drive/MyDrive/SA_CropType_SourceCoop/outputs/meta/tile_usage_summary_4c_1200.csv
