# 🥈 Amini Canopy or Crop Challenge Solution - Team M&M

 This notebook documents our solution for the Amini Canopy/Crop Challenge. Key strategies include temporal aggregation, advanced missing value handling, and optimized XGBoost modeling.


 ## 1. Environment Setup
* Import Critical Libraries
* Key packages for data processing, ML, and visualization:

In [None]:
# 1. Data manipulation

import pandas as pd
import numpy as np

# 2. Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

# 3. Stats library
from scipy import stats

# 4. Pre-processing libraries
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder,StandardScaler
from sklearn.impute import  KNNImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# 5. Iterative imputer libraries
from sklearn.experimental import  enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 6. Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight

# 7. Regression models Library
from sklearn.ensemble import RandomForestRegressor

# 8. Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import  SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# 9. Metrics for classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score ,confusion_matrix, classification_report

# 10. ignore warnings
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


 ## 2. Data Loading & Preprocessing


 #### 2.1 Load Datasets
* Raw train/test data provided by Zindi, available on the [data page](https://zindi.africa/competitions/amini-canopy-or-crop-challenge/data).


In [3]:
# Read the competition's train and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

In [4]:
# Convert the 'time' column of both tables to pandas datetimes, in place.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])

#### 2.2 Data Exploration
* Quick inspection of train data structure.

In [5]:
# Preview the first rows of the training table; kept as the cell's final bare
# expression so the notebook renders it.
train.head()

Unnamed: 0,ID,time,Green,Blue,RED,NIR,SWIR1,SWIR2,Red_Edge,Aerosols,Red_Edge_2,Red_Edge_3,Red_Edge_4,Water_vapor,Cirrus,NDVI,NDMI,NDWI,CI,Target
0,1D_0000,2021-01-04,,,,,,,,,,,,,,,,,,0.0
1,1D_0000,2021-01-09,,,,,,,,,,,,,,,,,,0.0
2,1D_0000,2021-01-14,0.0845,0.103,0.0583,0.1977,0.1152,0.0464,0.0742,0.1326,0.1682,0.2243,0.25,0.0347,0.0007,0.544531,0.263663,-0.401134,0.12,0.0
3,1D_0000,2021-01-19,0.1095,0.1357,0.0858,0.1596,0.0782,0.033,0.0873,0.162,0.1396,0.1736,0.182,0.026,0.0011,0.300733,0.342304,-0.186176,0.008666,0.0
4,1D_0000,2021-01-24,0.1128,0.13,0.0919,0.2141,0.124,0.0608,0.1035,0.1546,0.1845,0.2302,0.2606,0.0703,0.0041,0.399346,0.266489,-0.309881,0.059365,0.0


In [6]:
# Preview the first rows of the test table.
# Notice that the test does not contain the target feature
test.head()

Unnamed: 0,ID,time,Green,Blue,RED,NIR,SWIR1,SWIR2,Red_Edge,Aerosols,Red_Edge_2,Red_Edge_3,Red_Edge_4,Water_vapor,Cirrus,NDVI,NDMI,NDWI,CI
0,1D_0005,2021-01-04,0.1199,0.1391,0.0978,0.2329,0.1782,0.0989,0.1146,0.2127,0.2074,0.2675,0.3081,0.0503,0.0011,0.408527,0.133058,-0.320295,0.079096
1,1D_0005,2021-01-09,,,,,,,,,,,,,,,,,
2,1D_0005,2021-01-14,0.0818,0.101,0.0563,0.1848,0.1139,0.0459,0.0756,0.1324,0.1687,0.2246,0.2513,0.0372,0.0007,0.532974,0.237362,-0.386347,0.146323
3,1D_0005,2021-01-19,0.127,0.1581,0.1031,0.1899,0.1186,0.0583,0.1124,0.1763,0.1639,0.1987,0.2156,0.0298,0.0012,0.296246,0.231118,-0.198485,0.043155
4,1D_0005,2021-01-24,0.1122,0.1311,0.0916,0.2125,0.1259,0.0608,0.106,0.1553,0.184,0.2307,0.2611,0.0709,0.0046,0.397567,0.25591,-0.308901,0.072874


 ## 3. Feature Engineering Strategy


#### 3.1 Temporal Aggregation
* Capture patterns across time sequences for each geographic location (ID).

In [7]:
# Identifier and label columns are never aggregated.
# NOTE(review): the table previews show an 'Unnamed: 0' column, which looks like
# the row-index artefact of a CSV saved with its index. A row counter is not a
# spectral measurement, so any 'Unnamed…' column is kept out of the features
# (this is a no-op if no such column is actually present in the frame).
non_feature_cols = {'ID', 'time', 'Target'}
feature_cols = [
    col for col in train.columns
    if col not in non_feature_cols and not str(col).startswith('Unnamed')
]
agg_funcs = ['mean', 'std', 'min', 'max', 'median']


def aggregate_by_id(df, sort=True):
    """Collapse each ID's rows into a single row of summary statistics.

    Every column in ``feature_cols`` is reduced with every function in
    ``agg_funcs``; the resulting two-level column labels are flattened to
    ``'<feature>_<stat>'``.

    Args:
        df: Frame holding an 'ID' column plus all ``feature_cols``.
        sort: Passed to ``groupby``; ``False`` keeps IDs in order of first
            appearance instead of sorting them.

    Returns:
        DataFrame indexed by ID with one column per (feature, statistic) pair.
    """
    agg = df.groupby('ID', sort=sort)[feature_cols].agg(agg_funcs)
    agg.columns = ['_'.join(col).strip() for col in agg.columns.values]
    return agg


# Aggregate training data
train_agg = aggregate_by_id(train)
# One label per ID: the most frequent Target value among that ID's rows.
# Both groupbys sort by ID, so train_target lines up row-for-row with train_agg.
train_target = train.groupby('ID')['Target'].apply(lambda x: x.mode()[0]).astype(int)

# Aggregate test data (preserve ID order)
test_agg = aggregate_by_id(test, sort=False)

 #### 3.2 Missing Value Handling
 * **Anti-Leakage Protocol**: Use the training-set mean of each feature to fill NaNs in both train and test.
 * Imputing with the mode would also be worth considering here.

In [8]:
# Handle missing values using training data's mean
# The per-column means come from the training aggregates only and are reused
# for the test aggregates, so no test statistics enter the imputation.
train_means = train_agg.mean()

# fillna with a Series fills each column with the value under the matching
# label. The result is assigned back rather than calling
# `df[col].fillna(..., inplace=True)`: that chained form acts on a temporary
# object under pandas Copy-on-Write and would leave the frames unfilled.
train_agg = train_agg.fillna(train_means)
test_agg = test_agg.fillna(train_means)

 ## 4. Model Development
  

#### 4.1 Class Balancing
* Address imbalanced classes using sample weighting.

In [9]:
# Prepare data
X = train_agg
y = train_target

# Compute class weights for imbalance
classes = np.unique(y)
class_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y)
# class_weights is ordered like `classes` (sorted unique labels). Look up each
# sample's position in `classes` instead of using the raw label as an index,
# so the mapping stays correct when labels are not exactly 0..k-1.
sample_weights = class_weights[np.searchsorted(classes, np.asarray(y))]

# Cross-validation
# Stratified 5-fold splitter with a fixed seed, plus the list that collects
# the per-fold F1 scores.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

 ### 4.2 Cross-Validation Setup
* Stratified 5-fold CV to maintain class distribution:

In [10]:
# Cross-validation
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds
# reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Collects one macro-F1 score per fold.
f1_scores = []

 ## 5. XGBoost Configuration
**Key Hyperparameters**
* Early stopping to prevent overfitting  
* Class-weighted loss function

In [14]:
# Hyperparameters shared by the classifier trained in every fold.
xgb_params = dict(
    objective='multi:softmax',
    num_class=3,
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    early_stopping_rounds=50,
)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # Slice features, labels and sample weights for this fold.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    sw_train = sample_weights[train_idx]

    # Fresh classifier per fold; the validation fold is the early-stopping
    # evaluation set.
    # NOTE(review): the F1 below is computed on that same validation fold, so
    # it is not independent of the early-stopping decision.
    model = XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        sample_weight=sw_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    # Macro-averaged F1 on the held-out fold.
    y_pred = model.predict(X_val)
    fold_f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(fold_f1)
    print(f'Fold {fold + 1} F1: {fold_f1:.4f}')

print(f'Mean F1: {np.mean(f1_scores):.4f}')

Fold 1 F1: 1.0000
Fold 2 F1: 0.9994
Fold 3 F1: 0.9995
Fold 4 F1: 0.9993
Fold 5 F1: 1.0000
Mean F1: 0.9996


 ## 6. Final Model & Submission


 #### 6.1 Retrain on Full Data
* Leverage all training data, with the number of trees taken from the early-stopped iteration count

In [17]:
# `model` is the classifier left over from the last cross-validation fold.
# Its early-stopping result sizes the final model: best iteration plus a
# margin of 50 trees, or 1000 trees when no early-stopping result exists.
# getattr + an explicit None check is used because
#   * best_iteration is 0-based, so 0 is a valid result that a truthiness
#     test would wrongly treat as "missing";
#   * NOTE(review): XGBoost raises AttributeError on `best_iteration` when
#     early stopping was not used, which would skip the fallback entirely.
best_iteration = getattr(model, 'best_iteration', None)

final_model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=best_iteration + 50 if best_iteration is not None else 1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist'  # Use CPU method for compatibility
)
# Train on every aggregated training row, using the class-balancing weights.
final_model.fit(X, y, sample_weight=sample_weights)


  ####  6.2 Generate Predictions


In [18]:
# Predict and save submission
# One predicted class per aggregated test row.
test_pred = final_model.predict(test_agg)

# Pair each prediction with its ID (the index of test_agg) and write the
# two-column table without the frame's own row index.
submission_columns = {'ID': test_agg.index, 'Target': test_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)