# Expert Approach: Predicting Farmer Adoption with CatBoost

## 1. Setup and Data Loading
Loading necessary libraries and datasets. The ID column is immediately set as the index for alignment.

In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier
from collections import Counter

# Load data
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
sample_submission_df = pd.read_csv('SampleSubmission.csv')

# Set Index
train_df.set_index('ID', inplace=True)
test_df.set_index('ID', inplace=True)

# Combine for universal processing
combined_df = pd.concat([train_df.drop(['adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days'], axis=1), test_df], axis=0)

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Training data shape: (5548, 18)
Test data shape: (2387, 15)


## 2. Advanced Feature Engineering
This section focuses on creating high-impact features through temporal analysis, behavioral parsing, and creating interaction and aggregation features to capture deeper patterns.

In [5]:
def expert_feature_engineer(df):
    # Temporal Engineering
    df['first_training_date'] = pd.to_datetime(df['first_training_date'])
    df['month'] = df['first_training_date'].dt.month
    df['year'] = df['first_training_date'].dt.year
    df['dayofyear'] = df['first_training_date'].dt.dayofyear
    df['weekofyear'] = df['first_training_date'].dt.isocalendar().week.astype(int)
    df['dayofweek'] = df['first_training_date'].dt.dayofweek
    df.drop('first_training_date', axis=1, inplace=True)

    # Behavioral Engineering from topics_list
    df['topics_list'] = df['topics_list'].apply(ast.literal_eval)
    df['topic_count'] = df['topics_list'].apply(len)
    
    # Extracting top topics to create binary flags
    all_topics = [topic for sublist in df['topics_list'] for topic in sublist]
    top_10_topics = [topic for topic, count in Counter(all_topics).most_common(10)]
    for topic in top_10_topics:
        df[f'topic_{topic.replace(" ", "_")}'] = df['topics_list'].apply(lambda x: 1 if topic in x else 0)
    df.drop('topics_list', axis=1, inplace=True)

    # Interaction and Derived Features
    df['training_intensity'] = df['num_repeat_trainings'] / (df['num_total_trainings'] + 1e-6)
    df['experience_per_age'] = df['farming_experience'] / (df['age'] + 1e-6)
    df['farm_size_per_person'] = df['farm_size'] / (df['group_size'] + 1e-6)

    # Handling Missing Values
    df['days_to_second_training_missing'] = df['days_to_second_training'].isnull().astype(int)
    df['days_to_second_training'].fillna(-999, inplace=True)

    # Aggregation Features (e.g., stats per county)
    geo_features = ['county', 'subcounty', 'ward']
    for geo in geo_features:
        df[f'{geo}_farm_size_mean'] = df.groupby(geo)['farm_size'].transform('mean')
        df[f'{geo}_age_mean'] = df.groupby(geo)['age'].transform('mean')
        df[f'{geo}_experience_mean'] = df.groupby(geo)['farming_experience'].transform('mean')
        
    return df

combined_df_featured = expert_feature_engineer(combined_df.copy())

print('Advanced feature engineering complete.')
print(f"Shape after FE: {combined_df_featured.shape}")

KeyError: 'farming_experience'

## 3. Modeling with CatBoost
We use CatBoost, which is excellent for handling categorical features natively. A `MultiOutputClassifier` is used to manage the three target variables.

In [None]:
# Separate train and test sets
X = combined_df_featured.loc[train_df.index]
X_test = combined_df_featured.loc[test_df.index]
y = train_df[['adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days']]

# Identify categorical features for CatBoost
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
X[categorical_features] = X[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# Align columns before training
X_test = X_test[X.columns]

# Validation Strategy
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Definition
catboost_params = {
    'iterations': 1500,
    'learning_rate': 0.03,
    'depth': 7,
    'l2_leaf_reg': 3,
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'random_seed': 42,
    'verbose': 0, # Set to 100 for verbose training
    'cat_features': categorical_features
}

base_estimator = CatBoostClassifier(**catboost_params)
model = MultiOutputClassifier(estimator=base_estimator)

# Training
print('Starting CatBoost model training...')
model.fit(X_train, y_train)
print('Model training complete.')

## 4. Submission Generation
Generating predictions on the test set and formatting the output to match the `SampleSubmission.csv` file.

In [None]:
print('Generating predictions on the test set...')
test_probabilities = model.predict_proba(X_test)

# Extract probabilities for each target
prob_07 = test_probabilities[0][:, 1]
prob_90 = test_probabilities[1][:, 1]
prob_120 = test_probabilities[2][:, 1]

# Create submission DataFrame
submission_df = pd.DataFrame(index=X_test.index)
submission_df['Target_07_AUC'] = prob_07
submission_df['Target_90_AUC'] = prob_90
submission_df['Target_120_AUC'] = prob_120
submission_df['Target_07_LogLoss'] = prob_07
submission_df['Target_90_LogLoss'] = prob_90
submission_df['Target_120_LogLoss'] = prob_120

# Ensure column order matches sample submission
submission_df = submission_df[sample_submission_df.columns[1:]]

submission_df.to_csv('catboost_submission.csv')

print('Submission file `catboost_submission.csv` created successfully!')
submission_df.head()