# Farmer Technology Adoption Prediction

## 1. Setup and Data Loading

In [1]:
import ast

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder

# Load datasets
# Requirement 1: Index Generalization -- the train and test frames are keyed
# by their 'ID' column; the sample submission keeps its default index.
train_df = pd.read_csv('Train.csv').set_index('ID')
test_df = pd.read_csv('Test.csv').set_index('ID')
sample_submission_df = pd.read_csv('SampleSubmission.csv')

# Quick sanity check of what was loaded.
for shape_label, frame in (
    ('Training data shape:', train_df),
    ('Test data shape:', test_df),
):
    print(shape_label, frame.shape)
train_df.head()

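Output (stale): ModuleNotFoundError: No module named 'pandas' — this cell was last run in an environment without pandas installed; install the imported packages (pandas, numpy, scikit-learn, lightgbm) and re-run the notebook.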

## 2. Feature Engineering

In [None]:
def _parse_topics(raw):
    """Return the topics of a single row as a Python collection.

    Strings are parsed with ``ast.literal_eval``. Values that are already
    list-like are passed through as a list, which lets the pipeline be
    re-run on an already processed frame. Anything else (e.g. NaN/None for
    a missing cell) is treated as "no topics" and becomes an empty list.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    if isinstance(raw, (list, tuple, set)):
        return list(raw)
    return []


def feature_engineer(df):
    """Add temporal, topic and training-intensity features to ``df``.

    The frame is modified in place and also returned, so both
    ``feature_engineer(df)`` and ``df = feature_engineer(df)`` work.

    Reads the columns ``first_training_date``, ``topics_list``,
    ``num_repeat_trainings``, ``num_total_trainings`` and
    ``days_to_second_training``.

    Adds ``month``, ``week``, ``day``, ``month_sin``, ``week_sin``,
    ``day_sin``, ``topic_count``, ``Biogas_Interest``, ``Poultry_Focus``,
    ``Dairy_Focus``, ``training_intensity`` and
    ``days_to_second_training_missing``; converts ``first_training_date`` to
    datetime, parses ``topics_list`` and fills missing
    ``days_to_second_training`` values with the sentinel ``-999``.
    """
    # Requirement 2: Temporal Engineering
    training_date = pd.to_datetime(df['first_training_date'])
    df['first_training_date'] = training_date
    df['month'] = training_date.dt.month
    df['week'] = training_date.dt.isocalendar().week.astype(int)
    df['day'] = training_date.dt.day

    # Sine encodings of the calendar components.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)

    # Requirement 3: Behavioral Engineering
    df['topics_list'] = df['topics_list'].apply(_parse_topics)
    df['topic_count'] = df['topics_list'].apply(len)

    # Lower-case each row's topics once and reuse the result for every flag.
    lowered_topics = df['topics_list'].apply(
        lambda topics: {topic.lower() for topic in topics}
    )
    topic_flags = (
        ('Biogas_Interest', 'biogas'),
        ('Poultry_Focus', 'poultry'),
        ('Dairy_Focus', 'dairy'),
    )
    for flag_column, keyword in topic_flags:
        df[flag_column] = lowered_topics.apply(
            lambda seen, keyword=keyword: 1 if keyword in seen else 0
        )

    # Requirement 4: Feature Engineering
    # The small epsilon avoids a division by zero when there are no trainings.
    df['training_intensity'] = df['num_repeat_trainings'] / (df['num_total_trainings'] + 1e-6)

    # Requirement 4: Handle missing values
    df['days_to_second_training_missing'] = df['days_to_second_training'].isnull().astype(int)
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selected column: the chained in-place form does not update the
    # frame under pandas Copy-on-Write.
    df['days_to_second_training'] = df['days_to_second_training'].fillna(-999)  # Sentinel value

    return df

# Run the same feature pipeline over the training frame, then the test frame.
train_df, test_df = feature_engineer(train_df), feature_engineer(test_df)

print('Feature engineering complete.')

### Categorical Feature Encoding

In [None]:
# Requirement 4: Label Encoding for geographic and trainer fields
categorical_features = ['county', 'subcounty', 'ward', 'trainer']
for col in categorical_features:
    # Values are compared as strings so missing cells and mixed types are
    # encoded consistently in both frames.
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)

    # Fit on the union of train and test values: LabelEncoder.transform
    # raises ValueError for any label it did not see during fit, so an
    # encoder fitted on the training frame alone breaks as soon as the test
    # frame contains a new county/subcounty/ward/trainer.
    le = LabelEncoder()
    le.fit(list(train_values) + list(test_values))
    train_df[col] = le.transform(train_values)
    test_df[col] = le.transform(test_values)

print('Categorical encoding complete.')

## 3. Validation Strategy

In [None]:
# Candidate model inputs; entries missing from either frame are skipped below.
features = [
    'age', 'gender', 'access_to_credit', 'education_level',
    'farming_experience', 'farm_size', 'income_source', 'district',
    'num_total_trainings', 'num_repeat_trainings', 'training_topic_most_frequent',
    'group_size', 'contact_preference', 'preferred_training_method', 'county', 'subcounty', 'ward', 'trainer',
    'days_to_second_training', 'month_sin', 'week_sin', 'day_sin', 'topic_count',
    'Biogas_Interest', 'Poultry_Focus', 'Dairy_Focus', 'training_intensity',
    'days_to_second_training_missing'
]
targets = ['adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days']

# Align columns - crucial for consistent feature sets
train_cols = train_df.columns
test_cols = test_df.columns
shared_lookup = set(train_cols) & set(test_cols)
shared_cols = list(shared_lookup)
skipped_features = [col for col in features if col not in shared_lookup]
features = [col for col in features if col in shared_lookup]
if skipped_features:
    # Make the silent drop visible so typos or schema changes are noticed.
    print('Requested features not present in both frames (skipped):', skipped_features)

# Copies, so the dtype conversion below never writes into train_df/test_df.
X = train_df[features].copy()
y = train_df[targets]
X_test = test_df[features].copy()

# LightGBM only accepts int, float, bool or pandas `category` columns.
# Any selected column that is still non-numeric (dtype kind other than
# bool/int/uint/float) is cast to a categorical dtype whose categories are
# shared by train and test, so both frames use the same category codes.
# Missing cells stay missing.
non_numeric_features = [
    col for col in features
    if X[col].dtype.kind not in 'biuf' or X_test[col].dtype.kind not in 'biuf'
]
for col in non_numeric_features:
    levels = sorted(
        set(X[col].dropna().astype(str)) | set(X_test[col].dropna().astype(str))
    )
    shared_dtype = pd.CategoricalDtype(categories=levels)
    X[col] = X[col].astype(str).astype(shared_dtype)
    X_test[col] = X_test[col].astype(str).astype(shared_dtype)

# Requirement 5: Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print('Data shapes:')
print('X_train:', X_train.shape)
print('X_val:', X_val.shape)
print('y_train:', y_train.shape)
print('y_val:', y_val.shape)
print('X_test:', X_test.shape)

## 4. Modeling & Testing

In [None]:
# Requirement 6: Train a multi-output probabilistic model
lgb_params = {
    'objective': 'binary',
    # LightGBM's name for binary log loss is 'binary_logloss'; 'logloss' is
    # the XGBoost spelling and is not a documented LightGBM metric alias.
    'metric': 'binary_logloss',
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

base_estimator = lgb.LGBMClassifier(**lgb_params)

# One independent LightGBM classifier is fitted per target column.
model = MultiOutputClassifier(estimator=base_estimator)

model.fit(X_train, y_train)

print('Model training complete.')

# Score the hold-out split so the validation data is actually used.
# predict_proba returns one probability array per target, in the same order
# as the columns of y_train / y_val and as model.estimators_.
val_probabilities = model.predict_proba(X_val)
for target_name, estimator, target_proba in zip(
    y_val.columns, model.estimators_, val_probabilities
):
    # Passing the estimator's classes_ keeps the probability columns and the
    # label set aligned even if a class is absent from the validation rows.
    val_loss = log_loss(y_val[target_name], target_proba, labels=estimator.classes_)
    print(f'Validation log loss for {target_name}: {val_loss:.5f}')

## 5. Submission Formatting

In [None]:
# Requirement 7: Generate predictions for the Test.csv data
# MultiOutputClassifier.predict_proba gives a list with one array per target;
# column index 1 of each array is used as the submitted probability.
test_probabilities = model.predict_proba(X_test)
prob_07, prob_90, prob_120 = (test_probabilities[i][:, 1] for i in range(3))

# Requirement 7: Format the final CSV
# The AUC and LogLoss columns of each horizon carry the same probability.
submission_df = pd.DataFrame(
    {
        'Target_07_AUC': prob_07,
        'Target_90_AUC': prob_90,
        'Target_120_AUC': prob_120,
        'Target_07_LogLoss': prob_07,
        'Target_90_LogLoss': prob_90,
        'Target_120_LogLoss': prob_120,
    },
    index=X_test.index,
)

# Follow the sample submission's column order; its first column is skipped
# because the frame's index is written as the leading column of the CSV.
expected_order = sample_submission_df.columns[1:]
submission_df = submission_df.loc[:, expected_order]

submission_df.to_csv('submission.csv')

print('Submission file created successfully!')
submission_df.head()