# DON'T EDIT - MESSAGE TO MYSELF

In [1]:
# Import system libraries

import os
import sys

# Import data cleaning libraries
import pandas as pd
import numpy as np
import calendar
from datetime import datetime

# Import machine learning libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Set working directory
# The repository root defaults to the original author's checkout. Set the
# CROP_YIELD_ROOT environment variable to point at your own clone instead of
# editing this cell.
PROJECT_ROOT = os.environ.get(
    'CROP_YIELD_ROOT', '/home/shaw/Documents/GitHub/crop-yield-estimate/')
os.chdir(PROJECT_ROOT)
# Put the repository's `pipeline` directory first on the import path
# (presumably where the `preprocessing` package imported below lives).
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'pipeline'))

# Import preprocessing libraries
from preprocessing import clustering
from preprocessing import dim_reduction
from preprocessing import feature_selection
from preprocessing import scaling
from preprocessing import feature_engineering
from preprocessing import cleaning

# Preprocess data
train_path = "data/Train.csv"
test_path = "data/Test.csv"

# Build the initial frame from both CSV paths, then pass it through every
# remaining preprocessing stage in order; each stage receives the current
# `df` and its return value becomes the new `df`.
df = cleaning.clean_data(train_path, test_path)
for preprocessing_step in (
    feature_engineering.get_features,
    scaling.scale_features,
    feature_selection.select_features,
    dim_reduction.reduce_dim,
    clustering.get_clusters,
):
    df = preprocessing_step(df)


# Split data into training and test sets
# Rows with a non-missing 'Yield' form the training set; rows where 'Yield'
# is missing form the test set.
has_yield = df['Yield'].notna()
df_train = df[has_yield]
df_test = df[~has_yield]

In [2]:
# Columns that must never be used as predictors, even when they appear in the
# candidate list below.
columns_to_remove = [
    'SeedlingsPerPit', 'TransplantingIrrigationHours', 'TransIrriCost',
    'StandingWater', '1appDaysUrea', '2appDaysUrea',
    'TransplantingIrrigationHours_per_Acre', 'TransIrriCost_per_Acre',
    'TransplantingIrrigationHours_per_Acre_capped',
    'TransIrriCost_per_Acre_capped', 'Days_bw_Nurs_SowTransp',
    'Days_bw_Nurs_Harv', 'Days_bw_Nurs_Till', 'NursingDate_ModeDiff',
    'Days_bw_Nurs_SowTransp_ModeDiff', 'Days_bw_Nurs_Harv_ModeDiff',
    'Days_bw_Nurs_Till_ModeDiff', '2appDaysUrea_MeanDiff',
]

# Candidate predictor columns before the exclusions are applied.
# Not included (kept commented out in the original list):
# 'Error_Prediction', 'Block_Prediction', 'Linear_Yield_Prediction'
candidate_cols = [
    'SeedlingsPerPit', 'Ganaura', 'CropOrgFYM', 'NoFertilizerAppln',
    'BasalDAP', 'BasalUrea', '2appDaysUrea', 'Harv_hand_rent',
    'Residue_length', 'TransplantingIrrigationHours_per_Acre',
    'TransIrriCost_per_Acre', 'CropOrgFYM_per_Acre', 'BasalDAP_per_Acre',
    'BasalUrea_per_Acre', '1tdUrea_per_Acre', 'Harv_hand_rent_per_Acre',
    'TpIrrigationCost_Imputed_per_Acre', 'Days_bw_SowTransp_Harv',
    'Days_bw_Harv_Thresh', 'NursingDate_ModeDiff', 'TillageDate_ModeDiff',
    'HarvestDate_ModeDiff', 'ThreshingDate_ModeDiff', 'Num_LandPrepMethod',
    'Num_CropbasalFerts', 'Num_TopDressFert', 'Latitude', 'Longitude',
    'CropEstMethod_LineSowingAfterTillage', 'Threshing_method_machine',
    'Stubble_use_plowed_in_soil', 'LandPrepMethod_FourWheelTracRotavator_True',
    'LandPrepMethod_WetTillagePuddling_True',
    'NursDetFactor_PreMonsoonShowers_True',
    'NursDetFactor_LabourAvailability_True', 'FirstTopDressFert_DAP_True',
    'HarvestMonth_November', 'ThreshingMonth_January', 'Block_Chehrakala',
    'PCropSolidOrgFertAppMethod_Broadcasting',
    'PCropSolidOrgFertAppMethod_SoilApplied',
    'MineralFertAppMethod_1_Broadcasting',
    'MineralFertAppMethod_1_SoilApplied', 'PC4', 'PC10', 'PC21',
    'top_shapley_k2_label_1', 'TpIrrigationHours_Imputed',
    'TpIrrigationCost_Imputed', 'SeedlingsPerPit_Imputed',
    'NursingDate_ModeDiff_Imputed', '2appDaysUrea_Imputed',
]

# Keep the candidates in their original order, dropping every excluded name.
top_cols = [name for name in candidate_cols if name not in columns_to_remove]


In [55]:
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVC
import shap
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb


# Split data
outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]

# Predictors are restricted to the selected columns. The target is the
# per-acre yield, so predictions are multiplied by 'Acre' further down.
X = df_train.drop(outcome_cols, axis=1)[top_cols]
y = df_train["New_Yield_per_Acre"]

# The test-set predictors and acreage do not change between folds, so they
# are built once here rather than on every loop iteration.
test_predictors = df_test.drop(outcome_cols, axis=1)[top_cols]
test_acres = df_test["Acre"]

# Hyper-parameters for the XGBoost member of the ensemble (fold-independent).
best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100,
               'alpha': 0, 'lambda': 0}

# Post-processing applied to the submission.
# NOTE(review): why this single ID is scaled by 10 is not visible in this
# notebook -- confirm before reusing.
SCALED_ID = 'ID_PMSOXFT4FYDW'
SCALED_ID_FACTOR = 10
YIELD_FLOOR = 4


# Initialize an array to store fold-wise predictions
k = 5
fold_wise_predictions = np.zeros((len(df_test), k))

# Define number of splits for k-fold cross-validation
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Fit a fresh voting ensemble on the training part of each fold and predict
# the test set with it. The held-out part of each fold is not evaluated here;
# the folds only vary which training rows each ensemble sees.
for i, (train_idx, _) in enumerate(kfold.split(X)):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]

    # Instantiate an XGBoost regressor model
    xgb_reg = xgb.XGBRegressor(**best_params, colsample_bytree=0.3)

    # Tuned Tree estimators
    lgbm = LGBMRegressor(boosting_type='goss', n_estimators=1000, learning_rate=0.08, num_leaves=100, max_depth=7)
    catboost = CatBoostRegressor(depth=10, iterations=1000, learning_rate=0.5, l2_leaf_reg=5)

    # Define the VotingRegressor; weights follow the order of `estimators`.
    voting_regressor = VotingRegressor(
        estimators=[
            ('lgbm', lgbm),
            ('catboost', catboost),
            ('xgboost', xgb_reg)
        ], weights=[2, 3, 1]
    )

    # Fit the model
    voting_regressor.fit(X_tr, y_tr)

    # Per-acre predictions scaled by each test row's 'Acre' value
    test_folds_pred = voting_regressor.predict(test_predictors) * test_acres

    # Store fold-wise predictions
    fold_wise_predictions[:, i] = test_folds_pred


# Calculate the average of predictions from each fold for each row
final_predictions = np.mean(fold_wise_predictions, axis=1)


# Add predictions to sample submission file
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission rows are in the same order as df_test -- confirm.
df_test_pred = pd.read_csv("data/SampleSubmission.csv")
df_test_pred['Yield'] = final_predictions
df_test_pred['Yield'] = np.where(df_test_pred['ID'] == SCALED_ID,
                                 df_test_pred['Yield'] * SCALED_ID_FACTOR,
                                 df_test_pred['Yield'])

# Apply the floor: no predicted yield may fall below YIELD_FLOOR
df_test_pred['Yield'] = df_test_pred['Yield'].clip(lower=YIELD_FLOOR)

# Export submission
df_test_pred.to_csv('submission/SubmissionShawFinalRepeatTest.csv', index=False)

print("---Predictions made---")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2722
[LightGBM] [Info] Number of data points in the train set: 3096, number of used features: 47
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1882.305577
0:	learn: 420.5803283	total: 25.9ms	remaining: 25.8s
1:	learn: 361.1827531	total: 52.6ms	remaining: 26.2s
2:	learn: 329.4257069	total: 77.5ms	remaining: 25.7s
3:	learn: 310.0241850	total: 99.9ms	remaining: 24.9s
4:	learn: 298.2179786	total: 122ms	remaining: 24.3s
5:	learn: 290.1641301	total: 146ms	remaining: 24.1s
6:	learn: 278.7310947	total: 169ms	remaining: 24s
7:	learn: 274.6213261	total: 192ms	remaining: 23.8s
8:	learn: 268.1978004	total: 215ms	remaining: 23.7s
9:	learn: 265.6546546	total: 239ms	remaining: 23.6s
10:	learn: 263.8419652	total: 262ms	remaining: 23.6s
11:	learn: 261.7740315	total: 285ms	remaining: 23.5s

# EDIT THIS ONE !!!!

In [3]:
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
import shap
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor
from joblib import load

# Load the model
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load files that come from a trusted source.
MODEL_PATH = '/home/shaw/Documents/GitHub/crop-yield-estimate/voting_regressor_model_9147.88969046982.pkl'
model = load(MODEL_PATH)


# Split data
outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]

# Predictors are restricted to the selected columns. The target is the
# per-acre yield, so predictions are multiplied by 'Acre' further down.
X = df_train.drop(outcome_cols, axis=1)[top_cols]
y = df_train["New_Yield_per_Acre"]

# The test-set predictors and acreage do not change between folds, so they
# are built once here rather than on every loop iteration.
test_predictors = df_test.drop(outcome_cols, axis=1)[top_cols]
test_acres = df_test["Acre"]

# Post-processing applied to the submission.
# NOTE(review): why this single ID is scaled by 10 is not visible in this
# notebook -- confirm before reusing.
SCALED_ID = 'ID_PMSOXFT4FYDW'
SCALED_ID_FACTOR = 10
YIELD_FLOOR = 4


# Initialize an array to store fold-wise predictions
k = 5
fold_wise_predictions = np.zeros((len(df_test), k))

# Define number of splits for k-fold cross-validation
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Refit the loaded model on the training part of each fold and predict the
# test set with it. The held-out part of each fold is not evaluated here.
# NOTE(review): `fit` retrains the loaded estimator on every fold (the captured
# output shows training logs), so the fitted state stored in the pickle is not
# what produces these predictions -- confirm this is intended.
for i, (train_idx, _) in enumerate(kfold.split(X)):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]

    # Fit the model
    model.fit(X_tr, y_tr)

    # Per-acre predictions scaled by each test row's 'Acre' value
    test_folds_pred = model.predict(test_predictors) * test_acres

    # Store fold-wise predictions
    fold_wise_predictions[:, i] = test_folds_pred


# Calculate the average of predictions from each fold for each row
final_predictions = np.mean(fold_wise_predictions, axis=1)


# Add predictions to sample submission file
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission rows are in the same order as df_test -- confirm.
df_test_pred = pd.read_csv("data/SampleSubmission.csv")
df_test_pred['Yield'] = final_predictions
df_test_pred['Yield'] = np.where(df_test_pred['ID'] == SCALED_ID,
                                 df_test_pred['Yield'] * SCALED_ID_FACTOR,
                                 df_test_pred['Yield'])

# Apply the floor: no predicted yield may fall below YIELD_FLOOR
df_test_pred['Yield'] = df_test_pred['Yield'].clip(lower=YIELD_FLOOR)

# Export submission
df_test_pred.to_csv(
    'submission/SubmissionShawFinalRepeatTest.csv', index=False)

print("---Predictions made---")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2722
[LightGBM] [Info] Number of data points in the train set: 3096, number of used features: 47
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1882.305577
0:	learn: 420.5803283	total: 88.9ms	remaining: 1m 28s
1:	learn: 361.1827531	total: 126ms	remaining: 1m 2s
2:	learn: 329.4446802	total: 162ms	remaining: 54s
3:	learn: 310.0398329	total: 200ms	remaining: 49.9s
4:	learn: 298.2077470	total: 237ms	remaining: 47.3s
5:	learn: 290.1489134	total: 272ms	remaining: 45.1s
6:	learn: 278.4261555	total: 305ms	remaining: 43.2s
7:	learn: 274.2498673	total: 344ms	remaining: 42.6s
8:	learn: 271.3342428	total: 379ms	remaining: 41.8s
9:	learn: 263.3738750	total: 414ms	remaining: 41s
10:	learn: 261.4567312	total: 449ms	remaining

In [4]:
# NOTE(review): this cell depends on state left behind by the previous cell:
# `model`, `test_predictors`, `fold_wise_predictions`, and the loop index `i`
# (whatever value the fold loop last assigned). It cannot run on a fresh
# kernel by itself.
per_acre_pred = model.predict(test_predictors)
test_folds_pred = per_acre_pred * df_test["Acre"]

# Overwrite column `i` of the fold-wise prediction matrix
fold_wise_predictions[:, i] = test_folds_pred


# Row-wise mean across the fold columns
final_predictions = fold_wise_predictions.mean(axis=1)


# Fill the sample submission with the averaged predictions
df_test_pred = pd.read_csv("data/SampleSubmission.csv")
df_test_pred['Yield'] = final_predictions

# One specific ID has its prediction multiplied by 10
# (NOTE(review): the reason is not visible in this notebook).
is_scaled_id = df_test_pred['ID'] == 'ID_PMSOXFT4FYDW'
df_test_pred['Yield'] = np.where(
    is_scaled_id, df_test_pred['Yield'] * 10, df_test_pred['Yield'])

# Floor value: any prediction at or below 4 becomes 4
df_test_pred['Yield'] = df_test_pred['Yield'].clip(lower=4)

# Export submission
df_test_pred.to_csv(
    'submission/SubmissionShawFinalRepeatTest.csv', index=False)

print("---Predictions made---")


# Compare the submission written above with the existing
# SubmissionShawFinal.csv by summing the absolute per-row gap in 'Yield'.
SubmissionShawFinalRepeatTest = pd.read_csv(
    'submission/SubmissionShawFinalRepeatTest.csv')
SubmissionShawFinal = pd.read_csv('submission/SubmissionShawFinal.csv')

# NOTE(review): `subtract` aligns the two Series on their row index, so rows
# are compared by position in the file, not by 'ID' -- confirm both files
# share the same row order.
difference = SubmissionShawFinal['Yield'].subtract(
    SubmissionShawFinalRepeatTest['Yield'])
positive_difference = difference.abs()
# Series.sum() already returns a scalar, so a single call is sufficient.
sum_of_positive_difference = positive_difference.sum()

In [5]:
# Show the summed absolute 'Yield' difference between the two submission
# files; 0 would mean the two files contain identical predictions.
print(sum_of_positive_difference)

8989.400778719299
