In [81]:

import os
import sys


# Import data cleaning libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn. Consider narrowing
# the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

# Set working directory
# The project root defaults to the original author's checkout. To run the
# notebook from another location, set the CROP_YIELD_ROOT environment variable
# before starting the kernel instead of editing this cell.
PROJECT_ROOT = os.environ.get(
    'CROP_YIELD_ROOT', '/home/shaw/Documents/GitHub/crop-yield-estimate/')
os.chdir(PROJECT_ROOT)
# Put <root>/pipeline at the front of the import path so that the
# `from preprocessing import ...` statements below can be resolved.
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'pipeline'))


from preprocessing import dim_reduction
from preprocessing import feature_selection
from preprocessing import scaling
from preprocessing import feature_engineering
from preprocessing import cleaning


# Preprocess data
# `clean_data` builds the initial frame from both CSV paths; the remaining
# stages each take the current frame and return the next one, so they are
# applied in sequence.
train_path = "data/Train.csv"
test_path = "data/Test.csv"

preprocessing_stages = (
    feature_engineering.get_features,
    scaling.scale_features,
    feature_selection.select_features,
    dim_reduction.reduce_dim,
)

df = cleaning.clean_data(train_path, test_path)
for apply_stage in preprocessing_stages:
    df = apply_stage(df)


# Columns removed from the preprocessed frame before modelling.
# NOTE(review): the criteria used to pick these columns are not shown in this
# notebook - presumably the outcome of an earlier feature-selection run.
columns_to_drop = [
    'SeedlingsPerPit',
    'Ganaura',
    'CropOrgFYM',
    'NoFertilizerAppln',
    'BasalDAP',
    'BasalUrea',
    '2appDaysUrea',
    'Harv_hand_rent',
    'Residue_length',
    'TransplantingIrrigationHours_per_Acre',
    'TransIrriCost_per_Acre',
    'CropOrgFYM_per_Acre',
    'BasalDAP_per_Acre',
    'BasalUrea_per_Acre',
    '1tdUrea_per_Acre',
    'Harv_hand_rent_per_Acre',
    'TpIrrigationCost_Imputed_per_Acre',
    'Days_bw_SowTransp_Harv',
    'Days_bw_Harv_Thresh',
    'NursingDate_ModeDiff',
    'TillageDate_ModeDiff',
    'HarvestDate_ModeDiff',
    'ThreshingDate_ModeDiff',
    'Num_LandPrepMethod',
    'Num_CropbasalFerts',
    'Num_TopDressFert',
    'Latitude',
    'Longitude',
    'CropEstMethod_LineSowingAfterTillage',
    'Threshing_method_machine',
    'Stubble_use_plowed_in_soil',
    'LandPrepMethod_FourWheelTracRotavator_True',
    'LandPrepMethod_WetTillagePuddling_True',
    'NursDetFactor_PreMonsoonShowers_True',
    'NursDetFactor_LabourAvailability_True',
    'FirstTopDressFert_DAP_True',
    'HarvestMonth_November',
    'ThreshingMonth_January',
    'Block_Chehrakala',
    'PCropSolidOrgFertAppMethod_Broadcasting',
    'PCropSolidOrgFertAppMethod_SoilApplied',
    'MineralFertAppMethod_1_Broadcasting',
    'MineralFertAppMethod_1_SoilApplied',
    'PC4',
    'PC10',
    'PC21',
]

# The frame is modified in place; a missing column raises KeyError.
df.drop(columns=columns_to_drop, inplace=True)

# Split the combined frame on whether the target is present: rows that have a
# `Yield` value become `df_train`, rows without one become `df_test`.
# `.copy()` detaches each subset from `df`, so the clean-up below never runs
# on a slice of another frame (pandas can report that as
# SettingWithCopyWarning, which the global warnings filter would hide).
has_yield = df['Yield'].notna()
df_train = df[has_yield].copy()
df_test = df[~has_yield].copy()

# Within each subset, drop every column that still contains a missing value.
df_train = df_train.dropna(axis=1)
df_test = df_test.dropna(axis=1)

df_train = df_train.drop(columns=['PCropSolidOrgFertAppMethod_NaN_True',
                                  'PCropSolidOrgFertAppMethod_RootApplication'])

outcome_cols = ["New_Yield", "New_Yield_per_Acre"]

# Keep only the columns named PC<number>. A plain 'PC' prefix test would also
# match any `PCropSolidOrgFertAppMethod_*` column that is still present, so
# the pattern is anchored to "PC" followed by digits only.
pc_cols = df_train.filter(regex=r'^PC\d+$').columns.tolist()

# `df_train_PC` keeps its original layout: PC columns followed by the two
# outcome columns.
df_train_PC = df_train[pc_cols + outcome_cols]

# Features are the PC columns alone; the target is the per-acre yield.
# NOTE(review): the model is fit on `Yield_per_Acre`, while the evaluation
# cell further down compares against `New_Yield` - confirm this is intended.
X, y = df_train_PC.drop(columns=outcome_cols), df_train["Yield_per_Acre"]

In [82]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [83]:
# Leakage check: report any feature column whose name mentions 'Yield'.
yield_columns = list(filter(lambda name: 'Yield' in name, X.columns))
if not yield_columns:
    print("No columns with 'Yield'")
else:
    print("Columns with 'Yield':", yield_columns)

No columns with 'Yield'


In [84]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,PC1,PC2,PC3,PC5,PC6,PC7,PC8,PC9,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
2181,5.083096,2.493467,-1.411135,0.848582,0.377295,0.33418,-1.22361,-0.047263,-2.293631,-0.94042,0.380055,-1.650923,1.056912,0.260299,0.040615,0.060862,0.760774,-0.356835
416,-1.301525,-2.488164,0.917029,0.551083,-0.35945,-0.649583,-0.339823,-2.445524,0.035708,-0.955869,0.189115,0.933712,0.79805,0.844835,0.740963,-0.43738,-0.711326,-0.563341
1583,5.829986,1.29832,-1.285657,-1.976,-1.131087,-0.660654,3.04199,-0.350764,-0.298248,1.040839,1.490278,-1.142712,-0.603695,-0.223713,1.460308,-0.310301,-0.367809,-1.420912
217,-2.038552,-0.919338,-2.980304,-1.196671,1.34244,-2.634624,-1.014387,0.757789,0.880049,0.017374,1.865414,0.50373,-0.601573,0.531377,-0.848593,0.696443,0.270866,-0.165753
888,1.718242,0.206835,5.490007,-2.94393,0.126223,-1.488065,0.20018,1.723881,1.025447,0.704274,-0.112494,-0.718456,-0.361655,-0.196527,-0.421804,0.397109,-1.143486,-0.56836


In [85]:
# Preview the first training targets (values of `Yield_per_Acre`).
y_train.head()

2181      44.0
416     2640.0
1583    2200.0
217     1787.5
888     1760.0
Name: Yield_per_Acre, dtype: float64

In [86]:
# Hyper-parameters unpacked into the XGBoost meta-regressor further down.
# NOTE(review): presumably the result of an earlier search that is not part
# of this notebook - confirm where these values came from.
# 'lambda' is a reserved word in Python, so the two regularisation entries
# are given as string keys; the rest are added as keyword arguments.
best_params = {'alpha': 0, 'lambda': 0}
best_params.update(learning_rate=0.01, max_depth=5, n_estimators=100)

In [87]:
!pip install lightgbm
!pip install catboost



In [88]:


# Defining the pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Base learners whose predictions are fed to the final estimator.
estimators = [
    ('lgbm', LGBMRegressor()),
    ('catboost', CatBoostRegressor(verbose=False)),
]

# Final estimator: XGBoost configured with the `best_params` dictionary.
meta_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    **best_params,
)

stacked_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_regressor,
)

# Fit the whole stack on the training split. The call is left as the last
# expression so the fitted estimator is displayed as the cell output.
stacked_regressor.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 3096, number of used features: 18
[LightGBM] [Info] Start training from score 1948.530268


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 2476, number of used features: 18
[LightGBM] [Info] Start training from score 1931.258023
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 2477, number of used features: 18
[LightGBM] [Info] Start training from score 1953.977337
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 2477, number of used features: 18
[LightGBM] [Info] Start 

In [89]:
# Predict on the held-out split; the model was fit on `Yield_per_Acre`, so
# these are per-acre predictions.
y_pred = stacked_regressor.predict(X_test)


In [90]:
# Prints the complete prediction array.
# NOTE(review): consider showing a short slice or summary statistics instead.
print(y_pred)

[2148.8303  2242.3586  1769.661   1962.4869  1924.2653  2148.8303
 2007.2897  1941.557   1931.4414  1876.2191  1983.9581  1854.2358
 1876.2191  2242.3586  2183.4517  1826.89    1929.1704  1798.7147
 1968.1246  2065.169   1869.2109  2142.7275  1103.507   1768.2546
 1826.89    1789.664   1798.7147  2089.7278  1962.4869  1817.8392
 1948.3314  2046.6411  1968.3354  1777.3054  2480.8242  2148.8303
 2148.634   2081.942   1777.3054  1777.3054  2083.5347  2030.7693
 2460.0132  2213.93    2366.5764  2089.7278  2177.2585  2213.93
  814.26575 2009.1084  2100.0962  2075.7488  1876.2191  1856.4259
 2212.1409  2155.0234  1968.1246  1722.1244  1879.8413  1915.3842
 1890.6202  1870.5814  1826.89    1777.3054  1763.3217  1777.3054
 1947.1947  1962.4869  2049.627   2183.4517  1768.2546  1752.2782
 2012.733   1798.7147  1798.7147  1968.1246  1948.3314  2242.3586
 1948.3314  1760.6102  2279.556   1968.1246  2081.942   1918.7955
 3936.0527  2075.7488  2486.4482  2236.4521  2065.3257  1798.7147
 2525.8281  

In [91]:
# Row labels of the held-out split, used to look the same rows up in `df_train`.
indices = y_test.index.tolist()

In [92]:
# Hold-out RMSE, measured against `New_Yield`: each per-acre prediction is
# multiplied by the row's `Acre` value before the comparison.
# The root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both older and newer releases.
actual_yield = df_train.loc[indices, "New_Yield"]
predicted_yield = y_pred * df_train.loc[indices, "Acre"]
rmse = np.sqrt(mean_squared_error(actual_yield, predicted_yield))

In [93]:
# Report the hold-out RMSE computed in the previous cell.
print(rmse)

153.69059626484213
