In [107]:

import os
import sys


# Import data cleaning libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Project location.
# The checkout path used to be hard-coded twice; it is now defined once and can
# be overridden with the CROP_YIELD_ROOT environment variable when the notebook
# runs on another machine. The default is the original path, so existing setups
# behave exactly as before.
PROJECT_ROOT = os.environ.get(
    'CROP_YIELD_ROOT',
    '/home/shaw/Documents/GitHub/crop-yield-estimate/',
)

# Relative paths used further down (data/..., submission/...) are resolved
# against the working directory, so switch to the project root first.
os.chdir(PROJECT_ROOT)

# Put <root>/pipeline at the front of the module search path so the
# `from preprocessing import ...` statements that follow can be resolved
# (presumably the `preprocessing` package lives there - verify on your checkout).
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'pipeline'))


from preprocessing import dim_reduction
from preprocessing import feature_selection
from preprocessing import scaling
from preprocessing import feature_engineering
from preprocessing import cleaning


# Preprocess data
# Input CSVs, relative to the working directory set above.
train_path = "data/Train.csv"
test_path = "data/Test.csv"

# Cleaning receives both file paths and returns the frame that every later
# stage transforms (presumably train and test rows combined, since the rows are
# split on a missing 'Yield' further down - confirm in preprocessing.cleaning).
df = cleaning.clean_data(train_path, test_path)

# The remaining stages each take the current frame and return the next one.
# Order matters: engineering -> scaling -> selection -> dimensionality reduction.
for stage in (
    feature_engineering.get_features,
    scaling.scale_features,
    feature_selection.select_features,
    dim_reduction.reduce_dim,
):
    df = stage(df)


# Columns removed from the preprocessed frame before modelling.
# NOTE(review): how this list was chosen is not visible in this notebook.
columns_to_drop = [
    'SeedlingsPerPit',
    'Ganaura',
    'CropOrgFYM',
    'NoFertilizerAppln',
    'BasalDAP',
    'BasalUrea',
    '2appDaysUrea',
    'Harv_hand_rent',
    'Residue_length',
    'TransplantingIrrigationHours_per_Acre',
    'TransIrriCost_per_Acre',
    'CropOrgFYM_per_Acre',
    'BasalDAP_per_Acre',
    'BasalUrea_per_Acre',
    '1tdUrea_per_Acre',
    'Harv_hand_rent_per_Acre',
    'TpIrrigationCost_Imputed_per_Acre',
    'Days_bw_SowTransp_Harv',
    'Days_bw_Harv_Thresh',
    'NursingDate_ModeDiff',
    'TillageDate_ModeDiff',
    'HarvestDate_ModeDiff',
    'ThreshingDate_ModeDiff',
    'Num_LandPrepMethod',
    'Num_CropbasalFerts',
    'Num_TopDressFert',
    'Latitude',
    'Longitude',
    'CropEstMethod_LineSowingAfterTillage',
    'Threshing_method_machine',
    'Stubble_use_plowed_in_soil',
    'LandPrepMethod_FourWheelTracRotavator_True',
    'LandPrepMethod_WetTillagePuddling_True',
    'NursDetFactor_PreMonsoonShowers_True',
    'NursDetFactor_LabourAvailability_True',
    'FirstTopDressFert_DAP_True',
    'HarvestMonth_November',
    'ThreshingMonth_January',
    'Block_Chehrakala',
    'PCropSolidOrgFertAppMethod_Broadcasting',
    'PCropSolidOrgFertAppMethod_SoilApplied',
    'MineralFertAppMethod_1_Broadcasting',
    'MineralFertAppMethod_1_SoilApplied',
    'PC4',
    'PC10',
    'PC21',
]

# Drop them from `df` itself (in place); a missing name raises KeyError.
df.drop(columns=columns_to_drop, inplace=True)

# Split data into training and test sets.
# Rows with a known 'Yield' are training rows; rows where it is missing are the
# rows to predict.
has_yield = df['Yield'].notna()

# Drop every column that contains at least one NaN within each subset.
# dropna() is called non-mutating on the filtered result, instead of in place
# on a slice of `df`, which is the chained-assignment pattern pandas warns about.
# On the test subset this also removes 'Yield' itself, which is NaN for every
# test row by construction of the mask.
df_train = df[has_yield].dropna(axis=1)
df_test = df[~has_yield].dropna(axis=1)

# Target-related columns that must never be used as features.
outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]

# Features are everything except the outcome columns; the regression target is
# the per-acre yield.
X = df_train.drop(columns=outcome_cols)
y = df_train["New_Yield_per_Acre"]

In [108]:
# Build the test feature matrix from exactly the training feature columns, in
# the training order. df_train and df_test had their NaN columns dropped
# independently, so nothing else guarantees the two frames share a schema;
# selecting by X.columns raises KeyError here if a training feature is missing
# from the test rows, and discards any column the model was not trained on.
X_test = df_test[X.columns]

In [109]:
# Sanity check: list any test-frame column whose name contains 'Yield'
# (a case-sensitive substring match) and report the result.
yield_columns = list(filter(lambda name: 'Yield' in name, df_test.columns))
if not yield_columns:
    print("No columns with 'Yield'")
else:
    print("Columns with 'Yield':", yield_columns)


No columns with 'Yield'


In [110]:
# Preview the first rows of the training feature matrix (shown as cell output).
X.head()

Unnamed: 0,CultLand,CropCultLand,CropTillageDepth,1tdUrea,2tdUrea,Residue_perc,Acre,Nb_of_NaN,TpIrrigationHours_Imputed,TpIrrigationCost_Imputed,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
0,0.560165,0.560362,0.571429,0.54662,-0.477626,1.0,0.3125,0.166667,-0.071751,-0.409783,...,-0.193645,0.149644,0.199306,-1.866192,-1.34172,-0.932724,-0.714486,-0.530509,-1.398849,-0.075302
1,-0.092617,0.039792,0.571429,1.089515,-0.477626,0.0,0.3125,0.166667,-0.071751,-0.608898,...,0.118576,0.067153,-0.720835,-0.088561,-0.41202,0.680368,-1.00396,-1.198571,0.147022,-0.409071
2,-0.642329,-0.555145,0.714286,-0.539171,-0.477626,0.0,0.148148,0.083333,-0.096613,-0.728366,...,-4.715935,0.59688,0.083233,2.303326,-1.506611,-0.438229,0.409927,-1.663447,-0.076183,-1.152174
3,-0.470544,-0.369227,0.714286,-0.539171,-0.477626,0.0,0.222222,0.5,-0.096613,-0.27704,...,-1.411941,-0.857906,-0.674739,0.817439,0.816465,0.626397,-1.763749,-0.833367,0.886651,0.753257
4,1.07552,1.304034,0.428571,2.175306,-0.477626,1.0,0.46875,0.166667,0.027695,-0.144297,...,0.662074,0.373149,0.697635,-1.219317,-1.185658,-0.384397,-0.851026,-0.808797,-0.249903,-0.305041


In [111]:
# Preview the first values of the training target, New_Yield_per_Acre.
y.head()

0    1920.000000
1    1920.000000
2    1518.750000
3    2106.000000
4    1173.333333
Name: New_Yield_per_Acre, dtype: float64

In [112]:
# Hyperparameters unpacked as keyword arguments into the XGBoost regressor
# built in the next cell.
# NOTE(review): presumably the outcome of an earlier tuning run - the search
# itself is not part of this notebook.
best_params = {
    'alpha': 0,
    'lambda': 0,
    'learning_rate': 0.1,
    'max_depth': 5,
    'n_estimators': 100,
}

In [113]:


# Define the stacked ensemble
from sklearn.ensemble import StackingRegressor

# Base learners: a linear-kernel support vector regressor and a Lasso model.
linear_svr = svm.SVR(kernel='linear', C=100, gamma='auto')
lasso_reg = Lasso(alpha=0.1)
estimators = [
    ('svm', linear_svr),
    ('lasso', lasso_reg),
]

# Final estimator: XGBoost regressor configured with the hyperparameters in
# `best_params`.
meta_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    **best_params,
)

stacked_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_regressor,
)

# Fit the ensemble on the training features and the per-acre yield target.
stacked_regressor.fit(X, y)


In [114]:
# Predict the target for the test rows with the fitted stacking ensemble.
# The model was fitted on New_Yield_per_Acre, so these are per-acre values.
y_pred = stacked_regressor.predict(X_test)


In [115]:
# Show the predicted per-acre yields (plain-text array summary).
print(y_pred)

[2175.9724 1825.5287 1721.6228 ... 1706.0623 1777.8738 2033.791 ]


In [116]:


# Make predictions
# Convert per-acre predictions into total yield by multiplying with each test
# row's acreage. The multiplication is positional on purpose: `y_pred` is a
# plain array in df_test row order, whereas df_test["Acre"] is a Series that
# carries df_test's index. Assigning such a Series to the submission frame
# (which gets a fresh 0..n-1 index from read_csv) would align on index labels,
# not on row order, and could silently produce NaN or shifted values.
test_pred = y_pred * df_test["Acre"].to_numpy()

# Add predictions to sample submission file
# NOTE(review): assumes SampleSubmission.csv lists the rows in the same order
# as df_test - confirm. A length mismatch raises ValueError on assignment.
df_test_pred = pd.read_csv("data/SampleSubmission.csv")
df_test_pred['Yield'] = test_pred

# NOTE(review): hard-coded x10 correction for a single ID; the reason is not
# documented in this notebook.
df_test_pred['Yield'] = np.where(df_test_pred['ID'] == 'ID_PMSOXFT4FYDW',
                                 df_test_pred['Yield'] * 10, df_test_pred['Yield'])

# Negative predictions are replaced by the constant 4.
# NOTE(review): the choice of 4 is not explained here.
df_test_pred['Yield'] = np.where(df_test_pred['Yield'] < 0, 4, df_test_pred['Yield'])

# Export submission (create the output directory on a fresh checkout)
os.makedirs('submission', exist_ok=True)
df_test_pred.to_csv('submission/SubmissionSHAW.csv', index=False)

print("---Predictions made---")

---Predictions made---


In [118]:
# `rmse` is not assigned in any visible cell (execution count 117 is absent),
# so this cell only worked thanks to leftover kernel state. Recompute it from
# names that are defined above so the notebook survives Restart & Run All.
# NOTE(review): this is the in-sample RMSE of the fitted ensemble on the
# training data. The original definition is not visible, so the printed value
# may differ from the earlier output - confirm which metric was intended (a
# held-out or cross-validated estimate would be more informative).
rmse = np.sqrt(mean_squared_error(y, stacked_regressor.predict(X)))
print(rmse)

137.9559767490829


In [121]:
from joblib import dump

# Persist the fitted stacking ensemble to the working directory. The call stays
# the last expression of the cell so the list of written file names is shown.
dump(value=stacked_regressor, filename='stacked_regressor.joblib')

['stacked_regressor.joblib']