In [30]:

import os
import sys



# Import data cleaning libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn import svm

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Set working directory
# Set this to your own path
os.chdir('/home/shaw/Documents/GitHub/crop-yield-estimate/')
# Set this to your own path
sys.path.insert(0, '/home/shaw/Documents/GitHub/crop-yield-estimate/pipeline')
from preprocessing import cleaning
from preprocessing import feature_engineering
from preprocessing import scaling
from preprocessing import feature_selection
from preprocessing import dim_reduction



# Import preprocessing libraries
# Import system libraries
# Preprocess data
train_path = "data/Train.csv"
test_path = "data/Test.csv"
df = cleaning.clean_data(train_path, test_path)
df = feature_engineering.get_features(df)
df = scaling.scale_features(df)
df = feature_selection.select_features(df)
df = dim_reduction.reduce_dim(df)


df.drop(['SeedlingsPerPit',
         'Ganaura',
         'CropOrgFYM',
         'NoFertilizerAppln',
         'BasalDAP',
         'BasalUrea',
         '2appDaysUrea',
         'Harv_hand_rent',
         'Residue_length',
         'TransplantingIrrigationHours_per_Acre',
         'TransIrriCost_per_Acre',
         'CropOrgFYM_per_Acre',
         'BasalDAP_per_Acre',
         'BasalUrea_per_Acre',
         '1tdUrea_per_Acre',
         'Harv_hand_rent_per_Acre',
         'TpIrrigationCost_Imputed_per_Acre',
         'Days_bw_SowTransp_Harv',
         'Days_bw_Harv_Thresh',
         'NursingDate_ModeDiff',
         'TillageDate_ModeDiff',
         'HarvestDate_ModeDiff',
         'ThreshingDate_ModeDiff',
         'Num_LandPrepMethod',
         'Num_CropbasalFerts',
         'Num_TopDressFert',
         'Latitude',
         'Longitude',
         'CropEstMethod_LineSowingAfterTillage',
         'Threshing_method_machine',
         'Stubble_use_plowed_in_soil',
         'LandPrepMethod_FourWheelTracRotavator_True',
         'LandPrepMethod_WetTillagePuddling_True',
         'NursDetFactor_PreMonsoonShowers_True',
         'NursDetFactor_LabourAvailability_True',
         'FirstTopDressFert_DAP_True',
         'HarvestMonth_November',
         'ThreshingMonth_January',
         'Block_Chehrakala',
         'PCropSolidOrgFertAppMethod_Broadcasting',
         'PCropSolidOrgFertAppMethod_SoilApplied',
         'MineralFertAppMethod_1_Broadcasting',
         'MineralFertAppMethod_1_SoilApplied',
         'PC4',
         'PC10',
         'PC21',], axis=1, inplace=True)

# Split data into training and test sets
df_train = df[df['Yield'].isna() == False]
df_test = df[df['Yield'].isna() == True]

outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]

df_train.dropna(axis=1, inplace=True)
df_test.dropna(axis=1, inplace=True)


df_train.drop(['PCropSolidOrgFertAppMethod_NaN_True',
              'PCropSolidOrgFertAppMethod_RootApplication'], axis=1, inplace=True)
df_train_PC = df_train.loc[:, df_train.columns.str.startswith('PC')]
df_train_PC = df_train_PC.join(df_train['New_Yield'])
df_train_PC = df_train_PC.join(df_train['New_Yield_per_Acre'])


# Split data
X, y = df_train_PC.iloc[:, :-2], df_train_PC.iloc[:, -2]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [31]:
# Lasso Regression - Without CV

from sklearn.linear_model import Lasso

reg_No_CV = Lasso(alpha=0.1)
reg_No_CV.fit(X_train, y_train)
y_pred_No_CV = reg_No_CV.predict(X_test)



In [32]:
indices = list(y_test.index)
lasso_RMSE_No_CV = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_No_CV * df_train.loc[indices]["Acre"], squared=False)

In [33]:
lasso_RMSE_No_CV


387.84917053153447

In [34]:
# Lasso Regression - With CV

from sklearn.model_selection import GridSearchCV

reg_CV = Lasso()
parameters = {'alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50], 'max_iter': [1000, 5000, 10000], 'tol': [0.0001, 0.001, 0.01], 'selection': ['cyclic', 'random']}
reg_CV = GridSearchCV(reg_CV, parameters, cv=5)
reg_CV.fit(X_train, y_train)
y_pred_CV = reg_CV.predict(X_test)


In [35]:
indices = list(y_test.index)
lasso_RMSE_CV = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_CV * df_train.loc[indices]["Acre"], squared=False)

In [36]:
lasso_RMSE_CV

388.6594556506128

In [37]:
# Lasso Regression - With Randomized CV

from sklearn.model_selection import RandomizedSearchCV

reg_Random_CV = Lasso()
parameters = {'alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50]}
reg_Random_CV = RandomizedSearchCV(reg_Random_CV, parameters, cv=5)
reg_Random_CV.fit(X_train, y_train)
y_pred_Random_CV = reg_Random_CV.predict(X_test)

In [38]:
indicies = list(y_test.index)
lasso_RMSE_Random_CV = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_Random_CV * df_train.loc[indices]["Acre"], squared=False)


In [39]:
lasso_RMSE_Random_CV

388.2436130456594