In [75]:
import os
import sys


# Import data cleaning libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn import svm

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Set working directory
# Set this to your own path
os.chdir('/home/shaw/Documents/GitHub/crop-yield-estimate/')
# Set this to your own path
sys.path.insert(0, '/home/shaw/Documents/GitHub/crop-yield-estimate/pipeline')


from preprocessing import cleaning
from preprocessing import feature_engineering
from preprocessing import scaling
from preprocessing import feature_selection
from preprocessing import dim_reduction

# Import preprocessing libraries
# Import system libraries
# Preprocess data
train_path = "data/Train.csv"
test_path = "data/Test.csv"
df = cleaning.clean_data(train_path, test_path)
df = feature_engineering.get_features(df)
df = scaling.scale_features(df)
df = feature_selection.select_features(df)
df = dim_reduction.reduce_dim(df)


df.drop(['SeedlingsPerPit',
               'Ganaura',
               'CropOrgFYM',
               'NoFertilizerAppln',
               'BasalDAP',
               'BasalUrea',
               '2appDaysUrea',
               'Harv_hand_rent',
               'Residue_length',
               'TransplantingIrrigationHours_per_Acre',
               'TransIrriCost_per_Acre',
               'CropOrgFYM_per_Acre',
               'BasalDAP_per_Acre',
               'BasalUrea_per_Acre',
               '1tdUrea_per_Acre',
               'Harv_hand_rent_per_Acre',
               'TpIrrigationCost_Imputed_per_Acre',
               'Days_bw_SowTransp_Harv',
               'Days_bw_Harv_Thresh',
               'NursingDate_ModeDiff',
               'TillageDate_ModeDiff',
               'HarvestDate_ModeDiff',
               'ThreshingDate_ModeDiff',
               'Num_LandPrepMethod',
               'Num_CropbasalFerts',
               'Num_TopDressFert',
               'Latitude',
               'Longitude',
               'CropEstMethod_LineSowingAfterTillage',
               'Threshing_method_machine',
               'Stubble_use_plowed_in_soil',
               'LandPrepMethod_FourWheelTracRotavator_True',
               'LandPrepMethod_WetTillagePuddling_True',
               'NursDetFactor_PreMonsoonShowers_True',
               'NursDetFactor_LabourAvailability_True',
               'FirstTopDressFert_DAP_True',
               'HarvestMonth_November',
               'ThreshingMonth_January',
               'Block_Chehrakala',
               'PCropSolidOrgFertAppMethod_Broadcasting',
               'PCropSolidOrgFertAppMethod_SoilApplied',
               'MineralFertAppMethod_1_Broadcasting',
               'MineralFertAppMethod_1_SoilApplied',
               'PC4',
               'PC10',
               'PC21',], axis=1, inplace=True)

# Split data into training and test sets
df_train = df[df['Yield'].isna() == False]
df_test = df[df['Yield'].isna() == True]

outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]


In [76]:
df_train.head()

Unnamed: 0,CultLand,CropCultLand,CropTillageDepth,TransplantingIrrigationHours,TransIrriCost,StandingWater,1tdUrea,1appDaysUrea,2tdUrea,Residue_perc,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
0,0.560165,0.560362,0.571429,-0.096048,-0.443788,0.052632,0.54662,0.22973,-0.477626,1.0,...,0.313171,0.227473,0.040736,-1.084199,-0.568745,1.979507,0.278055,-1.677674,-0.255236,-1.881702
1,-0.092617,0.039792,0.571429,-0.096048,-0.623574,0.105263,1.089515,0.513514,-0.477626,0.0,...,0.026647,-0.697497,-0.474871,1.357698,-0.403206,0.236363,-0.90775,-0.724912,0.558819,0.072024
2,-0.642329,-0.555145,0.714286,-0.132006,-0.731445,0.052632,-0.539171,0.864865,-0.477626,0.0,...,-1.361937,2.96561,0.35697,1.638507,2.097583,-0.005524,-0.386158,-1.035751,0.92873,1.768059
3,-0.470544,-0.369227,0.714286,,,,-0.539171,0.054054,-0.477626,0.0,...,-1.543871,1.11513,-0.10979,0.566342,0.104613,-0.875979,-1.741405,-0.247138,0.543535,-2.730283
4,1.07552,1.304034,0.428571,0.047782,-0.204073,0.052632,2.175306,0.337838,-0.477626,1.0,...,0.884364,-0.937253,0.478405,-0.075504,-0.816697,1.171539,-0.556823,-1.197638,0.432311,-1.230807


# I DROPPED EVERTHING APART FROM PC COLUMNS

In [77]:
df_train.dropna(axis=1, inplace=True)
df_test.dropna(axis=1, inplace=True)


df_train.drop(['PCropSolidOrgFertAppMethod_NaN_True',
              'PCropSolidOrgFertAppMethod_RootApplication'], axis=1, inplace=True)
df_train_PC = df_train.loc[:, df_train.columns.str.startswith('PC')]
df_train_PC = df_train_PC.join(df_train['New_Yield'])
df_train_PC = df_train_PC.join(df_train['New_Yield_per_Acre'])



# Split data
X, y = df_train_PC.iloc[:, :-2], df_train_PC.iloc[:, -2]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [78]:
X.head()

Unnamed: 0,PC1,PC2,PC3,PC5,PC6,PC7,PC8,PC9,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
0,1.029525,-2.151299,1.787111,-1.281319,0.653851,-1.213776,-0.104256,0.209215,0.313171,0.227473,0.040736,-1.084199,-0.568745,1.979507,0.278055,-1.677674,-0.255236,-1.881702
1,3.233728,-1.034569,1.367611,-0.763216,0.083424,-1.437855,-0.043809,-0.54112,0.026647,-0.697497,-0.474871,1.357698,-0.403206,0.236363,-0.90775,-0.724912,0.558819,0.072024
2,2.623609,4.202947,-1.412242,1.01853,0.322129,-1.569942,-0.405476,-3.451308,-1.361937,2.96561,0.35697,1.638507,2.097583,-0.005524,-0.386158,-1.035751,0.92873,1.768059
3,2.563425,4.366347,0.531604,-0.300522,0.089895,0.048274,1.110633,0.339399,-1.543871,1.11513,-0.10979,0.566342,0.104613,-0.875979,-1.741405,-0.247138,0.543535,-2.730283
4,3.675738,-1.461399,2.756269,0.470638,0.185668,-1.70879,-0.379333,-0.931436,0.884364,-0.937253,0.478405,-0.075504,-0.816697,1.171539,-0.556823,-1.197638,0.432311,-1.230807


In [79]:
y.head()

0    600.0
1    600.0
2    225.0
3    468.0
4    550.0
Name: New_Yield, dtype: float64

In [80]:
# Linear Kernel
lin = svm.SVR(kernel='linear', C=100, gamma='auto') 
lin.fit(X_train, y_train)
y_pred_lin = lin.predict(X_test)

In [84]:
# Polynomial Kernel
poly = svm.SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=0.1,
               coef0=1)
poly.fit(X_train, y_train)
y_pred_poly = poly.predict(X_test)

In [87]:
# Non-Linear Kernel

rbf = svm.SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
rbf.fit(X_train, y_train)
y_pred_rbf = rbf.predict(X_test)

In [81]:
indices = list(y_test.index)

In [82]:

# Evaluate the best model using RMSE (Root Mean Squared Error)
lin_rmse = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_lin * df_train.loc[indices]["Acre"], squared=False)


In [88]:
rbf_rmse = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_rbf * df_train.loc[indices]["Acre"], squared=False)

In [85]:
poly_rmse = mean_squared_error(
    df_train.loc[indices]["New_Yield"], y_pred_poly * df_train.loc[indices]["Acre"], squared=False)

In [83]:
print("Linear RMSE: ", lin_rmse)

Linear RMSE:  398.12279000554355


In [89]:
print("RBF RMSE: ", rbf_rmse)

RBF RMSE:  464.16697329797097


In [86]:
print("Polynomial RMSE: ", poly_rmse)

Polynomial RMSE:  2107.572541608834
