In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
# Raise pandas' display limits so long/wide frames render without truncation.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [3]:
def basic_details(df):
    """Build a per-column summary table for ``df``.

    The returned DataFrame is indexed by the column names of ``df`` and holds:

    * ``Missing, %`` / ``Zero, %`` - share of null / zero-valued cells (2 decimals);
    * ``N unique value`` - result of ``df.nunique()``;
    * ``dtype`` - the column dtype;
    * for numeric columns only: ``Min``, ``1%``, ``25%``, ``Mean``, ``Median``,
      ``75%``, ``995%``, ``Max`` and ``Std``. Min, Mean, Median, Max and Std are
      rounded to 2 decimals, the quantiles are left unrounded. Rows of
      non-numeric columns stay empty (NaN) in these fields.

    Duplicate rows are not reported by this function.
    """
    n_rows = df.shape[0]
    summary = pd.DataFrame()

    # Statistics that make sense for every column, whatever its dtype.
    summary['Missing, %'] = round(df.isnull().sum() / n_rows * 100, 2)
    summary['Zero, %'] = round(df.isin([0]).sum() / n_rows * 100, 2)
    summary['N unique value'] = df.nunique()
    summary['dtype'] = df.dtypes

    # Distribution statistics are computed on the numeric columns only.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric = df[numeric_cols]
    distribution_stats = (
        ('Min', lambda frame: frame.min().round(2)),
        ('1%', lambda frame: frame.quantile(0.01)),
        ('25%', lambda frame: frame.quantile(0.25)),
        ('Mean', lambda frame: frame.mean().round(2)),
        ('Median', lambda frame: frame.median().round(2)),
        ('75%', lambda frame: frame.quantile(0.75)),
        # NOTE(review): this label holds the 0.995 quantile (99.5th percentile);
        # the label text is kept as-is so existing lookups keep working.
        ('995%', lambda frame: frame.quantile(0.995)),
        ('Max', lambda frame: frame.max().round(2)),
        ('Std', lambda frame: frame.std().round(2)),
    )
    for label, compute in distribution_stats:
        summary.loc[numeric_cols, label] = compute(numeric)
    return summary

In [4]:
# Load the train and test datasets.
# NOTE: DATA_DIR is machine-specific - point it at the folder that holds
# train.csv and test.csv before running on another machine.
DATA_DIR = Path(r'F:\DS\Kernel\data')
df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')
# Show both shapes as the cell output for a quick sanity check.
df_train.shape, df_test.shape

((5734, 162), (2192, 162))

In [5]:
# Count fully duplicated rows (train first, then test).
tuple(frame.duplicated().sum() for frame in (df_train, df_test))

(47, 0)

In [6]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
# The surviving rows keep their original index labels (no reset).
df_train = df_train[~df_train.duplicated()]

In [7]:
# Take a closer look at rows that share their (Field, Year) key with at least
# one other row. Sorting on both key columns keeps every duplicate group
# together in the preview; sorting on 'Field' alone could interleave the
# different years of one field.
(
    df_train[df_train.duplicated(subset=['Field', 'Year'], keep=False)]
    .sort_values(by=['Field', 'Year'])
    .head(6)
)

Unnamed: 0,Field,Year,Cluster,Area,Yield,Geozone,Predecessor,FAO,N_kg/ga,P_kg/ga,K_kg/ga,5_YEARLY_median_AVERAGE_TEMPERATURE,6_YEARLY_median_AVERAGE_TEMPERATURE,7_YEARLY_median_AVERAGE_TEMPERATURE,8_YEARLY_median_AVERAGE_TEMPERATURE,9_YEARLY_median_AVERAGE_TEMPERATURE,5_YEARLY_median_EVAPORATION,6_YEARLY_median_EVAPORATION,7_YEARLY_median_EVAPORATION,8_YEARLY_median_EVAPORATION,9_YEARLY_median_EVAPORATION,5_YEARLY_median_HUMIDITY(%),6_YEARLY_median_HUMIDITY(%),7_YEARLY_median_HUMIDITY(%),8_YEARLY_median_HUMIDITY(%),9_YEARLY_median_HUMIDITY(%),5_YEARLY_sum_CALCULATED_RADIATION,6_YEARLY_sum_CALCULATED_RADIATION,7_YEARLY_sum_CALCULATED_RADIATION,8_YEARLY_sum_CALCULATED_RADIATION,9_YEARLY_sum_CALCULATED_RADIATION,5_YEARLY_sum_RAINFALL,6_YEARLY_sum_RAINFALL,7_YEARLY_sum_RAINFALL,8_YEARLY_sum_RAINFALL,9_YEARLY_sum_RAINFALL,22_WEEK_AVERAGE_TEMPERATURE,23_WEEK_AVERAGE_TEMPERATURE,24_WEEK_AVERAGE_TEMPERATURE,25_WEEK_AVERAGE_TEMPERATURE,26_WEEK_AVERAGE_TEMPERATURE,27_WEEK_AVERAGE_TEMPERATURE,28_WEEK_AVERAGE_TEMPERATURE,29_WEEK_AVERAGE_TEMPERATURE,30_WEEK_AVERAGE_TEMPERATURE,31_WEEK_AVERAGE_TEMPERATURE,32_WEEK_AVERAGE_TEMPERATURE,33_WEEK_AVERAGE_TEMPERATURE,34_WEEK_AVERAGE_TEMPERATURE,35_WEEK_AVERAGE_TEMPERATURE,36_WEEK_AVERAGE_TEMPERATURE,37_WEEK_AVERAGE_TEMPERATURE,38_WEEK_AVERAGE_TEMPERATURE,39_WEEK_AVERAGE_TEMPERATURE,22_WEEK_CALCULATED_RADIATION,23_WEEK_CALCULATED_RADIATION,24_WEEK_CALCULATED_RADIATION,25_WEEK_CALCULATED_RADIATION,26_WEEK_CALCULATED_RADIATION,27_WEEK_CALCULATED_RADIATION,28_WEEK_CALCULATED_RADIATION,29_WEEK_CALCULATED_RADIATION,30_WEEK_CALCULATED_RADIATION,31_WEEK_CALCULATED_RADIATION,32_WEEK_CALCULATED_RADIATION,33_WEEK_CALCULATED_RADIATION,34_WEEK_CALCULATED_RADIATION,35_WEEK_CALCULATED_RADIATION,36_WEEK_CALCULATED_RADIATION,37_WEEK_CALCULATED_RADIATION,38_WEEK_CALCULATED_RADIATION,39_WEEK_CALCULATED_RADIATION,22_WEEK_EVAPORATION,23_WEEK_EVAPORATION,24_WEEK_EVAPORATION,25_WEEK_EVAPORATION,26_WEEK_EVAPORATION,27_WEEK_EVAPORATION,28_WEEK_EVAPORATION,29_WE
EK_EVAPORATION,30_WEEK_EVAPORATION,31_WEEK_EVAPORATION,32_WEEK_EVAPORATION,33_WEEK_EVAPORATION,34_WEEK_EVAPORATION,35_WEEK_EVAPORATION,36_WEEK_EVAPORATION,37_WEEK_EVAPORATION,38_WEEK_EVAPORATION,39_WEEK_EVAPORATION,22_WEEK_HUMIDITY(%),23_WEEK_HUMIDITY(%),24_WEEK_HUMIDITY(%),25_WEEK_HUMIDITY(%),26_WEEK_HUMIDITY(%),27_WEEK_HUMIDITY(%),28_WEEK_HUMIDITY(%),29_WEEK_HUMIDITY(%),30_WEEK_HUMIDITY(%),31_WEEK_HUMIDITY(%),32_WEEK_HUMIDITY(%),33_WEEK_HUMIDITY(%),34_WEEK_HUMIDITY(%),35_WEEK_HUMIDITY(%),36_WEEK_HUMIDITY(%),37_WEEK_HUMIDITY(%),38_WEEK_HUMIDITY(%),39_WEEK_HUMIDITY(%),22_WEEK_RAINFALL,23_WEEK_RAINFALL,24_WEEK_RAINFALL,25_WEEK_RAINFALL,26_WEEK_RAINFALL,27_WEEK_RAINFALL,28_WEEK_RAINFALL,29_WEEK_RAINFALL,30_WEEK_RAINFALL,31_WEEK_RAINFALL,32_WEEK_RAINFALL,33_WEEK_RAINFALL,34_WEEK_RAINFALL,35_WEEK_RAINFALL,36_WEEK_RAINFALL,37_WEEK_RAINFALL,38_WEEK_RAINFALL,39_WEEK_RAINFALL,22_WEEK_SUM_ACTIVE_TEMPERATURE,23_WEEK_SUM_ACTIVE_TEMPERATURE,24_WEEK_SUM_ACTIVE_TEMPERATURE,25_WEEK_SUM_ACTIVE_TEMPERATURE,26_WEEK_SUM_ACTIVE_TEMPERATURE,27_WEEK_SUM_ACTIVE_TEMPERATURE,28_WEEK_SUM_ACTIVE_TEMPERATURE,29_WEEK_SUM_ACTIVE_TEMPERATURE,30_WEEK_SUM_ACTIVE_TEMPERATURE,31_WEEK_SUM_ACTIVE_TEMPERATURE,32_WEEK_SUM_ACTIVE_TEMPERATURE,33_WEEK_SUM_ACTIVE_TEMPERATURE,34_WEEK_SUM_ACTIVE_TEMPERATURE,35_WEEK_SUM_ACTIVE_TEMPERATURE,36_WEEK_SUM_ACTIVE_TEMPERATURE,37_WEEK_SUM_ACTIVE_TEMPERATURE,38_WEEK_SUM_ACTIVE_TEMPERATURE,39_WEEK_SUM_ACTIVE_TEMPERATURE,22_week_ndvi,23_week_ndvi,24_week_ndvi,25_week_ndvi,26_week_ndvi,27_week_ndvi,28_week_ndvi,29_week_ndvi,30_week_ndvi,31_week_ndvi,32_week_ndvi,33_week_ndvi,34_week_ndvi,35_week_ndvi,36_week_ndvi,37_week_ndvi,38_week_ndvi,39_week_ndvi
1325,203,2016,3,58.61,100.6844,32,3,280.0,173.735813,36.831599,17.326395,16.15702,19.990389,21.764953,21.200779,15.738342,4.03,4.885,4.75,4.01,2.305,60.0,60.0,62.0,56.0,64.0,12739158.0,13135284.0,13319235.0,11696723.0,7812364.0,662.91,790.94,712.54,378.93,564.04,18.200284,16.02558,20.563832,24.936578,24.125591,18.977528,25.818209,19.496388,24.63326,24.331133,21.277534,21.344435,21.994795,19.023883,19.389107,15.535005,9.953778,11.012446,146102.0,137609.0,141452.0,156950.0,149217.0,153156.0,168951.0,135364.0,153701.0,152954.0,115629.0,129194.0,117250.0,124202.0,116010.0,99702.0,74248.0,65858.0,4.02,3.58,4.06,4.72,4.71,4.23,5.46,4.07,4.34,4.59,4.14,3.6,3.59,2.8,2.86,1.92,1.53,1.33,62.0,59.0,80.0,65.0,70.0,66.0,53.0,76.0,63.0,57.0,70.0,72.0,66.0,60.0,52.0,59.0,69.0,75.0,0.0,6.8,24.59,0.3,2.69,10.59,0.0,41.3,0.0,36.6,44.58,6.1,4.0,3.0,0.0,0.0,0.2,1.0,127.874481,106.672069,145.758199,173.191591,166.786003,134.793798,176.362026,140.408399,169.579444,166.837085,138.096271,146.429443,154.169686,139.67415,138.032814,114.329193,34.666376,78.618821,0.542182,0.589273,0.644,0.741,0.741933,0.742867,0.697,0.697,0.697,0.689,0.665571,0.6265,0.567,0.504,0.4155,0.411,0.354889,0.348
1324,203,2016,3,58.61,100.6844,32,1,280.0,173.735813,36.831599,17.326395,16.15702,19.990389,21.764953,21.200779,15.738342,4.03,4.885,4.75,4.01,2.305,60.0,60.0,62.0,56.0,64.0,12739158.0,13135284.0,13319235.0,11696723.0,7812364.0,662.91,790.94,712.54,378.93,564.04,18.200284,16.02558,20.563832,24.936578,24.125591,18.977528,25.818209,19.496388,24.63326,24.331133,21.277534,21.344435,21.994795,19.023883,19.389107,15.535005,9.953778,11.012446,146102.0,137609.0,141452.0,156950.0,149217.0,153156.0,168951.0,135364.0,153701.0,152954.0,115629.0,129194.0,117250.0,124202.0,116010.0,99702.0,74248.0,65858.0,4.02,3.58,4.06,4.72,4.71,4.23,5.46,4.07,4.34,4.59,4.14,3.6,3.59,2.8,2.86,1.92,1.53,1.33,62.0,59.0,80.0,65.0,70.0,66.0,53.0,76.0,63.0,57.0,70.0,72.0,66.0,60.0,52.0,59.0,69.0,75.0,0.0,6.8,24.59,0.3,2.69,10.59,0.0,41.3,0.0,36.6,44.58,6.1,4.0,3.0,0.0,0.0,0.2,1.0,127.874481,106.672069,145.758199,173.191591,166.786003,134.793798,176.362026,140.408399,169.579444,166.837085,138.096271,146.429443,154.169686,139.67415,138.032814,114.329193,34.666376,78.618821,0.542182,0.589273,0.644,0.741,0.741933,0.742867,0.697,0.697,0.697,0.689,0.665571,0.6265,0.567,0.504,0.4155,0.411,0.354889,0.348
5088,217,2018,6,61.471,129.365327,19,4,270.0,170.616391,54.224594,68.028339,15.912748,19.87456,21.582069,20.92,15.504203,3.68,4.27,4.3,3.72,2.26,61.0,63.0,62.0,58.0,66.0,10616051.0,10939453.0,11124660.0,9757864.0,6729291.0,580.86,1003.52,655.64,471.77,613.36,18.61501,20.422678,21.883744,23.024404,20.18769,20.949318,22.09591,21.754855,22.258841,23.713467,21.491939,23.982349,21.581273,22.072297,19.684869,18.646973,17.760935,9.270489,148291.0,140600.0,132239.0,131323.0,121447.0,133400.0,124433.0,111755.0,110914.0,118891.0,125402.0,121322.0,124495.0,112238.0,84392.0,75502.0,89007.0,59708.0,4.28,4.16,4.34,4.46,3.76,4.04,4.01,3.45,3.52,3.97,3.66,4.05,3.64,3.54,2.68,2.05,2.18,1.33,36.0,55.0,74.0,56.0,74.0,60.0,66.0,78.0,80.0,60.0,51.0,55.0,45.0,46.0,51.0,75.0,62.0,76.0,0.69,10.0,24.9,9.39,10.0,1.0,22.0,5.7,43.28,6.0,0.0,0.5,0.0,0.0,28.4,19.4,4.0,17.0,134.740213,135.675776,152.359354,148.080981,138.502336,139.07881,154.210581,149.428935,158.112453,167.597508,152.482377,166.073386,150.32046,154.281321,136.235696,128.520602,118.261425,37.218771,0.534,0.53,0.643,0.692,0.697,0.7262,0.729333,0.729722,0.73,0.717,0.653,0.608,0.483,0.401667,0.32175,0.299,0.282,0.267
5087,217,2018,6,61.471,129.365327,19,2,270.0,170.616391,54.224594,68.028339,15.912748,19.87456,21.582069,20.92,15.504203,3.68,4.27,4.3,3.72,2.26,61.0,63.0,62.0,58.0,66.0,10616051.0,10939453.0,11124660.0,9757864.0,6729291.0,580.86,1003.52,655.64,471.77,613.36,18.61501,20.422678,21.883744,23.024404,20.18769,20.949318,22.09591,21.754855,22.258841,23.713467,21.491939,23.982349,21.581273,22.072297,19.684869,18.646973,17.760935,9.270489,148291.0,140600.0,132239.0,131323.0,121447.0,133400.0,124433.0,111755.0,110914.0,118891.0,125402.0,121322.0,124495.0,112238.0,84392.0,75502.0,89007.0,59708.0,4.28,4.16,4.34,4.46,3.76,4.04,4.01,3.45,3.52,3.97,3.66,4.05,3.64,3.54,2.68,2.05,2.18,1.33,36.0,55.0,74.0,56.0,74.0,60.0,66.0,78.0,80.0,60.0,51.0,55.0,45.0,46.0,51.0,75.0,62.0,76.0,0.69,10.0,24.9,9.39,10.0,1.0,22.0,5.7,43.28,6.0,0.0,0.5,0.0,0.0,28.4,19.4,4.0,17.0,134.740213,135.675776,152.359354,148.080981,138.502336,139.07881,154.210581,149.428935,158.112453,167.597508,152.482377,166.073386,150.32046,154.281321,136.235696,128.520602,118.261425,37.218771,0.534,0.53,0.643,0.692,0.697,0.7262,0.729333,0.729722,0.73,0.717,0.653,0.608,0.483,0.401667,0.32175,0.299,0.282,0.267
5090,219,2018,6,162.252,144.030545,19,2,,134.320967,54.90442,69.77122,15.912748,19.87456,21.582069,20.92,15.504203,3.68,4.27,4.3,3.72,2.26,61.0,63.0,62.0,58.0,66.0,10616051.0,10939453.0,11124660.0,9757864.0,6729291.0,580.86,1003.52,655.64,471.77,613.36,18.61501,20.422678,21.883744,23.024404,20.18769,20.949318,22.09591,21.754855,22.258841,23.713467,21.491939,23.982349,21.581273,22.072297,19.684869,18.646973,17.760935,9.270489,148291.0,140600.0,132239.0,131323.0,121447.0,133400.0,124433.0,111755.0,110914.0,118891.0,125402.0,121322.0,124495.0,112238.0,84392.0,75502.0,89007.0,59708.0,4.28,4.16,4.34,4.46,3.76,4.04,4.01,3.45,3.52,3.97,3.66,4.05,3.64,3.54,2.68,2.05,2.18,1.33,36.0,55.0,74.0,56.0,74.0,60.0,66.0,78.0,80.0,60.0,51.0,55.0,45.0,46.0,51.0,75.0,62.0,76.0,0.69,10.0,24.9,9.39,10.0,1.0,22.0,5.7,43.28,6.0,0.0,0.5,0.0,0.0,28.4,19.4,4.0,17.0,134.740213,135.675776,152.359354,148.080981,138.502336,139.07881,154.210581,149.428935,158.112453,167.597508,152.482377,166.073386,150.32046,154.281321,136.235696,128.520602,118.261425,37.218771,0.56,0.705,0.758,0.770286,0.7625,0.767,0.766294,0.766706,0.766,0.742,0.68725,0.6375,0.517,0.3895,0.32,0.286,0.276,0.267
5089,219,2018,6,162.252,144.030545,19,1,,134.320967,54.90442,69.77122,15.912748,19.87456,21.582069,20.92,15.504203,3.68,4.27,4.3,3.72,2.26,61.0,63.0,62.0,58.0,66.0,10616051.0,10939453.0,11124660.0,9757864.0,6729291.0,580.86,1003.52,655.64,471.77,613.36,18.61501,20.422678,21.883744,23.024404,20.18769,20.949318,22.09591,21.754855,22.258841,23.713467,21.491939,23.982349,21.581273,22.072297,19.684869,18.646973,17.760935,9.270489,148291.0,140600.0,132239.0,131323.0,121447.0,133400.0,124433.0,111755.0,110914.0,118891.0,125402.0,121322.0,124495.0,112238.0,84392.0,75502.0,89007.0,59708.0,4.28,4.16,4.34,4.46,3.76,4.04,4.01,3.45,3.52,3.97,3.66,4.05,3.64,3.54,2.68,2.05,2.18,1.33,36.0,55.0,74.0,56.0,74.0,60.0,66.0,78.0,80.0,60.0,51.0,55.0,45.0,46.0,51.0,75.0,62.0,76.0,0.69,10.0,24.9,9.39,10.0,1.0,22.0,5.7,43.28,6.0,0.0,0.5,0.0,0.0,28.4,19.4,4.0,17.0,134.740213,135.675776,152.359354,148.080981,138.502336,139.07881,154.210581,149.428935,158.112453,167.597508,152.482377,166.073386,150.32046,154.281321,136.235696,128.520602,118.261425,37.218771,0.56,0.705,0.758,0.770286,0.7625,0.767,0.766294,0.766706,0.766,0.742,0.68725,0.6375,0.517,0.3895,0.32,0.286,0.276,0.267


In [8]:
# Let's look at Field 1147, noted as the field with the max value_counts() value.
df_train.loc[df_train['Field'] == 1147]

Unnamed: 0,Field,Year,Cluster,Area,Yield,Geozone,Predecessor,FAO,N_kg/ga,P_kg/ga,K_kg/ga,5_YEARLY_median_AVERAGE_TEMPERATURE,6_YEARLY_median_AVERAGE_TEMPERATURE,7_YEARLY_median_AVERAGE_TEMPERATURE,8_YEARLY_median_AVERAGE_TEMPERATURE,9_YEARLY_median_AVERAGE_TEMPERATURE,5_YEARLY_median_EVAPORATION,6_YEARLY_median_EVAPORATION,7_YEARLY_median_EVAPORATION,8_YEARLY_median_EVAPORATION,9_YEARLY_median_EVAPORATION,5_YEARLY_median_HUMIDITY(%),6_YEARLY_median_HUMIDITY(%),7_YEARLY_median_HUMIDITY(%),8_YEARLY_median_HUMIDITY(%),9_YEARLY_median_HUMIDITY(%),5_YEARLY_sum_CALCULATED_RADIATION,6_YEARLY_sum_CALCULATED_RADIATION,7_YEARLY_sum_CALCULATED_RADIATION,8_YEARLY_sum_CALCULATED_RADIATION,9_YEARLY_sum_CALCULATED_RADIATION,5_YEARLY_sum_RAINFALL,6_YEARLY_sum_RAINFALL,7_YEARLY_sum_RAINFALL,8_YEARLY_sum_RAINFALL,9_YEARLY_sum_RAINFALL,22_WEEK_AVERAGE_TEMPERATURE,23_WEEK_AVERAGE_TEMPERATURE,24_WEEK_AVERAGE_TEMPERATURE,25_WEEK_AVERAGE_TEMPERATURE,26_WEEK_AVERAGE_TEMPERATURE,27_WEEK_AVERAGE_TEMPERATURE,28_WEEK_AVERAGE_TEMPERATURE,29_WEEK_AVERAGE_TEMPERATURE,30_WEEK_AVERAGE_TEMPERATURE,31_WEEK_AVERAGE_TEMPERATURE,32_WEEK_AVERAGE_TEMPERATURE,33_WEEK_AVERAGE_TEMPERATURE,34_WEEK_AVERAGE_TEMPERATURE,35_WEEK_AVERAGE_TEMPERATURE,36_WEEK_AVERAGE_TEMPERATURE,37_WEEK_AVERAGE_TEMPERATURE,38_WEEK_AVERAGE_TEMPERATURE,39_WEEK_AVERAGE_TEMPERATURE,22_WEEK_CALCULATED_RADIATION,23_WEEK_CALCULATED_RADIATION,24_WEEK_CALCULATED_RADIATION,25_WEEK_CALCULATED_RADIATION,26_WEEK_CALCULATED_RADIATION,27_WEEK_CALCULATED_RADIATION,28_WEEK_CALCULATED_RADIATION,29_WEEK_CALCULATED_RADIATION,30_WEEK_CALCULATED_RADIATION,31_WEEK_CALCULATED_RADIATION,32_WEEK_CALCULATED_RADIATION,33_WEEK_CALCULATED_RADIATION,34_WEEK_CALCULATED_RADIATION,35_WEEK_CALCULATED_RADIATION,36_WEEK_CALCULATED_RADIATION,37_WEEK_CALCULATED_RADIATION,38_WEEK_CALCULATED_RADIATION,39_WEEK_CALCULATED_RADIATION,22_WEEK_EVAPORATION,23_WEEK_EVAPORATION,24_WEEK_EVAPORATION,25_WEEK_EVAPORATION,26_WEEK_EVAPORATION,27_WEEK_EVAPORATION,28_WEEK_EVAPORATION,29_WE
EK_EVAPORATION,30_WEEK_EVAPORATION,31_WEEK_EVAPORATION,32_WEEK_EVAPORATION,33_WEEK_EVAPORATION,34_WEEK_EVAPORATION,35_WEEK_EVAPORATION,36_WEEK_EVAPORATION,37_WEEK_EVAPORATION,38_WEEK_EVAPORATION,39_WEEK_EVAPORATION,22_WEEK_HUMIDITY(%),23_WEEK_HUMIDITY(%),24_WEEK_HUMIDITY(%),25_WEEK_HUMIDITY(%),26_WEEK_HUMIDITY(%),27_WEEK_HUMIDITY(%),28_WEEK_HUMIDITY(%),29_WEEK_HUMIDITY(%),30_WEEK_HUMIDITY(%),31_WEEK_HUMIDITY(%),32_WEEK_HUMIDITY(%),33_WEEK_HUMIDITY(%),34_WEEK_HUMIDITY(%),35_WEEK_HUMIDITY(%),36_WEEK_HUMIDITY(%),37_WEEK_HUMIDITY(%),38_WEEK_HUMIDITY(%),39_WEEK_HUMIDITY(%),22_WEEK_RAINFALL,23_WEEK_RAINFALL,24_WEEK_RAINFALL,25_WEEK_RAINFALL,26_WEEK_RAINFALL,27_WEEK_RAINFALL,28_WEEK_RAINFALL,29_WEEK_RAINFALL,30_WEEK_RAINFALL,31_WEEK_RAINFALL,32_WEEK_RAINFALL,33_WEEK_RAINFALL,34_WEEK_RAINFALL,35_WEEK_RAINFALL,36_WEEK_RAINFALL,37_WEEK_RAINFALL,38_WEEK_RAINFALL,39_WEEK_RAINFALL,22_WEEK_SUM_ACTIVE_TEMPERATURE,23_WEEK_SUM_ACTIVE_TEMPERATURE,24_WEEK_SUM_ACTIVE_TEMPERATURE,25_WEEK_SUM_ACTIVE_TEMPERATURE,26_WEEK_SUM_ACTIVE_TEMPERATURE,27_WEEK_SUM_ACTIVE_TEMPERATURE,28_WEEK_SUM_ACTIVE_TEMPERATURE,29_WEEK_SUM_ACTIVE_TEMPERATURE,30_WEEK_SUM_ACTIVE_TEMPERATURE,31_WEEK_SUM_ACTIVE_TEMPERATURE,32_WEEK_SUM_ACTIVE_TEMPERATURE,33_WEEK_SUM_ACTIVE_TEMPERATURE,34_WEEK_SUM_ACTIVE_TEMPERATURE,35_WEEK_SUM_ACTIVE_TEMPERATURE,36_WEEK_SUM_ACTIVE_TEMPERATURE,37_WEEK_SUM_ACTIVE_TEMPERATURE,38_WEEK_SUM_ACTIVE_TEMPERATURE,39_WEEK_SUM_ACTIVE_TEMPERATURE,22_week_ndvi,23_week_ndvi,24_week_ndvi,25_week_ndvi,26_week_ndvi,27_week_ndvi,28_week_ndvi,29_week_ndvi,30_week_ndvi,31_week_ndvi,32_week_ndvi,33_week_ndvi,34_week_ndvi,35_week_ndvi,36_week_ndvi,37_week_ndvi,38_week_ndvi,39_week_ndvi
2172,1147,2018,4,110.68,162.553126,6,1,350.0,129.173338,53.052123,72.680882,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.398816,19.73841,22.013152,23.013775,20.81441,20.742769,22.038673,21.753278,22.389772,23.599903,21.224658,23.85252,21.152412,21.970893,20.319815,18.611327,17.611912,9.278459,140267.0,138001.0,123105.0,125517.0,116995.0,128258.0,121778.0,101332.0,110347.0,115294.0,120703.0,114259.0,120537.0,108685.0,84269.0,76851.0,87150.0,57776.0,4.04,4.29,3.99,4.29,3.7,4.11,3.82,3.1,3.7,3.84,3.5,3.66,3.48,3.11,3.41,1.99,1.95,1.26,41.0,49.0,71.0,55.0,72.0,61.0,67.0,81.0,81.0,64.0,58.0,60.0,47.0,57.0,39.0,73.0,66.0,79.0,7.0,2.0,37.0,7.0,36.0,1.0,5.19,28.7,35.0,6.0,0.0,0.0,2.0,0.0,4.0,10.0,3.0,18.3,132.531141,133.21404,151.827233,148.79563,139.61023,138.422233,154.15638,149.0057,159.188343,167.324007,150.754884,164.825092,147.424944,153.4833,140.326465,128.792912,117.695369,36.535368,0.465,0.649,0.734,0.788714,0.7915,0.779,0.773167,0.769,0.747,0.745,0.7,0.649,0.517,0.393,0.342,0.305,0.275111,0.253
2173,1147,2018,4,110.68,162.553126,6,4,350.0,129.173338,53.052123,72.680882,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.398816,19.73841,22.013152,23.013775,20.81441,20.742769,22.038673,21.753278,22.389772,23.599903,21.224658,23.85252,21.152412,21.970893,20.319815,18.611327,17.611912,9.278459,140267.0,138001.0,123105.0,125517.0,116995.0,128258.0,121778.0,101332.0,110347.0,115294.0,120703.0,114259.0,120537.0,108685.0,84269.0,76851.0,87150.0,57776.0,4.04,4.29,3.99,4.29,3.7,4.11,3.82,3.1,3.7,3.84,3.5,3.66,3.48,3.11,3.41,1.99,1.95,1.26,41.0,49.0,71.0,55.0,72.0,61.0,67.0,81.0,81.0,64.0,58.0,60.0,47.0,57.0,39.0,73.0,66.0,79.0,7.0,2.0,37.0,7.0,36.0,1.0,5.19,28.7,35.0,6.0,0.0,0.0,2.0,0.0,4.0,10.0,3.0,18.3,132.531141,133.21404,151.827233,148.79563,139.61023,138.422233,154.15638,149.0057,159.188343,167.324007,150.754884,164.825092,147.424944,153.4833,140.326465,128.792912,117.695369,36.535368,0.465,0.649,0.734,0.788714,0.7915,0.779,0.773167,0.769,0.747,0.745,0.7,0.649,0.517,0.393,0.342,0.305,0.275111,0.253
2174,1147,2018,4,110.68,162.553126,6,2,350.0,129.173338,53.052123,72.680882,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.398816,19.73841,22.013152,23.013775,20.81441,20.742769,22.038673,21.753278,22.389772,23.599903,21.224658,23.85252,21.152412,21.970893,20.319815,18.611327,17.611912,9.278459,140267.0,138001.0,123105.0,125517.0,116995.0,128258.0,121778.0,101332.0,110347.0,115294.0,120703.0,114259.0,120537.0,108685.0,84269.0,76851.0,87150.0,57776.0,4.04,4.29,3.99,4.29,3.7,4.11,3.82,3.1,3.7,3.84,3.5,3.66,3.48,3.11,3.41,1.99,1.95,1.26,41.0,49.0,71.0,55.0,72.0,61.0,67.0,81.0,81.0,64.0,58.0,60.0,47.0,57.0,39.0,73.0,66.0,79.0,7.0,2.0,37.0,7.0,36.0,1.0,5.19,28.7,35.0,6.0,0.0,0.0,2.0,0.0,4.0,10.0,3.0,18.3,132.531141,133.21404,151.827233,148.79563,139.61023,138.422233,154.15638,149.0057,159.188343,167.324007,150.754884,164.825092,147.424944,153.4833,140.326465,128.792912,117.695369,36.535368,0.465,0.649,0.734,0.788714,0.7915,0.779,0.773167,0.769,0.747,0.745,0.7,0.649,0.517,0.393,0.342,0.305,0.275111,0.253
2175,1147,2019,4,103.338,93.73187,6,0,330.0,183.576226,52.030105,49.539956,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,21.48,21.85,24.51,24.84,22.71,20.78,16.95,18.7,20.69,17.97,20.18,20.46,22.74,21.42,20.0,18.92,11.68,12.13,124876.0,120658.0,133679.0,141756.0,127930.0,137362.0,122113.0,131255.0,125629.0,110211.0,126160.0,97173.0,118307.0,110190.0,90138.0,97721.0,67780.0,63025.0,5.32,5.18,5.81,6.04,5.63,5.6,4.48,4.81,4.87,4.27,4.85,4.34,4.97,4.89,4.15,4.72,2.61,2.38,60.75,62.35,54.89,51.7,56.73,50.06,68.42,64.28,68.03,63.94,60.23,62.99,52.75,48.22,52.14,35.35,69.24,66.68,32.0,18.2,0.0,0.0,10.0,1.0,13.9,7.0,4.8,4.0,16.0,0.0,0.0,0.0,0.0,0.0,2.7,20.6,149.29,152.25,172.59,171.38,147.77,147.31,119.85,137.03,148.75,139.26,138.42,144.28,157.83,147.12,136.99,126.63,67.27,62.28,0.622,0.698,0.755,0.7555,0.767889,0.767222,0.749667,0.7254,0.698,0.65625,0.519,0.39,0.328,0.28525,0.268,0.271,0.262,0.242
2176,1147,2016,4,82.49,134.4116,6,0,,177.765499,34.76028,-8.55905,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.373738,15.791048,20.634631,24.781814,24.024465,18.626593,25.129029,18.996274,24.230913,23.982832,20.387675,20.420302,21.334378,19.039905,19.230144,15.445552,9.991187,10.677506,118798.0,110848.0,114683.0,124738.0,122835.0,120058.0,135889.0,105655.0,128073.0,120869.0,103250.0,99700.0,96204.0,101646.0,99536.0,86806.0,63794.0,56772.0,3.57,3.24,3.6,4.13,4.16,3.47,4.83,3.74,3.88,3.73,3.33,2.83,2.88,2.52,2.46,1.94,1.43,1.35,66.0,62.0,79.0,69.0,73.0,67.0,57.0,70.0,66.0,61.0,66.0,72.0,66.0,62.0,55.0,61.0,73.0,74.0,7.0,25.0,21.59,0.69,22.39,1.0,0.0,12.0,0.0,9.0,18.3,26.0,7.0,0.3,0.0,0.0,7.0,0.4,128.035747,106.046141,143.882745,172.267928,166.477771,132.194296,173.236402,137.86792,169.057771,163.555485,132.478836,142.452373,151.729211,137.745429,137.49298,114.462677,34.5949,90.045493,0.472,0.5304,0.611,0.75375,0.786667,0.773571,0.773333,0.776333,0.774,0.688,0.6548,0.598,0.508,0.417,0.3425,0.317,0.294923,0.276
2177,1147,2016,4,82.49,134.4116,6,4,,177.765499,34.76028,-8.55905,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.373738,15.791048,20.634631,24.781814,24.024465,18.626593,25.129029,18.996274,24.230913,23.982832,20.387675,20.420302,21.334378,19.039905,19.230144,15.445552,9.991187,10.677506,118798.0,110848.0,114683.0,124738.0,122835.0,120058.0,135889.0,105655.0,128073.0,120869.0,103250.0,99700.0,96204.0,101646.0,99536.0,86806.0,63794.0,56772.0,3.57,3.24,3.6,4.13,4.16,3.47,4.83,3.74,3.88,3.73,3.33,2.83,2.88,2.52,2.46,1.94,1.43,1.35,66.0,62.0,79.0,69.0,73.0,67.0,57.0,70.0,66.0,61.0,66.0,72.0,66.0,62.0,55.0,61.0,73.0,74.0,7.0,25.0,21.59,0.69,22.39,1.0,0.0,12.0,0.0,9.0,18.3,26.0,7.0,0.3,0.0,0.0,7.0,0.4,128.035747,106.046141,143.882745,172.267928,166.477771,132.194296,173.236402,137.86792,169.057771,163.555485,132.478836,142.452373,151.729211,137.745429,137.49298,114.462677,34.5949,90.045493,0.472,0.5304,0.611,0.75375,0.786667,0.773571,0.773333,0.776333,0.774,0.688,0.6548,0.598,0.508,0.417,0.3425,0.317,0.294923,0.276
2179,1147,2016,4,82.49,134.4116,6,1,,177.765499,34.76028,-8.55905,15.90439,19.762328,21.516067,20.766404,15.407706,3.61,4.155,4.11,3.5,2.085,62.0,63.0,65.0,61.0,69.0,10260465.0,10571115.0,10666761.0,9359488.0,6306735.0,758.13,651.31,739.61,537.95,528.38,18.373738,15.791048,20.634631,24.781814,24.024465,18.626593,25.129029,18.996274,24.230913,23.982832,20.387675,20.420302,21.334378,19.039905,19.230144,15.445552,9.991187,10.677506,118798.0,110848.0,114683.0,124738.0,122835.0,120058.0,135889.0,105655.0,128073.0,120869.0,103250.0,99700.0,96204.0,101646.0,99536.0,86806.0,63794.0,56772.0,3.57,3.24,3.6,4.13,4.16,3.47,4.83,3.74,3.88,3.73,3.33,2.83,2.88,2.52,2.46,1.94,1.43,1.35,66.0,62.0,79.0,69.0,73.0,67.0,57.0,70.0,66.0,61.0,66.0,72.0,66.0,62.0,55.0,61.0,73.0,74.0,7.0,25.0,21.59,0.69,22.39,1.0,0.0,12.0,0.0,9.0,18.3,26.0,7.0,0.3,0.0,0.0,7.0,0.4,128.035747,106.046141,143.882745,172.267928,166.477771,132.194296,173.236402,137.86792,169.057771,163.555485,132.478836,142.452373,151.729211,137.745429,137.49298,114.462677,34.5949,90.045493,0.472,0.5304,0.611,0.75375,0.786667,0.773571,0.773333,0.776333,0.774,0.688,0.6548,0.598,0.508,0.417,0.3425,0.317,0.294923,0.276


In [9]:
# Same check on the test dataset: rows sharing their (Field, Year) key with at
# least one other row. Sorting on both key columns keeps every duplicate group
# together in the preview.
(
    df_test[df_test.duplicated(subset=['Field', 'Year'], keep=False)]
    .sort_values(by=['Field', 'Year'])
    .head(6)
)

Unnamed: 0,Field,Year,Cluster,Area,Yield,Geozone,Predecessor,FAO,N_kg/ga,P_kg/ga,K_kg/ga,5_YEARLY_median_AVERAGE_TEMPERATURE,6_YEARLY_median_AVERAGE_TEMPERATURE,7_YEARLY_median_AVERAGE_TEMPERATURE,8_YEARLY_median_AVERAGE_TEMPERATURE,9_YEARLY_median_AVERAGE_TEMPERATURE,5_YEARLY_median_EVAPORATION,6_YEARLY_median_EVAPORATION,7_YEARLY_median_EVAPORATION,8_YEARLY_median_EVAPORATION,9_YEARLY_median_EVAPORATION,5_YEARLY_median_HUMIDITY(%),6_YEARLY_median_HUMIDITY(%),7_YEARLY_median_HUMIDITY(%),8_YEARLY_median_HUMIDITY(%),9_YEARLY_median_HUMIDITY(%),5_YEARLY_sum_CALCULATED_RADIATION,6_YEARLY_sum_CALCULATED_RADIATION,7_YEARLY_sum_CALCULATED_RADIATION,8_YEARLY_sum_CALCULATED_RADIATION,9_YEARLY_sum_CALCULATED_RADIATION,5_YEARLY_sum_RAINFALL,6_YEARLY_sum_RAINFALL,7_YEARLY_sum_RAINFALL,8_YEARLY_sum_RAINFALL,9_YEARLY_sum_RAINFALL,22_WEEK_AVERAGE_TEMPERATURE,23_WEEK_AVERAGE_TEMPERATURE,24_WEEK_AVERAGE_TEMPERATURE,25_WEEK_AVERAGE_TEMPERATURE,26_WEEK_AVERAGE_TEMPERATURE,27_WEEK_AVERAGE_TEMPERATURE,28_WEEK_AVERAGE_TEMPERATURE,29_WEEK_AVERAGE_TEMPERATURE,30_WEEK_AVERAGE_TEMPERATURE,31_WEEK_AVERAGE_TEMPERATURE,32_WEEK_AVERAGE_TEMPERATURE,33_WEEK_AVERAGE_TEMPERATURE,34_WEEK_AVERAGE_TEMPERATURE,35_WEEK_AVERAGE_TEMPERATURE,36_WEEK_AVERAGE_TEMPERATURE,37_WEEK_AVERAGE_TEMPERATURE,38_WEEK_AVERAGE_TEMPERATURE,39_WEEK_AVERAGE_TEMPERATURE,22_WEEK_CALCULATED_RADIATION,23_WEEK_CALCULATED_RADIATION,24_WEEK_CALCULATED_RADIATION,25_WEEK_CALCULATED_RADIATION,26_WEEK_CALCULATED_RADIATION,27_WEEK_CALCULATED_RADIATION,28_WEEK_CALCULATED_RADIATION,29_WEEK_CALCULATED_RADIATION,30_WEEK_CALCULATED_RADIATION,31_WEEK_CALCULATED_RADIATION,32_WEEK_CALCULATED_RADIATION,33_WEEK_CALCULATED_RADIATION,34_WEEK_CALCULATED_RADIATION,35_WEEK_CALCULATED_RADIATION,36_WEEK_CALCULATED_RADIATION,37_WEEK_CALCULATED_RADIATION,38_WEEK_CALCULATED_RADIATION,39_WEEK_CALCULATED_RADIATION,22_WEEK_EVAPORATION,23_WEEK_EVAPORATION,24_WEEK_EVAPORATION,25_WEEK_EVAPORATION,26_WEEK_EVAPORATION,27_WEEK_EVAPORATION,28_WEEK_EVAPORATION,29_WE
EK_EVAPORATION,30_WEEK_EVAPORATION,31_WEEK_EVAPORATION,32_WEEK_EVAPORATION,33_WEEK_EVAPORATION,34_WEEK_EVAPORATION,35_WEEK_EVAPORATION,36_WEEK_EVAPORATION,37_WEEK_EVAPORATION,38_WEEK_EVAPORATION,39_WEEK_EVAPORATION,22_WEEK_HUMIDITY(%),23_WEEK_HUMIDITY(%),24_WEEK_HUMIDITY(%),25_WEEK_HUMIDITY(%),26_WEEK_HUMIDITY(%),27_WEEK_HUMIDITY(%),28_WEEK_HUMIDITY(%),29_WEEK_HUMIDITY(%),30_WEEK_HUMIDITY(%),31_WEEK_HUMIDITY(%),32_WEEK_HUMIDITY(%),33_WEEK_HUMIDITY(%),34_WEEK_HUMIDITY(%),35_WEEK_HUMIDITY(%),36_WEEK_HUMIDITY(%),37_WEEK_HUMIDITY(%),38_WEEK_HUMIDITY(%),39_WEEK_HUMIDITY(%),22_WEEK_RAINFALL,23_WEEK_RAINFALL,24_WEEK_RAINFALL,25_WEEK_RAINFALL,26_WEEK_RAINFALL,27_WEEK_RAINFALL,28_WEEK_RAINFALL,29_WEEK_RAINFALL,30_WEEK_RAINFALL,31_WEEK_RAINFALL,32_WEEK_RAINFALL,33_WEEK_RAINFALL,34_WEEK_RAINFALL,35_WEEK_RAINFALL,36_WEEK_RAINFALL,37_WEEK_RAINFALL,38_WEEK_RAINFALL,39_WEEK_RAINFALL,22_WEEK_SUM_ACTIVE_TEMPERATURE,23_WEEK_SUM_ACTIVE_TEMPERATURE,24_WEEK_SUM_ACTIVE_TEMPERATURE,25_WEEK_SUM_ACTIVE_TEMPERATURE,26_WEEK_SUM_ACTIVE_TEMPERATURE,27_WEEK_SUM_ACTIVE_TEMPERATURE,28_WEEK_SUM_ACTIVE_TEMPERATURE,29_WEEK_SUM_ACTIVE_TEMPERATURE,30_WEEK_SUM_ACTIVE_TEMPERATURE,31_WEEK_SUM_ACTIVE_TEMPERATURE,32_WEEK_SUM_ACTIVE_TEMPERATURE,33_WEEK_SUM_ACTIVE_TEMPERATURE,34_WEEK_SUM_ACTIVE_TEMPERATURE,35_WEEK_SUM_ACTIVE_TEMPERATURE,36_WEEK_SUM_ACTIVE_TEMPERATURE,37_WEEK_SUM_ACTIVE_TEMPERATURE,38_WEEK_SUM_ACTIVE_TEMPERATURE,39_WEEK_SUM_ACTIVE_TEMPERATURE,22_week_ndvi,23_week_ndvi,24_week_ndvi,25_week_ndvi,26_week_ndvi,27_week_ndvi,28_week_ndvi,29_week_ndvi,30_week_ndvi,31_week_ndvi,32_week_ndvi,33_week_ndvi,34_week_ndvi,35_week_ndvi,36_week_ndvi,37_week_ndvi,38_week_ndvi,39_week_ndvi
133,2667,2020,1,104.653,0,14,1,330.0,229.341825,61.405922,85.991897,15.523175,19.241317,21.013416,20.258808,15.103848,3.36,3.905,3.94,3.25,2.03,65.0,68.0,67.0,66.0,70.0,9975065.0,10171596.0,10321714.0,8900176.0,6182548.0,767.92,1192.32,876.93,481.49,580.87,11.67,13.89,22.7,20.75,21.17,23.17,19.0,18.71,20.54,22.34,22.77,19.39,22.02,19.86,19.76,17.48,18.89,15.95,76981.0,120126.0,124440.0,113565.0,118655.0,120649.0,126285.0,125271.0,117024.0,117448.0,122218.0,113236.0,112972.0,96970.0,92576.0,92817.0,86233.0,75862.0,2.12,3.23,4.19,3.58,3.54,4.0,3.39,3.46,3.65,3.75,4.21,3.19,3.48,3.04,2.55,2.41,2.34,2.01,86.0,75.0,62.0,78.0,69.0,66.0,62.0,64.0,64.0,66.0,43.0,58.0,57.0,67.0,66.0,59.0,55.0,60.0,42.0,8.4,17.0,63.1,10.2,16.6,5.4,0.8,5.4,2.1,0.0,3.0,0.0,5.0,4.0,1.0,0.0,15.0,81.61,108.55,157.46,145.36,150.88,159.93,139.82,130.45,143.56,147.28,159.87,136.58,152.76,141.5,144.39,125.84,114.65,116.35,0.286,0.299214,0.349714,0.422,0.5435,0.637,0.664125,0.689,0.682333,0.667,0.645667,0.618,0.604,0.568,0.563,0.479,0.378,0.0
134,2667,2020,1,104.653,0,14,4,330.0,229.341825,61.405922,85.991897,15.523175,19.241317,21.013416,20.258808,15.103848,3.36,3.905,3.94,3.25,2.03,65.0,68.0,67.0,66.0,70.0,9975065.0,10171596.0,10321714.0,8900176.0,6182548.0,767.92,1192.32,876.93,481.49,580.87,11.67,13.89,22.7,20.75,21.17,23.17,19.0,18.71,20.54,22.34,22.77,19.39,22.02,19.86,19.76,17.48,18.89,15.95,76981.0,120126.0,124440.0,113565.0,118655.0,120649.0,126285.0,125271.0,117024.0,117448.0,122218.0,113236.0,112972.0,96970.0,92576.0,92817.0,86233.0,75862.0,2.12,3.23,4.19,3.58,3.54,4.0,3.39,3.46,3.65,3.75,4.21,3.19,3.48,3.04,2.55,2.41,2.34,2.01,86.0,75.0,62.0,78.0,69.0,66.0,62.0,64.0,64.0,66.0,43.0,58.0,57.0,67.0,66.0,59.0,55.0,60.0,42.0,8.4,17.0,63.1,10.2,16.6,5.4,0.8,5.4,2.1,0.0,3.0,0.0,5.0,4.0,1.0,0.0,15.0,81.61,108.55,157.46,145.36,150.88,159.93,139.82,130.45,143.56,147.28,159.87,136.58,152.76,141.5,144.39,125.84,114.65,116.35,0.286,0.299214,0.349714,0.422,0.5435,0.637,0.664125,0.689,0.682333,0.667,0.645667,0.618,0.604,0.568,0.563,0.479,0.378,0.0
169,2708,2020,1,153.379,0,14,1,320.0,206.316535,56.72997,73.195744,15.523175,19.241317,21.013416,20.258808,15.103848,3.36,3.905,3.94,3.25,2.03,65.0,68.0,67.0,66.0,70.0,9975065.0,10171596.0,10321714.0,8900176.0,6182548.0,767.92,1192.32,876.93,481.49,580.87,11.67,13.89,22.7,20.75,21.17,23.17,19.0,18.71,20.54,22.34,22.77,19.39,22.02,19.86,19.76,17.48,18.89,15.95,76981.0,120126.0,124440.0,113565.0,118655.0,120649.0,126285.0,125271.0,117024.0,117448.0,122218.0,113236.0,112972.0,96970.0,92576.0,92817.0,86233.0,75862.0,2.12,3.23,4.19,3.58,3.54,4.0,3.39,3.46,3.65,3.75,4.21,3.19,3.48,3.04,2.55,2.41,2.34,2.01,86.0,75.0,62.0,78.0,69.0,66.0,62.0,64.0,64.0,66.0,43.0,58.0,57.0,67.0,66.0,59.0,55.0,60.0,42.0,8.4,17.0,63.1,10.2,16.6,5.4,0.8,5.4,2.1,0.0,3.0,0.0,5.0,4.0,1.0,0.0,15.0,81.61,108.55,157.46,145.36,150.88,159.93,139.82,130.45,143.56,147.28,159.87,136.58,152.76,141.5,144.39,125.84,114.65,116.35,0.270417,0.306357,0.322857,0.4,0.520571,0.627,0.665182,0.7164,0.732333,0.7,0.684333,0.6755,0.6792,0.656,0.635,0.46,0.33,0.0
170,2708,2020,1,153.379,0,14,4,320.0,206.316535,56.72997,73.195744,15.523175,19.241317,21.013416,20.258808,15.103848,3.36,3.905,3.94,3.25,2.03,65.0,68.0,67.0,66.0,70.0,9975065.0,10171596.0,10321714.0,8900176.0,6182548.0,767.92,1192.32,876.93,481.49,580.87,11.67,13.89,22.7,20.75,21.17,23.17,19.0,18.71,20.54,22.34,22.77,19.39,22.02,19.86,19.76,17.48,18.89,15.95,76981.0,120126.0,124440.0,113565.0,118655.0,120649.0,126285.0,125271.0,117024.0,117448.0,122218.0,113236.0,112972.0,96970.0,92576.0,92817.0,86233.0,75862.0,2.12,3.23,4.19,3.58,3.54,4.0,3.39,3.46,3.65,3.75,4.21,3.19,3.48,3.04,2.55,2.41,2.34,2.01,86.0,75.0,62.0,78.0,69.0,66.0,62.0,64.0,64.0,66.0,43.0,58.0,57.0,67.0,66.0,59.0,55.0,60.0,42.0,8.4,17.0,63.1,10.2,16.6,5.4,0.8,5.4,2.1,0.0,3.0,0.0,5.0,4.0,1.0,0.0,15.0,81.61,108.55,157.46,145.36,150.88,159.93,139.82,130.45,143.56,147.28,159.87,136.58,152.76,141.5,144.39,125.84,114.65,116.35,0.270417,0.306357,0.322857,0.4,0.520571,0.627,0.665182,0.7164,0.732333,0.7,0.684333,0.6755,0.6792,0.656,0.635,0.46,0.33,0.0
916,2835,2020,5,118.515,0,16,0,320.0,124.879281,41.223734,-7.117356,16.365038,20.237856,21.860072,21.48,15.93069,4.11,4.965,4.91,4.1,2.285,58.0,57.0,59.0,50.77,61.0,13856468.0,14356038.0,14601869.0,12809810.0,8507485.0,738.33,678.58,606.64,439.55,477.23,14.4,14.66,24.78,23.09,22.38,24.17,23.28,19.86,22.0,23.96,23.11,18.94,23.1,20.55,24.47,18.34,18.63,17.81,136518.0,167978.0,194603.0,173659.0,176577.0,187171.0,189381.0,152724.0,172294.0,158498.0,169942.0,146315.0,156688.0,144266.0,134571.0,128481.0,101272.0,109337.0,3.35,3.82,6.08,5.59,5.13,5.22,4.95,4.6,4.52,4.96,4.82,3.51,3.86,3.44,3.31,2.77,2.22,1.87,88.0,71.0,44.0,54.0,52.0,48.0,42.0,58.0,49.0,50.0,37.0,58.0,38.0,47.0,8.0,36.0,55.0,36.0,79.8,22.5,7.0,4.3,0.4,6.0,0.1,29.7,0.0,2.5,0.0,4.0,16.3,0.4,0.0,0.0,0.0,0.7,100.21,115.73,170.25,160.85,158.18,172.78,159.05,137.23,150.3,155.65,162.42,134.32,156.41,146.54,164.92,130.2,115.14,119.21,0.26575,0.36,0.5195,0.6435,0.657,0.688,0.7075,0.707,0.696,0.659375,0.578,0.496625,0.3875,0.3005,0.279,0.262,0.219,0.0
917,2835,2020,5,118.515,0,16,4,320.0,124.879281,41.223734,-7.117356,16.365038,20.237856,21.860072,21.48,15.93069,4.11,4.965,4.91,4.1,2.285,58.0,57.0,59.0,50.77,61.0,13856468.0,14356038.0,14601869.0,12809810.0,8507485.0,738.33,678.58,606.64,439.55,477.23,14.4,14.66,24.78,23.09,22.38,24.17,23.28,19.86,22.0,23.96,23.11,18.94,23.1,20.55,24.47,18.34,18.63,17.81,136518.0,167978.0,194603.0,173659.0,176577.0,187171.0,189381.0,152724.0,172294.0,158498.0,169942.0,146315.0,156688.0,144266.0,134571.0,128481.0,101272.0,109337.0,3.35,3.82,6.08,5.59,5.13,5.22,4.95,4.6,4.52,4.96,4.82,3.51,3.86,3.44,3.31,2.77,2.22,1.87,88.0,71.0,44.0,54.0,52.0,48.0,42.0,58.0,49.0,50.0,37.0,58.0,38.0,47.0,8.0,36.0,55.0,36.0,79.8,22.5,7.0,4.3,0.4,6.0,0.1,29.7,0.0,2.5,0.0,4.0,16.3,0.4,0.0,0.0,0.0,0.7,100.21,115.73,170.25,160.85,158.18,172.78,159.05,137.23,150.3,155.65,162.42,134.32,156.41,146.54,164.92,130.2,115.14,119.21,0.26575,0.36,0.5195,0.6435,0.657,0.688,0.7075,0.707,0.696,0.659375,0.578,0.496625,0.3875,0.3005,0.279,0.262,0.219,0.0


## Observations

### Multiple Predecessors
The rows above show that a given crop in a Field may have several Predecessors in the same Year. Each predecessor is recorded as a separate row that repeats the remaining feature values and differs only in the Predecessor column. Additional processing will be required during modeling to handle this case.

### Incomplete Data Across Years
Another observation is that a Field does not necessarily have observations for all 4 years that are present in the dataset.


In [10]:
# Per-column summary of the train set: % missing, % zero, number of unique values, dtype,
# and (numeric columns only) min / quantiles / mean / median / max / std.
basic_details(df_train)

Unnamed: 0,"Missing, %","Zero, %",N unique value,dtype,Min,1%,25%,Mean,Median,75%,995%,Max,Std
Field,0.0,0.02,3684,int64,0.0,26.0,1094.0,2183.63,2179.0,3381.5,4154.0,4180.0,1253.1
Year,0.0,0.0,4,int64,2016.0,2016.0,2017.0,2017.93,2018.0,2019.0,2019.0,2019.0,1.02
Cluster,0.0,41.01,7,int64,0.0,0.0,0.0,1.87,1.0,4.0,6.0,6.0,2.13
Area,0.0,0.0,5149,float64,30.02,30.6779,55.0805,101.2,86.0,123.513,413.4757,730.41,67.61
Yield,0.0,0.0,5401,float64,3.4,31.73485,99.55932,121.4,123.74,146.5387,187.7897,190.38,34.99
Geozone,0.0,2.25,42,int64,0.0,0.0,10.0,20.52,21.0,31.0,41.0,41.0,12.11
Predecessor,0.0,35.43,5,int64,0.0,0.0,0.0,1.31,1.0,2.0,4.0,4.0,1.28
FAO,13.73,0.0,22,float64,180.0,220.0,290.0,320.42,320.0,360.0,440.0,440.0,39.87
N_kg/ga,0.0,0.0,5561,float64,98.31,110.8029,183.6563,204.13,208.52,226.0675,275.4864,279.62,32.9
P_kg/ga,3.92,0.0,5343,float64,13.43,30.39078,45.53731,50.66,50.93,55.77334,87.68536,145.15,9.99


In [11]:
# Same per-column summary for the test set, to compare against the train set output.
basic_details(df_test)

Unnamed: 0,"Missing, %","Zero, %",N unique value,dtype,Min,1%,25%,Mean,Median,75%,995%,Max,Std
Field,0.0,0.05,2187,int64,0.0,21.91,1073.75,2184.67,2226.0,3335.25,4169.045,4180.0,1264.32
Year,0.0,0.0,1,int64,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,2020.0,0.0
Cluster,0.0,49.27,7,int64,0.0,0.0,0.0,1.65,1.0,4.0,6.0,6.0,2.18
Area,0.0,0.0,2176,float64,30.04,30.69238,56.45525,102.65,86.5,126.7722,399.1527,717.31,66.81
Yield,0.0,100.0,1,int64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Geozone,0.0,0.64,41,int64,0.0,1.0,10.0,21.82,23.0,32.0,41.0,42.0,11.98
Predecessor,0.0,40.37,5,int64,0.0,0.0,0.0,1.09,1.0,2.0,4.0,4.0,1.09
FAO,22.86,0.0,11,float64,250.0,280.0,310.0,332.66,350.0,360.0,360.0,390.0,27.79
N_kg/ga,0.0,0.0,2187,float64,98.12,112.0794,184.3272,200.0,205.02,220.4334,261.7134,279.36,28.93
P_kg/ga,10.45,0.0,1958,float64,30.01,30.4025,44.42714,49.63,50.11,55.36709,89.6032,93.91,10.52


# Descriptive Statistics Analysis Report

## Conclusions obtained from the analysis of descriptive statistics for each feature for the train set:

### General Characteristics:
- **Field - Field ID**: No missing values, from 0 to 4180 (3684 unique values).
- **Year - Observation Year**: No missing values, from 2016 to 2019.
- **Cluster - Cluster ID**: No missing values, from 0 to 6 (7 unique values).

### Area and Yield:
- **Area - Field area in hectares**: No missing values, from 30 to 730 (the value jumps from about 413 at the 99.5th percentile to 730 at the maximum; such a sharp increase over the top 0.5% of observations is often an indicator of outliers).
- **Yield - Yield in ton per hectare**: No missing values, from 3.4 to 190.4 (close to normally distributed without outliers).

### Geozone and Predecessor:
- **Geozone - Geozone ID**: No missing values, from 0 to 41.
- **Predecessor - Predecessor ID (type of crop that was grown in a given field in the previous year)**: No missing values, from 0 to 4.

### FAO Index:
- **FAO - Hybrid FAO**: 13.7% missing values, from 180 to 440 (normally distributed without outliers). Hypothesis about the missing values: the FAO index for these hybrids is currently not defined or not assigned. If necessary, replace the NaNs with the average FAO value for the Geozone (hypothesis: hybrids with similar FAO characteristics will be grown in geographical areas with similar climates).

### Fertilizers Applied (N, P, K):
- **'N_kg/ga'**: The amount of applied fertilizers in kg per hectare, Nitrogen, no missing values (nearly normal distribution without outliers).
- **'P_kg/ga'**: The amount of applied fertilizers in kg per hectare, Phosphorus, 4% of missing values (close to normal distribution, with outliers).
- **'K_kg/ga'**: The amount of applied fertilizers in kg per hectare, Potassium, 6% of missing values, scaled from -97 to 197. Need to rescale between 0 and 1 (check test set distribution).

### Weather and Vegetation Indices:
- **AVERAGE_TEMPERATURE**: Yearly median temperature for periods 5-9 (columns `5_YEARLY_median_AVERAGE_TEMPERATURE` to `9_YEARLY_median_AVERAGE_TEMPERATURE`; the numeric prefix is presumably the month number), no missing values, normally distributed from 14.48 to 21.5 degrees.
- **EVAPORATION**: Yearly median evaporation for periods 5-9, no missing values, close to normal distribution.
- **HUMIDITY**: Yearly median humidity for periods 5-9, no missing values, normally distributed.
- **CALCULATED_RADIATION**: Yearly sum of calculated radiation for periods 5-9, no missing values, from 5.9M to 15.4M units (the values are on the order of millions), close to normal distribution.
- **RAINFALL**: Yearly sum of rainfall for periods 5-9, no missing values, from 299 to 1467 units, normally distributed.
- **WEEK_AVERAGE_TEMPERATURE, WEEK_CALCULATED_RADIATION, WEEK_EVAPORATION, WEEK_HUMIDITY**: No missing values, normally distributed.
- **WEEK_RAINFALL**: No missing values, not normal distribution, with outliers (the mean and median are far from each other, there is a large distance between the 99.5 and 100 percentiles).
- **WEEK_SUM_ACTIVE_TEMPERATURE**: No missing values, normally distributed.
- **Week_ndvi**: Value between 0 and 1, no missing values, close to normal distribution.

## Conclusions obtained from the analysis of descriptive statistics for the test set:

- **Field - Field ID**: No missing values, from 0 to 4180 (2187 unique values).
- **Year - Observation Year**: No missing values, 2020 year only.
- **Cluster - Cluster ID**: No missing values, from 0 to 6 (7 unique values).
- **Area - Field area in hectares**: No missing values, from 30 to 717 (the value jumps from about 399 at the 99.5th percentile to 717 at the maximum; such a sharp increase over the top 0.5% of observations is often an indicator of outliers).
- **Yield - Yield in ton per hectare**: Needs to be predicted.
- **Geozone - Geozone ID**: No missing values, from 0 to 42 (note: the maximum in the train set is 41, so Geozone 42 does not occur in the train set).
- **Predecessor - Predecessor ID (type of crop that was grown in a given field in the previous year)**: No missing values, from 0 to 4.
- **FAO - Hybrid FAO (the higher the FAO value, the higher the yield of the hybrid)**: 22.9% missing values, from 250 to 390 (close to normal distribution without outliers). Hypothesis about the missing values: the FAO index for these hybrids is currently not defined or not assigned. If necessary, replace the NaNs with the average FAO value for the Geozone (hypothesis: hybrids with similar FAO characteristics will be grown in geographical areas with similar climates).
- **'N_kg/ga'**: The amount of applied fertilizers in kg per hectare, Nitrogen, no missing values (nearly normal distribution without outliers).
- **'P_kg/ga'**: The amount of applied fertilizers in kg per hectare, Phosphorus, 10.5% of missing values (close to normal distribution, with outliers).
- **'K_kg/ga'**: The amount of applied fertilizers in kg per hectare, Potassium, 10.5% of missing values, scaled feature with values from -10 to 196. Need to rescale between 0 and 1 (check test set distribution).
- **AVERAGE_TEMPERATURE**: Yearly median temperature for periods 5-9 (columns `5_YEARLY_median_AVERAGE_TEMPERATURE` to `9_YEARLY_median_AVERAGE_TEMPERATURE`; the numeric prefix is presumably the month number), no missing values, normally distributed.
- **EVAPORATION**: Yearly median evaporation for periods 5-9, no missing values, close to normal distribution.
- **HUMIDITY**: Yearly median humidity for periods 5-9, no missing values, normally distributed.
- **CALCULATED_RADIATION**: Yearly sum of calculated radiation for periods 5-9, no missing values, close to normal distribution.
- **RAINFALL**: Yearly sum of rainfall for periods 5-9, no missing values, normally distributed.
- **WEEK_AVERAGE_TEMPERATURE, WEEK_CALCULATED_RADIATION, WEEK_EVAPORATION, WEEK_HUMIDITY**: No missing values, normally distributed.
- **WEEK_RAINFALL**: No missing values, not normal distribution, with outliers (the mean and median are far from each other, there is a large distance between the 99.5 and 100 percentiles).
- **WEEK_SUM_ACTIVE_TEMPERATURE**: No missing values, normally distributed.
- **week_ndvi**: Value between 0 and 1, no missing values, close to normal distribution (**39_week_ndvi - all zero values**).

## Differences between test and train datasets: 
- **Year - Observation Year**: 2016-2019 vs 2020
- **Geozone - Geozone ID**: Geozone 42 appears only in the test set (train set IDs range from 0 to 41), so the model will see a geozone it was not trained on
- **week_ndvi**: 39_week_ndvi - all zero values


## Notes and Considerations:

### Outlier Investigation:
- **Investigate outliers for the 'Area' feature in both datasets graphically**: Use visual methods such as box plots or scatter plots to better understand the nature of the outliers and determine if they are genuine or due to errors.

### Missing Values in FAO - Hybrid FAO:
- **Replace the NaNs with the average FAO value for the Geozone**: Consider implementing this approach after verifying that it's appropriate for the context and distribution of your data. Ensure that it won't introduce any bias or misrepresentation.

### Feature 'K_kg/ga' Investigation:
- **Further investigate the 'K_kg/ga' feature to determine if rescaling to a 0-1 range is necessary**: Analyze the current scale and distribution of the 'K_kg/ga' values to decide if rescaling would make the data more consistent and interpretable, especially if it's significantly different from other features.

### WEEK_RAINFALL Outlier Investigation:
- **Further investigate the 'WEEK_RAINFALL' feature for outliers**: Given the noted discrepancies between the mean and median and the large distance between the 99.5 and 100 percentiles, a deeper investigation into potential outliers and their impact on the analysis is recommended.

### Imputation Strategy for Missing Values:
- **Determine imputation strategies for missing values**: Aside from the suggested imputation for the FAO feature, consider appropriate strategies for other features with missing values. This might include deletion, imputation with mean/median/mode, or more complex methods like predictive modeling based on other features.
