<a href="https://colab.research.google.com/github/rapsoj/crop-yield-estimate/blob/main/01.02-feature-engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#01.02 Feature Engineering
Creating new features to improve predictions for the Digital Green Crop Yield Estimate Challenge.

### Prepare Workspace

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Import data manipulation libraries
import pandas as pd
import numpy as np

In [None]:
# Load files
data_path = '/content/drive/MyDrive/Colab Notebooks/crop-yield-estimate/'
train = pd.read_csv(data_path + 'Train.csv')
test = pd.read_csv(data_path + 'Test.csv')
var_desc = pd.read_csv(data_path + 'VariableDescription.csv')

### Prepare Workspace Locally

In [3]:
import pandas as pd
import numpy as np

In [4]:
data_path = '/crop-yield-estimate/data/'
train = pd.read_csv(data_path + 'Train.csv')
test = pd.read_csv(data_path + 'Test.csv')
var_desc = pd.read_csv(data_path + 'VariableDescription.csv')


In [5]:
train.head()

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
0,ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,...,machine,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600
1,ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTrac...,2022-07-18,5,Manual_PuddledRandom,2022-06-20,...,hand,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600
2,ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,...,hand,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225
3,ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,...,hand,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468
4,ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,...,machine,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550


In [6]:
count = len(train[train['CultLand'] < train['CropCultLand']])
print(count)


0


In [7]:
count = len(train[train['CultLand'] > train['CropCultLand']])
print(count)


1616


In [9]:

# Calculate the correlation between CultLand and Acre columns
corr_cultland = train['CultLand'].corr(train['Acre'])

# Calculate the correlation between CropCultLand and Acre columns
corr_cropcultland = train['CropCultLand'].corr(train['Acre'])

print(corr_cultland)
print(corr_cropcultland)



0.4096040689279083
0.39407007108602965


In [None]:
# How many missing numbers are there?
train.isnull().sum()




ID                                       0
District                                 0
Block                                    0
CultLand                                 0
CropCultLand                             0
LandPreparationMethod                    0
CropTillageDate                          0
CropTillageDepth                         0
CropEstMethod                            0
RcNursEstDate                           83
SeedingSowingTransplanting               0
SeedlingsPerPit                        289
NursDetFactor                          289
TransDetFactor                         289
TransplantingIrrigationHours           193
TransplantingIrrigationSource          115
TransplantingIrrigationPowerSource     503
TransIrriCost                          882
StandingWater                          238
OrgFertilizers                        1335
Ganaura                               2417
CropOrgFYM                            2674
PCropSolidOrgFertAppMethod            1337
NoFertilize

In [None]:
# Calculate percentage of missing values in each column
train.isnull().sum() / len(train) * 100

ID                                     0.000000
District                               0.000000
Block                                  0.000000
CultLand                               0.000000
CropCultLand                           0.000000
LandPreparationMethod                  0.000000
CropTillageDate                        0.000000
CropTillageDepth                       0.000000
CropEstMethod                          0.000000
RcNursEstDate                          2.144703
SeedingSowingTransplanting             0.000000
SeedlingsPerPit                        7.467700
NursDetFactor                          7.467700
TransDetFactor                         7.467700
TransplantingIrrigationHours           4.987080
TransplantingIrrigationSource          2.971576
TransplantingIrrigationPowerSource    12.997416
TransIrriCost                         22.790698
StandingWater                          6.149871
OrgFertilizers                        34.496124
Ganaura                               62

### Perform Feature Engineering

In [None]:
# Create feature for yield per acre 'Yield_per_Acre'
train['Yield_per_Acre'] = train['Yield'] / train['Acre']

In [None]:
# Create feature for past month yield per acre 'Past_Yield_per_Acre'
train['Harv_date'] = pd.to_datetime(train['Harv_date'])
train.sort_values(['District', 'Harv_date'], inplace=True)

# Group the DataFrame by 'District' and calculate the rolling average
train['Past_YpA_Avg'] = train.groupby('District')['Yield_per_Acre'].rolling(
    window = 30).mean().reset_index(0, drop=True)

# Fill NaN values in the 'past_month_avg' column with 0 if needed
train['Past_YpA_Avg'].fillna(0, inplace=True)

In [None]:
# Create feature for days between harvesting and threshing 'Days_Harv_Thresh'
train['Threshing_date'] = pd.to_datetime(train['Threshing_date'])
train['Days_Harv_Thresh'] = (
    train['Threshing_date'] - train['Harv_date']).dt.days