In [2]:
import scaling
import feature_engineering
import cleaning
import os

import pandas as pd
import numpy as np
import calendar
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")


# Raw string: the Windows path contains backslash sequences (\c, \d, \T) that are
# invalid escapes in a normal string literal. They only work by accident today and
# raise a SyntaxWarning on Python 3.12+.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable DATA_DIR.
train_path = r"T:\crop-yield-estimate\data\Train.csv"

# Build the training frame: project cleaning step first, then the project
# feature-engineering step applied to its result.
df_train = cleaning.clean_data(train_path)
df_train = feature_engineering.get_features(df_train)

In [3]:
# Preview the first rows of the training frame returned by the cleaning and
# feature-engineering steps.
df_train.head()

Unnamed: 0,ID,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,...,Total_Crop_Cycle_Duration,Num_LandPrepMethod,Num_NursDetFactor,Num_TransDetFactor,Num_OrgFertilizers,Num_CropbasalFerts,Num_TopDressFert,Latitude,Longitude,Elevation
0,ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,...,119.0,2,2,5,0,1,1,25.2748,85.4569,63.0
1,ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTrac...,2022-07-18,5,Manual_PuddledRandom,2022-06-20,...,159.0,3,3,5,0,2,1,25.0262,85.4174,67.0
2,ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,...,195.0,2,2,3,2,1,1,24.6696,84.772,114.0
3,ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,...,196.0,2,2,5,1,2,1,24.6696,84.772,114.0
4,ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,...,136.0,2,2,4,0,2,1,25.2748,85.4569,63.0


In [4]:
# Let's see all the columns: remove pandas' cap on how many columns are displayed.
pd.options.display.max_columns = None

In [5]:
# Isolate the columns whose name contains "NursDetFactor_" and put each row's ID
# next to them (rows are aligned on the shared index).
filtered_columns = df_train.filter(like="NursDetFactor_")
df_nurs = filtered_columns.copy()

id_column = df_train[["ID"]]
df_merged = df_nurs.merge(id_column, left_index=True, right_index=True)
df_merged.head()



Unnamed: 0,NursDetFactor_CalendarDate,NursDetFactor_PreMonsoonShowers,NursDetFactor_IrrigWaterAvailability,NursDetFactor_LabourAvailability,NursDetFactor_SeedAvailability,ID
0,True,False,True,False,True,ID_GTFAC7PEVWQ9
1,True,True,True,True,True,ID_TK40ARLSPOKS
2,False,True,True,True,False,ID_1FJY2CRIMLZZ
3,True,True,True,True,True,ID_I3IPXS4DB7NE
4,True,False,True,False,True,ID_4T8YQWXWHB4A


In [6]:
# Month-name columns whose category frequencies are inspected in the cells below.
date_variables = [
    "CropTillageMonth",
    "HarvestMonth",
    "ThreshingMonth",
]


In [7]:
# How many rows fall into each CropTillageMonth category.
df_train["CropTillageMonth"].value_counts()

CropTillageMonth
July      2908
June       860
August     101
May          1
Name: count, dtype: int64

In [8]:
# Display the ID column of the training frame.
df_train["ID"]

0       ID_GTFAC7PEVWQ9
1       ID_TK40ARLSPOKS
2       ID_1FJY2CRIMLZZ
3       ID_I3IPXS4DB7NE
4       ID_4T8YQWXWHB4A
             ...       
3865    ID_7ZZQ6R4XB4FK
3866    ID_PVVDF6LK6FO8
3867    ID_RBYVUPRATVMW
3868    ID_ARE9QWENJNJ2
3869    ID_KEPOQDTCZC6S
Name: ID, Length: 3870, dtype: object

In [9]:
# TODO: DO THIS IN CLEANING PY
# Look up the ID of the row(s) whose CropTillageMonth is "May" — the value counts
# above show that category occurs only once.
is_may_tillage = df_train["CropTillageMonth"] == "May"
df_train.loc[is_may_tillage, "ID"]


379    ID_N2FQH4S194A9
Name: ID, dtype: object

In [10]:
# Print the category frequencies of every column listed in date_variables,
# to spot months that occur only a handful of times.
for col in date_variables:
    month_counts = df_train[col].value_counts()
    print(month_counts)
    print('\n')


CropTillageMonth
July      2908
June       860
August     101
May          1
Name: count, dtype: int64


HarvestMonth
November     2494
October       807
December      554
January         8
September       5
February        1
March           1
Name: count, dtype: int64


ThreshingMonth
December    1370
November    1008
January      683
October      523
February     236
March         50
Name: count, dtype: int64




In [11]:
# TODO: DO THIS IN CLEANING PY
# Recode the rare harvest months (each seen fewer than 10 times in the counts
# above) to one of the frequent months.
harvest_month_recode = {
    'January': 'December',
    'September': 'October',
    'March': 'December',
    'February': 'December',
}
df_train["HarvestMonth"] = df_train["HarvestMonth"].replace(harvest_month_recode)


In [12]:
# Re-print the category frequencies of the date_variables columns to confirm the
# HarvestMonth recoding took effect.
for col in date_variables:
    month_counts = df_train[col].value_counts()
    print(month_counts)
    print('\n')

CropTillageMonth
July      2908
June       860
August     101
May          1
Name: count, dtype: int64


HarvestMonth
November    2494
October      812
December     564
Name: count, dtype: int64


ThreshingMonth
December    1370
November    1008
January      683
October      523
February     236
March         50
Name: count, dtype: int64




In [13]:
# Collect every object-dtype column that has at least one category occurring
# fewer than 20 times.
categorical_columns = df_train.select_dtypes(include=['object']).columns
small_sample_columns = []

for col in categorical_columns:
    value_counts = df_train[col].value_counts()
    small_samples = value_counts.loc[value_counts < 20]
    # Keep the column as soon as any of its categories is under the threshold.
    if len(small_samples) > 0:
        small_sample_columns.append(col)

print("Columns with small samples (under 20):", small_sample_columns, sep="\n")


Columns with small samples (under 20):
['ID', 'Block', 'LandPreparationMethod', 'NursDetFactor', 'TransDetFactor', 'TransplantingIrrigationSource', 'TransplantingIrrigationPowerSource', 'OrgFertilizers', 'PCropSolidOrgFertAppMethod', 'CropbasalFerts', 'MineralFertAppMethod_1', 'FirstTopDressFert', 'TpIrrigationSource_Imputed', 'TpIrrigationPowerSource_Imputed', 'CropTillageMonth', 'CropTillageSeason', 'HarvestSeason']


In [14]:
# For each flagged column, list the categories that occur fewer than 20 times.
for col in small_sample_columns:
    if col == "ID":
        # ID is skipped here: only the other flagged columns are printed.
        continue

    value_counts = df_train[col].value_counts()
    small_samples = value_counts[value_counts < 20]
    print(small_samples)
    print('\n')


Block
Gaya    1
Name: count, dtype: int64


LandPreparationMethod
TractorPlough WetTillagePuddling                                         17
WetTillagePuddling FourWheelTracRotavator TractorPlough                  15
WetTillagePuddling TractorPlough BullockPlough FourWheelTracRotavator    13
TractorPlough WetTillagePuddling BullockPlough                           10
WetTillagePuddling FourWheelTracRotavator TractorPlough BullockPlough     9
WetTillagePuddling BullockPlough TractorPlough FourWheelTracRotavator     8
BullockPlough TractorPlough                                               8
BullockPlough FourWheelTracRotavator TractorPlough WetTillagePuddling     7
WetTillagePuddling Other                                                  7
WetTillagePuddling BullockPlough TractorPlough                            7
Other                                                                     6
TractorPlough FourWheelTracRotavator BullockPlough                        3
BullockPlough TractorP

NursDetFactor has a very low SHAP (Shapley) value, so instead of dropping the rows that belong to its low-count categories, I will drop the column itself.

In [15]:
# Mean Yield_per_Acre for each distinct Num_TopDressFert value.
df_train.groupby("Num_TopDressFert")["Yield_per_Acre"].mean()

Num_TopDressFert
0    1917.114279
1    1971.305057
2    1995.967717
3    1939.666667
Name: Yield_per_Acre, dtype: float64