In [78]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import calendar
from datetime import datetime
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

import warnings
# NOTE: this silences every warning for the rest of the session, including
# pandas' SettingWithCopyWarning and deprecation notices.
warnings.simplefilter("ignore")

In [79]:
from google.colab import drive
drive.mount('/content/gdrive')

# directory
%cd '/content/gdrive/My Drive/Oxford/ML_for_Social_Good'

# import
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

train["Set"] = "train"
test["Set"] = "test"

df = pd.concat([train, test])
print(df.shape)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Oxford/ML_for_Social_Good
(5160, 45)


In [80]:
datetime_cols = ["CropTillageDate","RcNursEstDate","SeedingSowingTransplanting","Harv_date","Threshing_date"]

# Convert each date column to pandas datetimes, one column at a time.
df[datetime_cols] = df[datetime_cols].apply(pd.to_datetime)

In [81]:
# One row has Jamui as district but Gurua as Block, which is a Gaya block -- correcting its District.
# The row is selected by its content rather than by a hard-coded index label, so
# the correction does not depend on how the frame happens to be indexed.
jamui_gurua = (df["District"] == "Jamui") & (df["Block"] == "Gurua")
df.loc[jamui_gurua, "District"] = "Gaya"

In [82]:
# OUTLIERS -- first pass

# SeedlingsPerPit has two extreme outliers (800 seedlings & 442 seedlings) --> replacing with the next max value for now (=22)
df["SeedlingsPerPit"] = df["SeedlingsPerPit"].replace({800: 22, 442: 22})

# Upper caps: any value above the cap is set to the cap.
# The assignment must go through a single df.loc[rows, column] call. The chained
# form df.loc[rows][column] = value writes to a temporary copy and leaves df
# unchanged.
upper_caps = {
    "TransplantingIrrigationHours": 90,
    "TransIrriCost": 2500,  # several extreme outliers (e.g. 6000 rupees for an average sized land)
    "Ganaura": 140,
}
for capped_col, cap in upper_caps.items():
  df.loc[df[capped_col] > cap, capped_col] = cap

# 1appDaysUrea -- replacing extreme outlier with the next max value
df["1appDaysUrea"] = df["1appDaysUrea"].replace(332, 75)

In [83]:
# PER-ACRE COLUMNS
# Each listed quantity is divided by the plot's "Acre" value and stored in a
# new "<name>_per_Acre" column; the original columns are left in place.

list_cols = ["TransIrriCost","Ganaura","CropOrgFYM","BasalDAP","BasalUrea","1tdUrea","2tdUrea","Harv_hand_rent","Yield"]

for source_col in list_cols:
  df[f"{source_col}_per_Acre"] = df[source_col].div(df["Acre"])

In [84]:
# Re-ordering columns and dropping the non-standardized ones
# (the raw quantities that now have a "_per_Acre" counterpart are left out,
# except Yield, which is kept alongside Yield_per_Acre).
ordered_columns = [
    'ID', 'Set', 'District', 'Block', 'LandPreparationMethod', 'CropTillageDate', 'CropTillageDepth', 'CropEstMethod', 'RcNursEstDate',
    'SeedingSowingTransplanting', 'SeedlingsPerPit', 'NursDetFactor', 'TransDetFactor', 'TransplantingIrrigationHours', 'TransplantingIrrigationSource',
    'TransplantingIrrigationPowerSource', 'TransIrriCost_per_Acre', 'StandingWater', 'OrgFertilizers', 'Ganaura_per_Acre', 'CropOrgFYM_per_Acre', 'PCropSolidOrgFertAppMethod',
    'NoFertilizerAppln', 'CropbasalFerts', 'BasalDAP_per_Acre', 'BasalUrea_per_Acre', 'MineralFertAppMethod', 'FirstTopDressFert', '1tdUrea_per_Acre', '1appDaysUrea',
    '2tdUrea_per_Acre', '2appDaysUrea', 'MineralFertAppMethod.1', 'Harv_method', 'Harv_date', 'Harv_hand_rent_per_Acre', 'Threshing_date', 'Threshing_method',
    'Residue_length', 'Residue_perc', 'Stubble_use', 'Acre', 'Yield', 'Yield_per_Acre',
]

# .copy() makes the result an independent frame, so adding columns to it later
# is not treated by pandas as writing to a slice of the previous frame.
df = df[ordered_columns].copy()

In [85]:
# PARSING MESSY CATEGORICAL VARIABLES
#
# Several columns are multi-select: one cell holds several option names in a
# single string. Each option gets its own boolean indicator column named
# "<prefix>_<option>".

def add_option_flags(frame, source, prefix, options, patterns=None):
  """Add one boolean indicator column per option of a multi-select column.

  Parameters
  ----------
  frame : DataFrame, modified in place.
  source : name of the string column that holds the selected options.
  prefix : prefix of the new columns; each one is named "<prefix>_<option>".
  options : option names, in the order the columns should be created.
  patterns : optional {option: regex} overrides for options whose plain name
      is not the right search pattern.

  Returns
  -------
  list of the created column names, in creation order.
  """
  patterns = patterns or {}
  created = []
  for option in options:
    label = prefix + "_" + option
    # na=False: rows with a missing value in `source` get False, not NaN.
    frame[label] = frame[source].str.contains(patterns.get(option, option), regex=True, na=False)
    created.append(label)
  return created


# Either spelling sets the flag. (`"LabourAvailability" or "LaborAvailability"`
# is simply the first string in Python, so passing that expression to
# str.contains never matches the second spelling.)
labour_pattern = {"LabourAvailability": "LabourAvailability|LaborAvailability"}

# "NPK" is a substring of "NPKS"; the lookahead keeps an NPKS entry from also
# setting the NPK flag.
npk_pattern = {"NPK": "NPK(?!S)"}

# Every indicator column created below, in creation order.
cols = []

# 1. LandPreparationMethod
cols += add_option_flags(df, "LandPreparationMethod", "LandPrepMethod",
                         ["TractorPlough","FourWheelTracRotavator","WetTillagePuddling","BullockPlough","Other"])

# 2. NursDetFactor
cols += add_option_flags(df, "NursDetFactor", "NursDetFactor",
                         ["CalendarDate","PreMonsoonShowers","IrrigWaterAvailability","LabourAvailability","SeedAvailability"],
                         labour_pattern)

# 3. TransDetFactor
cols += add_option_flags(df, "TransDetFactor", "TransDetFactor",
                         ["LabourAvailability","CalendarDate","RainArrival","IrrigWaterAvailability","SeedlingAge"],
                         labour_pattern)

# 4. CropbasalFerts -- a missing value is recorded as the explicit option "None"
df["CropbasalFerts"] = df["CropbasalFerts"].fillna("None")
fertilizer_types = ["Urea","DAP","Other","NPK","MoP","NPKS","SSP","None"]
cols += add_option_flags(df, "CropbasalFerts", "CropbasalFerts", fertilizer_types, npk_pattern)

# 5. FirstTopDressFert
fertilizer_types2 = ["Urea","DAP","NPK","NPKS","SSP","Other"]
cols += add_option_flags(df, "FirstTopDressFert", "FirstTopDressFert", fertilizer_types2, npk_pattern)

# 6. OrgFertilizers
orgfertilizers = ["Ganaura","FYM","VermiCompost","Pranamrit","Ghanajeevamrit","Jeevamrit","PoultryManure"]
cols += add_option_flags(df, "OrgFertilizers", "OrgFertilizers", orgfertilizers)

# 7. Missing values were already mapped to False via na=False; make sure of it.
assert not df[cols].isnull().any().any(), "indicator columns must not contain NaN"

In [86]:
# checking: null count of every indicator column (all should be 0).
# `cols` is the list of indicator column names built in the parsing cell.
# The subset is not bound to the name `test`, which holds the raw test split
# loaded at the top of the notebook.
df[cols].isnull().sum()

LandPrepMethod_TractorPlough             0
LandPrepMethod_FourWheelTracRotavator    0
LandPrepMethod_WetTillagePuddling        0
LandPrepMethod_BullockPlough             0
LandPrepMethod_Other                     0
NursDetFactor_CalendarDate               0
NursDetFactor_PreMonsoonShowers          0
NursDetFactor_IrrigWaterAvailability     0
NursDetFactor_LabourAvailability         0
NursDetFactor_SeedAvailability           0
TransDetFactor_LabourAvailability        0
TransDetFactor_CalendarDate              0
TransDetFactor_RainArrival               0
TransDetFactor_IrrigWaterAvailability    0
TransDetFactor_SeedlingAge               0
CropbasalFerts_Urea                      0
CropbasalFerts_DAP                       0
CropbasalFerts_Other                     0
CropbasalFerts_NPK                       0
CropbasalFerts_MoP                       0
CropbasalFerts_NPKS                      0
CropbasalFerts_SSP                       0
CropbasalFerts_None                      0
FirstTopDre

In [87]:
# Missing-value counts of three of the raw multi-select columns, one per line.
for raw_col in ("OrgFertilizers", "FirstTopDressFert", "TransDetFactor"):
  print(df[raw_col].isna().sum())

1750
634
392


In [88]:
# EXPORTING
# Write the combined, cleaned frame to the current working directory; the
# index is not written.
output_path = 'cleaned_fulldf.csv'
df.to_csv(output_path, index=False)