In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import calendar
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import MinMaxScaler
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Mount Google Drive inside the Colab runtime so project files are reachable
# under /content/gdrive. This cell only works when run in Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')
# Switch the working directory to the project folder; every relative path used
# afterwards in the notebook (the CSV read below, the export at the end)
# resolves against it.
%cd '/content/gdrive/My Drive/Oxford/ML_for_Social_Good'
# Load the input table into `df`, the frame every later cell transforms.
# NOTE(review): judging by the file name this is the cleaned dataset with
# cluster labels already attached -- confirm against the upstream notebook.
df = pd.read_csv("cleaned_fulldf_withclusters.csv")

Mounted at /content/gdrive
/content/gdrive/My Drive/Oxford/ML_for_Social_Good


In [3]:
# Encoding binary variables
# Each entry is (new indicator column, source column, value -> code mapping).
# Values that are not listed in a mapping are passed through unchanged.
binary_specs = [
    ("Harv_machine", "Harv_method", {"hand":0, "machine":1}),
    ("Threshing_machine", "Threshing_method", {"hand":0, "machine":1}),
    ("Stubble_burned", "Stubble_use", {"plowed_in_soil":0, "burned":1}),
]
for encoded_name, source_name, code_map in binary_specs:
    df[encoded_name] = df[source_name].replace(code_map)

# Bool -> int
# Applies to every boolean column in the frame, i.e. the parsed
# [LandPreparationMethod, NursDetFactor, TransDetFactor, OrgFertilizers, CropbasalFerts, FirstTopDressFert] variables
bools = df.select_dtypes(include='bool').columns
df[bools] = df[bools].astype(int)

In [4]:
# Column groups, one list per pre-processing method applied in the cells below.

# Categorical columns that get one-hot encoded.
ohecols = [
    "District", "Block",
    "CropTillageMonth", "CropEstMethod",
    "NursingMonth", "SowTransplantMonth",
    "TransplantingIrrigationSource", "TransplantingIrrigationPowerSource",
    "PCropSolidOrgFertAppMethod",
    "MineralFertAppMethod", "MineralFertAppMethod.1",
    "HarvestMonth", "ThreshingMonth",
    "k2label", "k3label", "k4label",
]

# Numeric columns that get standardized with StandardScaler.
standardscaling = [
    "CultLand",
    "TransplantingIrrigationHours", "TransIrriCost", "TransIrriCost_per_Acre",
    "Ganaura", "Ganaura_capped", "Ganaura_per_Acre",
    "CropOrgFYM", "CropOrgFYM_per_Acre",
    "BasalDAP", "BasalDAP_per_Acre",
    "BasalUrea", "BasalUrea_per_Acre",
    "1tdUrea", "1tdUrea_per_Acre",
    "2tdUrea", "2tdUrea_per_Acre",
    "Harv_hand_rent", "Harv_hand_rent_per_Acre",
    "Residue_length",
    "Acre",
]

# Numeric columns that get rescaled with MinMaxScaler.
# NOTE: SeedlingsPerPit still contains outliers; min-max scaling is sensitive
# to extreme values, so they may distort the scaled values of that column.
minmaxscaling = [
    "CropTillageDepth", "SeedlingsPerPit", "StandingWater", "NoFertilizerAppln",
    "1appDaysUrea", "2appDaysUrea",
    "Residue_perc",
    'Days_bw_Nurs_SowTransp', 'Days_bw_SowTransp_Harv',
    'Days_bw_Harv_Thresh', 'Days_bw_Nurs_Harv',
]

In [5]:
# One-hot encoding
#
# Every column in `ohecols` is expanded into one 0/1 indicator column per
# category, named "<column>_<category>" so the origin of each indicator is clear.
# The source columns themselves are left untouched, so re-running this cell does
# not stack prefixes ("District_District_...") or add a second set of indicators,
# and all indicators are attached in a single concat instead of being inserted
# batch by batch.

ohe = OneHotEncoder(drop=None)

# Stringify element-wise so every value (missing ones included, via their str()
# form) is treated as a category label.
ohe_input = df[ohecols].apply(lambda column: column.map(str))
ohe_matrix = ohe.fit_transform(ohe_input)

# `categories_` holds one array of labels per encoded column, in `ohecols` order,
# matching the column order of the encoded matrix.
ohe_names = [
    str(col) + "_" + str(category)
    for col, categories in zip(ohecols, ohe.categories_)
    for category in categories
]
ohe_frame = pd.DataFrame(ohe_matrix.toarray().astype(int), columns=ohe_names, index=df.index)

# Remove indicators left over from an earlier run before attaching the fresh ones.
df = pd.concat([df.drop(columns=ohe_names, errors="ignore"), ohe_frame], axis=1)

In [6]:
# Standard scaling
# Fit on the selected columns, then overwrite them with the transformed values.
scaler = StandardScaler()
standard_block = df[standardscaling]
df[standardscaling] = scaler.fit(standard_block).transform(standard_block)

In [7]:
# Min-max scaling
# Fit on the selected columns, then overwrite them with the rescaled values.
# `scaler` is rebound here, replacing the scaler object from the previous cell.
scaler = MinMaxScaler()
minmax_block = df[minmaxscaling]
df[minmaxscaling] = scaler.fit(minmax_block).transform(minmax_block)

In [8]:
# Dropping raw variables
# The columns removed here are exactly the one-hot encoded source columns, so
# reuse `ohecols` rather than repeating the list (keeps the two from drifting apart).
# NOTE(review): Harv_method, Threshing_method and Stubble_use also received
# encoded counterparts in the binary-encoding cell but are not dropped here --
# confirm that keeping them in the exported file is intended.
df = df.drop(columns=ohecols)

In [9]:
# Export
# Write the preprocessed frame to a CSV in the current working directory,
# leaving the row index out of the file.
df.to_csv(path_or_buf='preprocessed.csv', index=False)