# Import & setup

In [105]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import calendar
from datetime import datetime
pd.set_option('display.max_columns', None)

from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings("ignore")

In [106]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# # directory
# %cd '/content/gdrive/My Drive/Oxford/ML_for_Social_Good'

# # import
# df = pd.read_csv("cleaned_fulldf.csv")
# print(df.shape)

In [107]:
import os 

cwd  = os.getcwd()

df = pd.read_csv(cwd + "/cleaned_fulldf.csv")

In [108]:
datetime_cols = ["CropTillageDate", "RcNursEstDate", "SeedingSowingTransplanting", "Harv_date", "Threshing_date"]
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col])
    df[col] = df[col].values.astype("float64")

In [109]:

# to add: FirstTopDressFert, CropbasalFerts, OrgFertilizers
cdf = df[["ID","CropTillageDepth","CropEstMethod","SeedlingsPerPit","TransplantingIrrigationHours","TransplantingIrrigationSource",
          # one-hot encoded LandPrepMethod
          'LandPrepMethod_TractorPlough','LandPrepMethod_FourWheelTracRotavator','LandPrepMethod_WetTillagePuddling', 'LandPrepMethod_BullockPlough','LandPrepMethod_Other',
          # one-hot encoded CropbasalFerts
          'CropbasalFerts_Urea','CropbasalFerts_DAP', 'CropbasalFerts_Other', 'CropbasalFerts_NPK','CropbasalFerts_MoP', 'CropbasalFerts_NPKS', 'CropbasalFerts_SSP',
          'CropbasalFerts_None',
          # one-hot encoded FirstTopDressFert
          'FirstTopDressFert_Urea','FirstTopDressFert_DAP', 'FirstTopDressFert_NPK','FirstTopDressFert_NPKS', 'FirstTopDressFert_SSP','FirstTopDressFert_Other',
          # one-hot encoded OrgFertilizers
          'OrgFertilizers_Ganaura','OrgFertilizers_FYM', 'OrgFertilizers_VermiCompost','OrgFertilizers_Pranamrit', 'OrgFertilizers_Ghanajeevamrit','OrgFertilizers_Jeevamrit',
          'OrgFertilizers_PoultryManure',
          #--
          "Ganaura_per_Acre","CropOrgFYM_per_Acre","PCropSolidOrgFertAppMethod","NoFertilizerAppln","MineralFertAppMethod","MineralFertAppMethod.1",
          "Harv_method","Threshing_method",#"Yield_per_Acre"
          # Date Columns
          datetime_cols[0], datetime_cols[1], datetime_cols[2], datetime_cols[3], datetime_cols[4],
        ]]

cdf.head()

Unnamed: 0,ID,CropTillageDepth,CropEstMethod,SeedlingsPerPit,TransplantingIrrigationHours,TransplantingIrrigationSource,LandPrepMethod_TractorPlough,LandPrepMethod_FourWheelTracRotavator,LandPrepMethod_WetTillagePuddling,LandPrepMethod_BullockPlough,LandPrepMethod_Other,CropbasalFerts_Urea,CropbasalFerts_DAP,CropbasalFerts_Other,CropbasalFerts_NPK,CropbasalFerts_MoP,CropbasalFerts_NPKS,CropbasalFerts_SSP,CropbasalFerts_None,FirstTopDressFert_Urea,FirstTopDressFert_DAP,FirstTopDressFert_NPK,FirstTopDressFert_NPKS,FirstTopDressFert_SSP,FirstTopDressFert_Other,OrgFertilizers_Ganaura,OrgFertilizers_FYM,OrgFertilizers_VermiCompost,OrgFertilizers_Pranamrit,OrgFertilizers_Ghanajeevamrit,OrgFertilizers_Jeevamrit,OrgFertilizers_PoultryManure,Ganaura_per_Acre,CropOrgFYM_per_Acre,PCropSolidOrgFertAppMethod,NoFertilizerAppln,MineralFertAppMethod,MineralFertAppMethod.1,Harv_method,Threshing_method,CropTillageDate,RcNursEstDate,SeedingSowingTransplanting,Harv_date,Threshing_date
0,ID_GTFAC7PEVWQ9,5,Manual_PuddledRandom,2.0,5.0,Boring,True,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,0.0,0.0,,2,Broadcasting,Broadcasting,machine,machine,1.658275e+18,1.656288e+18,1.658362e+18,1.668557e+18,1.668557e+18
1,ID_TK40ARLSPOKS,5,Manual_PuddledRandom,2.0,5.0,Boring,True,True,True,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,0.0,0.0,,2,Broadcasting,Broadcasting,hand,machine,1.658102e+18,1.655683e+18,1.658275e+18,1.669334e+18,1.67184e+18
2,ID_1FJY2CRIMLZZ,6,Manual_PuddledRandom,2.0,4.0,Boring,True,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,6.75,6.75,SoilApplied,2,SoilApplied,RootApplication,hand,machine,1.656547e+18,1.655683e+18,1.660349e+18,1.670803e+18,1.673395e+18
3,ID_I3IPXS4DB7NE,6,Manual_PuddledRandom,2.0,4.0,Boring,True,True,False,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,4.5,0.0,SoilApplied,2,Broadcasting,Broadcasting,hand,hand,1.655338e+18,1.655424e+18,1.658016e+18,1.669939e+18,1.672272e+18
4,ID_4T8YQWXWHB4A,4,Manual_PuddledRandom,2.0,9.0,Boring,True,False,True,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,0.0,0.0,,2,Broadcasting,Broadcasting,machine,machine,1.658189e+18,1.65577e+18,1.658275e+18,1.669766e+18,1.669939e+18


In [110]:
cdf.columns

Index(['ID', 'CropTillageDepth', 'CropEstMethod', 'SeedlingsPerPit',
       'TransplantingIrrigationHours', 'TransplantingIrrigationSource',
       'LandPrepMethod_TractorPlough', 'LandPrepMethod_FourWheelTracRotavator',
       'LandPrepMethod_WetTillagePuddling', 'LandPrepMethod_BullockPlough',
       'LandPrepMethod_Other', 'CropbasalFerts_Urea', 'CropbasalFerts_DAP',
       'CropbasalFerts_Other', 'CropbasalFerts_NPK', 'CropbasalFerts_MoP',
       'CropbasalFerts_NPKS', 'CropbasalFerts_SSP', 'CropbasalFerts_None',
       'FirstTopDressFert_Urea', 'FirstTopDressFert_DAP',
       'FirstTopDressFert_NPK', 'FirstTopDressFert_NPKS',
       'FirstTopDressFert_SSP', 'FirstTopDressFert_Other',
       'OrgFertilizers_Ganaura', 'OrgFertilizers_FYM',
       'OrgFertilizers_VermiCompost', 'OrgFertilizers_Pranamrit',
       'OrgFertilizers_Ghanajeevamrit', 'OrgFertilizers_Jeevamrit',
       'OrgFertilizers_PoultryManure', 'Ganaura_per_Acre',
       'CropOrgFYM_per_Acre', 'PCropSolidOrgFertAppMet

# Pre-processing

In [111]:
# 1. CATEGORICAL VARIABLES

# Binary variables
cdf["Harv_method"] = cdf["Harv_method"].replace({"hand":0, "machine":1})
cdf["Threshing_method"] = cdf["Threshing_method"].replace({"hand":0, "machine":1})

# Dummies
dummy_cols = ["CropEstMethod","TransplantingIrrigationSource","PCropSolidOrgFertAppMethod","MineralFertAppMethod","MineralFertAppMethod.1"]
cdf = pd.get_dummies(cdf, columns=dummy_cols)

# Bool -> int
bools = cdf.select_dtypes(include='bool').columns
cdf[bools] = cdf[bools].astype(int)

In [112]:
# 2. MISSING DATA

# Ganaura_per_Acre & CropOrgFYM_per_Acre -> replacing with 0
cdf["Ganaura_per_Acre"] = cdf["Ganaura_per_Acre"].fillna(0)
cdf["CropOrgFYM_per_Acre"] = cdf["CropOrgFYM_per_Acre"].fillna(0)

# SeedlingsPerPit -> replacing by median (=2)
cdf["SeedlingsPerPit"] = cdf["SeedlingsPerPit"].fillna(cdf.SeedlingsPerPit.median())

# TransplantingIrrigationHours -> replacing by median (=4)
cdf["TransplantingIrrigationHours"] = cdf["TransplantingIrrigationHours"].fillna(cdf.TransplantingIrrigationHours.median())

In [113]:
# 3. NUMERICAL VARIABLES

num_cols = ["CropTillageDepth","SeedlingsPerPit","TransplantingIrrigationHours","NoFertilizerAppln","CropOrgFYM_per_Acre","Ganaura_per_Acre"]

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cdf[num_cols])

# Normalizing
cdf[num_cols] = normalize(X_scaled)

In [114]:
cdf = cdf.drop(columns=["ID"])

# Spectral clustering

In [115]:
def run_spectral(k_range, input_df, output_df):
  for k in k_range:
    spectral = SpectralClustering(n_clusters=k, assign_labels='discretize', random_state=0, n_init=200).fit(input_df)
    colname = 'k' + str(k) + "label"
    col_list.append(colname)
    output_df[colname] = spectral.labels_

In [116]:
col_list = []
k_range = range(2,6)
cdf_results = cdf.copy()

run_spectral(k_range, cdf, cdf_results)

In [117]:
print(cdf_results["k2label"].value_counts())
print(cdf_results["k3label"].value_counts())
print(cdf_results["k4label"].value_counts())
print(cdf_results["k5label"].value_counts())

k2label
0    3904
1    3836
Name: count, dtype: int64
k3label
2    2709
0    2591
1    2440
Name: count, dtype: int64
k4label
0    2669
2    2622
3    1360
1    1089
Name: count, dtype: int64
k5label
1    2150
0    2150
2    1541
3     970
4     929
Name: count, dtype: int64


In [118]:
cdf_results["k2label"] = cdf_results["k2label"].replace({0:"A", 1:"B"})
cdf_results["k3label"] = cdf_results["k3label"].replace({0:"A", 1:"B", 2:"C"})
cdf_results["k4label"] = cdf_results["k4label"].replace({0:"A", 1:"B", 2:"C", 3:"D"})
cdf_results["k5label"] = cdf_results["k5label"].replace({0:"A", 2:"B", 1:"C", 3:"D", 4:"E"})

In [119]:
cdf_results.loc[cdf_results["k5label"]=="A"].k2label.value_counts()

k2label
A    1124
B    1026
Name: count, dtype: int64

In [120]:
# MERGING CLUSTER LABELS BACK WITH ORIGINAL DF

tempo = cdf_results[["k2label","k3label","k4label","k5label"]]
df2 = pd.concat([df,tempo], axis=1)
tempo.shape, df.shape, df2.shape

((7740, 4), (7740, 94), (7740, 98))

In [121]:
df2.loc[df2["k2label"]=="B"].CropTillageDepth.value_counts()

CropTillageDepth
4    1570
5     988
6     568
3     457
2     111
8      67
7      61
1      14
Name: count, dtype: int64

In [122]:
fig = px.box(df2, x="CropTillageDepth", color="k3label", width=600, height=400)
fig.show()

fig = px.box(df2, x="SeedlingsPerPit", color="k3label", width=600, height=400)
fig.show()

fig = px.histogram(df2, x="TransplantingIrrigationSource", color="k3label", width=600, height=400)
fig.show()

#fig = px.histogram(df2, x="OrgFertilizers", color="k3label", width=600, height=400)
#fig.show()

fig = px.box(df2, x="CropOrgFYM_per_Acre", color="k3label", width=600, height=400)
fig.show()

fig = px.histogram(df2, x="Harv_method", facet_col="k3label", width=600, height=400)
fig.show()

fig = px.histogram(df2, x="Threshing_method", facet_col="k3label", width=600, height=400)
fig.show()

fig = px.histogram(df2, x="District", facet_col="k3label", width=800, height=400)
fig.show()

fig = px.histogram(df2, x="Block", facet_col="k3label", width=900, height=400)
fig.show()

fig = px.box(df2, x="Yield_per_Acre", color="k3label", width=900, height=400)
fig.show()

fig = px.box(df2, x="Yield_per_Acre", color="k2label", width=900, height=400)
fig.show()

In [123]:
date_cols = [datetime_cols[0], datetime_cols[1], datetime_cols[2], datetime_cols[3], datetime_cols[4]]
df2[date_cols] = df2[date_cols].apply(pd.to_datetime, errors='coerce')


In [124]:
# Exporting df with cluster labels
df2.to_csv('cleaned_fulldf_withclusters.csv',index=False)

In [125]:
df2.head()

Unnamed: 0,ID,Set,District,Block,CultLand,CropTillageDate,CropTillageMonth,CropTillageDepth,CropEstMethod,RcNursEstDate,NursingMonth,SeedingSowingTransplanting,SowTransplantMonth,SeedlingsPerPit,TransplantingIrrigationHours,TransplantingIrrigationSource,TransplantingIrrigationPowerSource,TransIrriCost,TransIrriCost_per_Acre,StandingWater,Ganaura,Ganaura_capped,Ganaura_per_Acre,CropOrgFYM,CropOrgFYM_per_Acre,PCropSolidOrgFertAppMethod,NoFertilizerAppln,BasalDAP,BasalDAP_per_Acre,BasalUrea,BasalUrea_per_Acre,MineralFertAppMethod,1tdUrea,1tdUrea_per_Acre,1appDaysUrea,2tdUrea,2tdUrea_per_Acre,2appDaysUrea,MineralFertAppMethod.1,Harv_method,Harv_date,HarvestMonth,Harv_hand_rent,Harv_hand_rent_per_Acre,Threshing_date,ThreshingMonth,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield,Yield_per_Acre,LandPrepMethod_TractorPlough,LandPrepMethod_FourWheelTracRotavator,LandPrepMethod_WetTillagePuddling,LandPrepMethod_BullockPlough,LandPrepMethod_Other,NursDetFactor_CalendarDate,NursDetFactor_PreMonsoonShowers,NursDetFactor_IrrigWaterAvailability,NursDetFactor_LabourAvailability,NursDetFactor_SeedAvailability,TransDetFactor_LabourAvailability,TransDetFactor_CalendarDate,TransDetFactor_RainArrival,TransDetFactor_IrrigWaterAvailability,TransDetFactor_SeedlingAge,CropbasalFerts_Urea,CropbasalFerts_DAP,CropbasalFerts_Other,CropbasalFerts_NPK,CropbasalFerts_MoP,CropbasalFerts_NPKS,CropbasalFerts_SSP,CropbasalFerts_None,FirstTopDressFert_Urea,FirstTopDressFert_DAP,FirstTopDressFert_NPK,FirstTopDressFert_NPKS,FirstTopDressFert_SSP,FirstTopDressFert_Other,OrgFertilizers_Ganaura,OrgFertilizers_FYM,OrgFertilizers_VermiCompost,OrgFertilizers_Pranamrit,OrgFertilizers_Ghanajeevamrit,OrgFertilizers_Jeevamrit,OrgFertilizers_PoultryManure,Days_bw_Nurs_SowTransp,Days_bw_SowTransp_Harv,Days_bw_Harv_Thresh,Days_bw_Nurs_Harv,Nb_of_NaN,k2label,k3label,k4label,k5label
0,ID_GTFAC7PEVWQ9,train,Nalanda,Noorsarai,45,2022-07-20,July,5,Manual_PuddledRandom,2022-06-27,June,2022-07-21,July,2.0,5.0,Boring,Electric,200.0,640.0,2.0,0.0,0.0,0.0,0.0,0.0,,2,0.0,0.0,20.0,64.0,Broadcasting,15.0,48.0,18.0,0.0,0.0,,Broadcasting,machine,2022-11-16,November,0.0,0.0,2022-11-16,November,machine,30,40,plowed_in_soil,0.3125,600,1920.0,True,True,False,False,False,True,False,True,False,True,False,True,True,True,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,24.0,118.0,0.0,142.0,2,A,B,D,B
1,ID_TK40ARLSPOKS,train,Nalanda,Rajgir,26,2022-07-18,July,5,Manual_PuddledRandom,2022-06-20,June,2022-07-20,July,2.0,5.0,Boring,Electric,125.0,400.0,3.0,0.0,0.0,0.0,0.0,0.0,,2,15.0,48.0,10.0,32.0,Broadcasting,20.0,64.0,39.0,0.0,0.0,,Broadcasting,hand,2022-11-25,November,3.0,9.6,2022-12-24,December,machine,24,10,plowed_in_soil,0.3125,600,1920.0,True,True,True,False,False,True,True,True,True,True,False,True,True,True,True,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,30.0,128.0,29.0,158.0,2,B,C,C,B
2,ID_1FJY2CRIMLZZ,train,Gaya,Gurua,10,2022-06-30,June,6,Manual_PuddledRandom,2022-06-20,June,2022-08-13,August,2.0,4.0,Boring,Electric,80.0,540.0,2.0,1.0,1.0,6.75,1.0,6.75,SoilApplied,2,4.0,27.0,0.0,0.0,SoilApplied,5.0,33.75,65.0,0.0,0.0,,RootApplication,hand,2022-12-12,December,480.0,3240.0,2023-01-11,January,machine,30,10,plowed_in_soil,0.148148,225,1518.75,True,True,False,False,False,False,True,True,True,False,False,False,False,True,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,54.0,121.0,30.0,175.0,1,B,A,B,C
3,ID_I3IPXS4DB7NE,train,Gaya,Gurua,15,2022-06-16,June,6,Manual_PuddledRandom,2022-06-17,June,2022-07-17,July,2.0,4.0,Boring,,250.0,1125.0,,1.0,1.0,4.5,0.0,0.0,SoilApplied,2,6.0,27.0,3.0,13.5,Broadcasting,5.0,22.5,5.0,0.0,0.0,,Broadcasting,hand,2022-12-02,December,240.0,1080.0,2022-12-29,December,hand,26,10,plowed_in_soil,0.222222,468,2106.0,True,True,False,False,False,True,True,True,True,True,False,True,True,True,True,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,30.0,138.0,27.0,168.0,6,A,B,D,C
4,ID_4T8YQWXWHB4A,train,Nalanda,Noorsarai,60,2022-07-19,July,4,Manual_PuddledRandom,2022-06-21,June,2022-07-20,July,2.0,9.0,Boring,Electric,300.0,640.0,2.0,0.0,0.0,0.0,0.0,0.0,,2,15.0,32.0,30.0,64.0,Broadcasting,30.0,64.0,26.0,0.0,0.0,,Broadcasting,machine,2022-11-30,November,0.0,0.0,2022-12-02,December,machine,24,40,plowed_in_soil,0.46875,550,1173.333333,True,False,True,False,False,True,False,True,False,True,False,False,True,True,True,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,29.0,133.0,2.0,162.0,2,B,C,C,A
