# Problem Statement


Recently we have observed the emerging concept of smart farming that makes agriculture more efficient and effective with the help of high-precision algorithms. The mechanism that drives it is Machine Learning — the scientific field that gives machines the ability to learn without being strictly programmed. It has emerged together with big data technologies and high-performance computing to create new opportunities to unravel, quantify, and understand data intensive processes in agricultural operational environments.

Machine learning is everywhere throughout the whole growing and harvesting cycle. It begins with a seed being planted in the soil — from the soil preparation, seeds breeding and water feed measurement — and it ends when neural networks pick up the harvest determining the ripeness with the help of computer vision.

This weekend get ready to harness the power of ML to solve a problem based in agriculture and get a chance to showcase your skills and win AV points!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries Needed

In [None]:
from matplotlib import rcParams
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
import lightgbm as lgb
import catboost as cb 
import xgboost as xgb
import eli5
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/av-janatahack-machine-learning-in-agriculture/train_yaOffsB.csv",
        "/kaggle/input/av-janatahack-machine-learning-in-agriculture/test_pFkWwen.csv",
        "/kaggle/input/av-janatahack-machine-learning-in-agriculture/sample_submission_O1oDc4H.csv",
    )
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Number of distinct IDs in the training set. The parentheses matter: without the
# call the cell only shows the bound method object instead of the count.
train.ID.nunique()

# Exploratory Data Analysis

In [None]:
# Use a larger default canvas for the matplotlib charts.
rcParams["figure.figsize"] = (15, 10)
# Bar chart of the number of missing values in each training column.
missing_per_column = train.isna().sum()
missing_per_column.plot(kind="bar")

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# Bar chart of how many training rows fall into each Crop_Damage class.
damage_counts = train.Crop_Damage.value_counts()
damage_counts.plot(kind='bar', title='Crop Damaged', color=['red'])


In [None]:
# Total Estimated_Insects_Count summed over all rows of each Crop_Type.
train.groupby('Crop_Type')['Estimated_Insects_Count'].sum()

In [None]:
# Compare crop types by their summed insect counts to see which attracts more pests.
insects_per_crop = train.groupby('Crop_Type')['Estimated_Insects_Count'].sum()
insects_per_crop.plot(kind='bar')

In [None]:
# Square canvas for the following charts.
rcParams["figure.figsize"] = (10, 10)
# Summed insect counts per soil type, drawn as a bar chart.
insects_per_soil = train.groupby('Soil_Type')['Estimated_Insects_Count'].sum()
insects_per_soil.plot(kind='bar')

Soil type 0 is more prone to Insects

In [None]:
# Plot the insect count against ID for the first 5000 training rows only.
first_rows = train[:5000]
fig = px.line(
    data_frame=first_rows,
    x="ID",
    y="Estimated_Insects_Count",
)
fig.show()

From this we can see that there is some pattern in the **Estimated_Insects_Count**.

In [None]:
# Frequency of each pesticide-use category, reshaped into a two-column frame for plotting.
pesti = train["Pesticide_Use_Category"].value_counts()
pesti_df = pd.DataFrame(
    dict(
        pesticide=pesti.index,
        frequency=pesti.values,
    )
)

In [None]:
# Bar chart of pesticide category frequencies, one colour per category value.
fig = px.bar(data_frame=pesti_df,x="pesticide",y="frequency",color = "pesticide")
fig.show()

In [None]:
# Tag every row with the frame it came from, then stack both frames so that
# preprocessing can be applied to train and test in one go.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['train_or_test'] = origin
df = pd.concat([train, test])

In [None]:
# Preview the combined train + test frame.
df.head()

In [None]:
# Fill missing Number_Weeks_Used values with the most frequent value of the
# combined (train + test) column.
most_common_weeks = df['Number_Weeks_Used'].mode()[0]
df['Number_Weeks_Used'] = df['Number_Weeks_Used'].fillna(most_common_weeks)

In [None]:
# Preview the combined frame after imputation.
df.head()

In [None]:

# Split the combined frame back into its train and test parts using the origin tag.
# drop() without inplace returns a new frame, so later column assignments on
# train/test do not operate on a slice of df (which triggers SettingWithCopyWarning).
train = df.loc[df.train_or_test == 'train'].drop(columns=['train_or_test'])
test = df.loc[df.train_or_test == 'test'].drop(columns=['train_or_test'])

In [None]:
# Preview the training frame after the split.
train.head()

In [None]:
# Missing-value count per column in the training frame.
train.isna().sum()

In [None]:
# Missing-value count per column in the test frame.
test.isna().sum()

# Creating folds using StratifiedKFold

In [None]:
# Shuffle the rows with a fixed seed so the fold assignment (and therefore every
# cross-validation score below) is reproducible between runs. sample() also
# returns a fresh frame, so adding a column afterwards is safe.
train = train.sample(frac=1, random_state=27).reset_index(drop=True)

# -1 marks "not assigned yet"; the loop below overwrites it for every row.
train["kfold"] = -1

y = train.Crop_Damage.values

kf = model_selection.StratifiedKFold(n_splits=5)

# Tag the validation rows of each split with that split's index.
for f, (t_, v_) in enumerate(kf.split(X=train, y=y)):
    train.loc[v_, 'kfold'] = f

In [None]:
# Preview the shuffled training frame with its new kfold column.
train.head()

In [None]:
# ID is removed from both frames; the test frame additionally loses the
# Crop_Damage column it picked up when it was concatenated with train.
train = train.drop(columns=["ID"])
test = test.drop(columns=["ID", "Crop_Damage"])

# LGBM Classifier

In [None]:
def run(train, fold):
    """Fit a LightGBM classifier on every fold except ``fold`` and score it on ``fold``.

    Args:
        train: DataFrame holding the feature columns plus the ``Crop_Damage``
            target and the ``kfold`` fold-index column.
        fold: Value of ``kfold`` identifying the rows held out for validation.

    Returns:
        Accuracy of the fitted model on the held-out fold.
    """
    df_train = train[train.kfold != fold].reset_index(drop=True)
    df_valid = train[train.kfold == fold].reset_index(drop=True)

    # kfold is bookkeeping, not a feature: it never equals `fold` in the training
    # rows and always equals it in the validation rows, so it must be dropped too.
    x_train = df_train.drop(["Crop_Damage", "kfold"], axis=1)
    y_train = df_train["Crop_Damage"].values

    x_valid = df_valid.drop(["Crop_Damage", "kfold"], axis=1)
    y_valid = df_valid["Crop_Damage"].values

    model = lgb.LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
    model.fit(x_train, y_train)
    preds = model.predict(x_valid)

    # The metric is plain accuracy, so the report is labelled accordingly.
    score = metrics.accuracy_score(y_valid, preds)
    print(f"Fold = {fold}, Accuracy = {score}")
    return score


In [None]:
# Evaluate the model once per fold, holding out folds 0 through 4 in turn.
for fold_ in range(5):
    run(train=train,fold=fold_)

# XGB Classifier

In [None]:
def run(train, fold):
    """Fit an XGBoost classifier on every fold except ``fold`` and score it on ``fold``.

    Args:
        train: DataFrame holding the feature columns plus the ``Crop_Damage``
            target and the ``kfold`` fold-index column.
        fold: Value of ``kfold`` identifying the rows held out for validation.

    Returns:
        Accuracy of the fitted model on the held-out fold.
    """
    df_train = train[train.kfold != fold].reset_index(drop=True)
    df_valid = train[train.kfold == fold].reset_index(drop=True)

    # Neither the target nor the fold index may be used as a feature.
    x_train = df_train.drop(["Crop_Damage", "kfold"], axis=1)
    y_train = df_train["Crop_Damage"].values

    x_valid = df_valid.drop(["Crop_Damage", "kfold"], axis=1)
    y_valid = df_valid["Crop_Damage"].values

    model = xgb.XGBClassifier(
        objective="multi:softprob",
        num_class=3,
        max_depth=8,
        eta=0.01,
        subsample=0.7,
        colsample_bytree=0.8,
        min_child_weight=40,
        max_delta_step=3,
        gamma=0.3,
        eval_metric="merror",
    )
    model.fit(x_train, y_train)
    preds = model.predict(x_valid)

    # The metric is plain accuracy, so the report is labelled accordingly.
    score = metrics.accuracy_score(y_valid, preds)
    print(f"Fold = {fold}, Accuracy = {score}")
    return score


In [None]:
# Evaluate the model once per fold, holding out folds 0 through 4 in turn.
for fold_ in range(5):
    run(train=train,fold=fold_)

In [None]:
# Feature matrix and target vector built from the whole training frame
# (all folds), with the target and the fold index excluded from the features.
non_feature_columns = ["Crop_Damage", "kfold"]
x_train = train.drop(columns=non_feature_columns)
y_train = train["Crop_Damage"].values

In [None]:
# Preview the feature matrix used for the final fits.
x_train.head()

In [None]:
# Hyper-parameters of the XGBoost model, collected in one mapping and unpacked
# into the constructor.
xgb_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "max_depth": 8,
    "eta": 0.01,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "min_child_weight": 40,
    "max_delta_step": 3,
    "gamma": 0.3,
    "eval_metric": "merror",
}
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# Fit the XGBoost model on the full training matrices.
model.fit(x_train,y_train)

In [None]:
# Per-class probabilities for every test row (one column per class).
y_pred_xgb = model.predict_proba(test)
y_pred_xgb

In [None]:
# predict_proba yields one probability column per class, which cannot be stored
# in the single Crop_Damage column; keep only the most probable class per row.
# NOTE(review): assumes the class labels are 0..2 in column order — the ensemble
# cell makes the same assumption.
sub["Crop_Damage"] = np.argmax(y_pred_xgb, axis=1)
# index=False keeps the file to the columns of the sample submission instead of
# prepending an unnamed row-index column.
sub.to_csv("XGB_Model.csv", index=False)
# Last expression of the cell so the preview is actually displayed.
sub.head()

# ELI5 Implementation for XGB Model

In [None]:
# Explain the fitted XGBoost model's weights, limited to the top 10 entries.
eli5.explain_weights_xgboost(model,top=10)

In [None]:
# Render the model weights, labelled with the training column names.
eli5.show_weights(model,feature_names=x_train.columns.tolist())

Pesticide_Use_Category and Number_Weeks_Used are more important for this model

In [None]:
# Explain the prediction for a single training row (positional index 1),
# showing each feature's value alongside its contribution.
eli5.show_prediction(model, x_train.iloc[1], feature_names = x_train.columns.tolist(), 
                show_feature_values=True)

# CatBoost Classifier

In [None]:
def run(train, fold):
    """Fit a CatBoost classifier on every fold except ``fold`` and score it on ``fold``.

    Args:
        train: DataFrame holding the feature columns plus the ``Crop_Damage``
            target and the ``kfold`` fold-index column.
        fold: Value of ``kfold`` identifying the rows held out for validation.

    Returns:
        Accuracy of the fitted model on the held-out fold.
    """
    df_train = train[train.kfold != fold].reset_index(drop=True)
    df_valid = train[train.kfold == fold].reset_index(drop=True)

    # Neither the target nor the fold index may be used as a feature.
    x_train = df_train.drop(["Crop_Damage", "kfold"], axis=1)
    y_train = df_train["Crop_Damage"].values

    x_valid = df_valid.drop(["Crop_Damage", "kfold"], axis=1)
    y_valid = df_valid["Crop_Damage"].values

    model = cb.CatBoostClassifier(
        random_state=27,
        max_depth=4,
        task_type="CPU",
        devices="0:1",
        n_estimators=1000,
        verbose=500,
    )
    model.fit(x_train, y_train)
    preds_t = model.predict(x_valid)

    # The metric is plain accuracy, so the report is labelled accordingly.
    score = metrics.accuracy_score(y_valid, preds_t)
    print(f"Fold = {fold}, Accuracy = {score}")
    return score

In [None]:
# Evaluate the model once per fold, holding out folds 0 through 4 in turn.
for fold_ in range(5):
    run(train=train,fold=fold_)

In [None]:
# CatBoost model configured from one settings mapping and fitted on the
# module-level x_train / y_train matrices.
cat_params = dict(
    random_state=27,
    max_depth=4,
    task_type="CPU",
    devices="0:1",
    n_estimators=1000,
    verbose=500,
)
cat_model = cb.CatBoostClassifier(**cat_params)
cat_model.fit(x_train, y_train)

In [None]:
# Per-class probabilities from the CatBoost model for every test row.
y_pred_cat = cat_model.predict_proba(test)

In [None]:
# Inspect the first ten probability rows.
y_pred_cat[:10]

In [None]:
# Weighted blend of the two models' class-probability matrices, followed by
# picking the most probable class for each test row.
cb_weight = 0.6  # weight of the CatBoost probabilities (y_pred_cat)
lb_weight = 0.4  # weight of the XGBoost probabilities (y_pred_xgb); name kept for compatibility

# One vectorised expression replaces the per-row, per-class Python loop and
# works for any number of classes, not just three.
predictions = cb_weight * np.asarray(y_pred_cat) + lb_weight * np.asarray(y_pred_xgb)
preds_ensemble = np.argmax(predictions, axis=1)


In [None]:
# Inspect the first ten blended class predictions.
preds_ensemble[:10]

In [None]:
# Store the blended class predictions in the submission frame and write it out.
sub["Crop_Damage"] = preds_ensemble
# index=False keeps the file to the columns of the sample submission instead of
# prepending an unnamed row-index column.
sub.to_csv("Ensemble.csv", index=False)
sub.head()