## Building a model with least possible error to predict Crop yield based on various factors

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from xgboost import XGBRegressor

## Importing Dataset

In [2]:
data = pd.read_csv('Final.csv')
data.head()

Unnamed: 0,S.No,Crop,District,Year,Min Temp,Max. Temp,Total Rainfall,Total Yield
0,1,bengal gram,ADILABAD,2016,12.34,42.66,1130.15,1551
1,4,groundnut,ADILABAD,2016,12.34,42.66,1130.15,2068
2,7,maize,ADILABAD,2016,12.34,42.66,1130.15,5712
3,10,bengal gram,BHADRADRI,2016,16.42,42.65,1145.008696,0
4,13,groundnut,BHADRADRI,2016,16.42,42.65,1145.008696,2186


In [3]:
data.describe()

Unnamed: 0,S.No,Year,Min Temp,Max. Temp,Total Rainfall,Total Yield
count,279.0,279.0,279.0,279.0,279.0,279.0
mean,140.0,2017.0,14.844946,41.425054,751.382193,2614.860215
std,80.684571,0.817964,1.467885,1.012367,207.994113,1850.392756
min,1.0,2016.0,12.34,39.1,422.9185,0.0
25%,70.5,2016.0,13.68,40.59,585.302609,1516.0
50%,140.0,2017.0,14.67,41.49,703.738889,2068.0
75%,209.5,2018.0,15.89,42.19,894.181818,3126.0
max,279.0,2018.0,17.75,43.34,1290.163636,9282.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   S.No            279 non-null    int64  
 1   Crop            279 non-null    object 
 2   District        279 non-null    object 
 3   Year            279 non-null    int64  
 4   Min Temp        279 non-null    float64
 5   Max. Temp       279 non-null    float64
 6   Total Rainfall  279 non-null    float64
 7   Total Yield     279 non-null    int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 17.6+ KB


## Splitting into Train and Test set

In [5]:
X = data.drop(['S.No','Total Yield'], axis = 1)
Y = np.array(data['Total Yield'])
Y = Y.reshape((-1,1))

In [6]:
X_train, X_test, Y_train, Y_test  = train_test_split(X, Y, test_size=0.2, random_state=10)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(223, 6) (56, 6) (223, 1) (56, 1)


## Data Processing using Pipeline

In [7]:
num_transform = Pipeline([('imputer', SimpleImputer(strategy="median"))])
num_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['float64']]

cat_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
cat_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['object']]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transform, cat_cols),
        ('num', num_transform, num_cols)
    ])
X_train_prep = preprocessor.fit_transform(X_train)

In [8]:
X_train_prep.shape

(223, 37)

In [9]:
X_test_prep = preprocessor.transform(X_test)

## Linear Regression

In [10]:
reg = linear_model.LinearRegression()
reg.fit(X_train_prep, Y_train)

LinearRegression()

In [11]:
Y_pred = reg.predict(X_test_prep)

In [12]:
lin_mae = mean_absolute_error(Y_test, Y_pred)
print(f'Mean Absolute Error using Linear Regression : {lin_mae}')

Mean Absolute Error using Linear Regression : 996.5247686871434


## XGBoost

#### Parameter Tuning

In [13]:
tuned_parameters = {'learning_rate' : [i*0.01 for i in range(1,3)], 'n_estimators' : [i for i in range(500,800,100)], 
                    'max_depth' : [i for i in range(5,9)], 'n_jobs' : [i for i in range(3,8)]}

In [14]:
xgb_reg = XGBRegressor()
xgb = GridSearchCV(xgb_reg, tuned_parameters, cv=10, scoring='neg_mean_absolute_error')
xgb.fit(X_train_prep, Y_train)
xgb.best_score_

-630.1944661423152

In [15]:
xgb.best_params_

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500, 'n_jobs': 3}

## Predicting Values using Best Model

In [17]:
Y_pred = xgb.predict(X_test_prep)

In [18]:
xgb_mae = mean_absolute_error(Y_test, Y_pred)
print(f'Mean Absolute Error using XGBoost: {xgb_mae}')

Mean Absolute Error using XGBoost: 449.4672498703003


## Check for underfitting

In [26]:
print(f'Training set score: {abs(xgb.score(X_train_prep,Y_train))}')
print(f'Test set score: {abs(xgb.score(X_test_prep,Y_test))}')

Training set score: 194.29075506663642
Test set score: 449.4672498703003
