In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
import sklearn.metrics as mt
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import StandardScaler, LabelEncoder

## Reading the Dataset

In [None]:
# Load the crop-recommendation dataset; the path is relative to the notebook's
# working directory and points into the Kaggle input directory.
df = pd.read_csv("../input/crop-recommender-dataset-with-soil-nutrients/dataset.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Summarize column names, dtypes and non-null counts.
df.info()

## Number of Samples per Category

In [None]:
# Distinct class names found in the "label" column.
labels = df["label"].unique()

# Bar chart of how many samples each class has.
class_counts = df["label"].value_counts()
class_counts.plot(kind="bar")
plt.ylabel('No of Samples Each')
plt.xlabel('Categories')
plt.show()

## Range of Values Vs Count for Each Parameter

In [None]:
# Every column except the last one of df (presumably the target column) is
# treated as a feature from here on.
all_columns = df.columns[:-1]

# All features but the last share one 4x3 grid of histograms.
plt.figure(figsize=(15, 13))
for i, column in enumerate(all_columns[:-1], start=1):
    plt.subplot(4, 3, i)
    sns.histplot(df[column])
plt.show()

# The last feature is drawn on its own figure.
last_feature = all_columns[-1]
sns.histplot(df[last_feature])
plt.show()

## Parameter Values vs Crop Type

In [None]:
# One bar chart per feature: the feature's value for each crop label
# (bar heights are seaborn's default aggregate of the rows in each label).
for column in all_columns:
    plt.figure(figsize=(19, 7))
    sns.barplot(data=df, x="label", y=column)
    plt.title(f"{column} vs Crop Type")
    # Rotate the crop names so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

## Pairplot Showing Distribution Graph of Parameter Values for Crop Types

In [None]:
# Pairwise relationships between all numeric columns, coloured by crop label.
# sns.pairplot creates and sizes its own figure, so no plt.figure() call is
# made beforehand: a preceding plt.figure(figsize=(100, 80)) only opened an
# unused, very large blank figure.
sns.pairplot(df, hue="label")
plt.show()

## Correlation Matrix Using Heat Map

In [None]:
# Correlation heat map of the numeric columns.
# Non-numeric columns (the crop "label" is presumably text) are excluded
# explicitly: recent pandas versions no longer drop them silently in
# DataFrame.corr() and raise instead.
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize=(20, 15))
sns.heatmap(numeric_df.corr(), center=0, annot=True)
plt.show()

## Label Encoding

In [None]:
# Feature matrix: the columns selected in all_columns.
X = df[all_columns]

# Target vector: the "label" column encoded as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["label"])

print(X.shape, y.shape)

In [None]:
# Map every encoded class id back to its original label.
# The mapping is built from label_encoder.classes_ (index == encoded id), so it
# always covers exactly the classes the encoder has seen instead of assuming a
# fixed count of 6.
label_dict = dict(enumerate(label_encoder.classes_))
label_dict

## Splitting the Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=0)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
# This line reports the test split, so it is labelled accordingly.
print(f"Test Data: {X_test.shape}, {y_test.shape}")

## Importing the Essential Models for Training

In [None]:
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
from sklearn.pipeline import make_pipeline

In [None]:
# Per-model results, appended to by each model cell below:
#   acc      - accuracy on the whole dataset, in percent
#   acc_test - accuracy on the test split, in percent
#   model    - display name of the model
acc, acc_test, model = [], [], []
# Created for F1 scores; nothing in the visible cells appends to it.
f1scores = []

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Standardize the features, then fit a logistic regression on the training split.
lr_pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=2))
lr_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = lr_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(15, 9))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('Logistic Regression')

## K Neighbors Classifier

In [None]:
# Fit one scaled KNN pipeline per k in 1..49 and record its error rate on the
# test split (the fraction of test predictions that differ from y_test).
error_rate = []
k_values = range(1, 50)
for k in k_values:
    candidate = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    candidate.fit(X_train, y_train)
    predictions = candidate.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy at k = {k} is {accuracy}")
    error_rate.append(np.mean(predictions != y_test))

# Error rate as a function of k.
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to k = 1, hence the +1 when reporting the best k.
lowest_error = min(error_rate)
print("Minimum error:-", lowest_error, "at K =", error_rate.index(lowest_error) + 1)

In [None]:
# Standardize the features, then fit a k-nearest-neighbours classifier with k fixed at 3.
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = knn_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(15, 9))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('K Neighbor Classifier')

## Random Forest Classifier

In [None]:
# Standardize the features, then fit a random forest on the training split.
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=18))
rf_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = rf_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(15, 9))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('Random Forest Classifier')

## XGBoost Classifier

In [None]:
import xgboost
from xgboost import XGBClassifier

In [None]:
# Standardize the features, then fit an XGBoost classifier on the training split.
xgb_pipeline = make_pipeline(StandardScaler(), XGBClassifier(random_state=18))
xgb_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = xgb_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(30, 20))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('XGBoost Classifier')

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Standardize the features, then fit an entropy-criterion decision tree limited
# to depth 5 on the training split.
dt_pipeline = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(criterion="entropy", random_state=2, max_depth=5),
)
dt_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = dt_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(15, 9))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('Decision Tree Classifier')

## Gaussian Naive Bayes (GaussianNB)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Standardize the features, then fit a Gaussian naive Bayes classifier on the training split.
nb_pipeline = make_pipeline(StandardScaler(), GaussianNB())
nb_pipeline.fit(X_train, y_train)

# Evaluate on the test split first, then on the whole dataset.
# Each entry: (split name, features, true labels, list collecting the accuracy in percent).
# The whole dataset includes the training rows.
evaluation_splits = (
    ("Test", X_test, y_test, acc_test),
    ("Whole", X.values, y, acc),
)
for position, (split_name, features, targets, scores) in enumerate(evaluation_splits):
    if position:
        print()  # blank line between the two reports
    predictions = nb_pipeline.predict(features)
    accuracy = accuracy_score(targets, predictions)
    scores.append(accuracy * 100)
    print(f"Accuracy on {split_name} Data: {accuracy*100}%")
    plt.figure(figsize=(15, 9))
    # fmt="d" prints the counts as plain integers; the default ".2g" format
    # shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(confusion_matrix(targets, predictions), annot=True, fmt="d")
    plt.title(f"Confusion Matrix for {split_name} Data")
    plt.show()

model.append('Naive Bayes Classifier')

## Comparison of Whole-Data Accuracy Across the Trained Models

In [None]:
# Horizontal bar chart of each model's whole-data accuracy (in percent).
plt.figure(figsize=[12, 8], dpi=100)
plt.title('Accuracy on Whole Data Comparison')
plt.ylabel('Algorithm')
plt.xlabel('Accuracy')
sns.barplot(x=acc, y=model, palette='dark')

# Write the rounded accuracy at the end of each bar.
for row, score in enumerate(acc):
    annotation = str(round(score, 2))
    plt.text(score, row, annotation)

## Comparison of Test-Data Accuracy Across the Trained Models

In [None]:
# Horizontal bar chart of each model's test-split accuracy (in percent).
plt.figure(figsize=[12, 8], dpi=100)
plt.title('Accuracy on Test Data Comparison')
plt.ylabel('Algorithm')
plt.xlabel('Accuracy')
sns.barplot(x=acc_test, y=model, palette='dark')

# Write the rounded accuracy at the end of each bar.
for row, score in enumerate(acc_test):
    annotation = str(round(score, 2))
    plt.text(score, row, annotation)

## Saving Models

In [None]:
# Persist every fitted pipeline plus the id -> label mapping to the working directory.
# Each object gets its own file: the logistic regression pipeline is written to
# "lr_pipeline.pkl" (writing it to "nb_pipeline.pkl" would let the naive Bayes
# dump overwrite it, leaving the logistic regression model unsaved).
artifacts = {
    "lr_pipeline.pkl": lr_pipeline,
    "knn_pipeline.pkl": knn_pipeline,
    "rf_pipeline.pkl": rf_pipeline,
    "xgb_pipeline.pkl": xgb_pipeline,
    "dt_pipeline.pkl": dt_pipeline,
    "nb_pipeline.pkl": nb_pipeline,
    "label_dictionary.pkl": label_dict,
}
for file_name, artifact in artifacts.items():
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(file_name, "wb") as file_handle:
        pickle.dump(artifact, file_handle)

print("Saved All Models")