In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor,     GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, r2_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Relative path to the crop-recommendation CSV file.
crop_filepath = "../input/crop-recommendation-dataset/Crop_recommendation.csv"
# Read the CSV into the DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=crop_filepath)

In [None]:
# Preview the first five rows (5 is the pandas default, spelled out here).
data.head(n=5)

In [None]:
# Preview the last five rows (5 is the pandas default, spelled out here).
data.tail(n=5)

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
data.info()

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, ...) per column.
data.describe()

In [None]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
data.isna().sum()

In [None]:
# Count the number of distinct values in each column.
data.nunique()

In [None]:
# Build the profiling report through the explicitly imported class rather than
# through the DataFrame accessor that pandas_profiling attaches on import.
ProfileReport(data)

# Visualization

In [None]:
# Heat map of the pairwise correlation between the numeric columns.
plt.figure(figsize=(8,8))
plt.title("Correlation between features")
# Restrict the frame to numeric columns before correlating: "label" is passed
# to LabelEncoder further down, so at this point it is not yet numeric.
# pandas >= 2.0 no longer drops non-numeric columns silently in `corr()` and
# raises instead. `select_dtypes` keeps the cell working on old and new pandas.
corr = data.select_dtypes(include="number").corr()
sns.heatmap(corr, annot=True)

In [None]:
# Bar chart of temperature per crop (seaborn's default estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 8))
ax.set_title("Temperature relation with crops")
sns.barplot(data=data, x="temperature", y="label", ax=ax)
ax.set_ylabel("crops")

In [None]:
# Bar chart of humidity per crop (seaborn's default estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 8))
ax.set_title("Humidity relation with crops")
sns.barplot(data=data, x="humidity", y="label", ax=ax)
ax.set_ylabel("crops")

In [None]:
# Bar chart of soil pH per crop (seaborn's default estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 8))
ax.set_title("pH relation with crops")
sns.barplot(data=data, x="ph", y="label", ax=ax)
ax.set_ylabel("crops")

In [None]:
# Bar chart of rainfall per crop (seaborn's default estimator is the mean).
fig, ax = plt.subplots(figsize=(6, 8))
ax.set_title("Rainfall relation with crops")
sns.barplot(data=data, x="rainfall", y="label", ax=ax)
ax.set_ylabel("crops")

In [None]:
# One point per sample: temperature on x, crop on y, colour encodes pH.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Temperature and pH effect values for crops")
sns.scatterplot(x="temperature", y="label", hue="ph", data=data, ax=ax)
ax.set_ylabel("Crops")

In [None]:
# One point per sample: temperature on x, crop on y, colour encodes humidity.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Temperature and humidity effect values for crops")
sns.scatterplot(x="temperature", y="label", hue="humidity", data=data, ax=ax)
ax.set_ylabel("Crops")

In [None]:
# One point per sample: temperature on x, crop on y, colour encodes rainfall.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Temperature and Rainfall effect values for crops")
sns.scatterplot(x="temperature", y="label", hue="rainfall", data=data, ax=ax)
ax.set_ylabel("Crops")

In [None]:
# Reshape the N, P and K columns to long form (one row per crop/element/value)
# so that a single categorical plot can colour the points by element.
data_npk = data.melt(
    id_vars=["label"],
    value_vars=["N", "P", "K"],
    var_name='element',
    value_name='value',
)
sns.catplot(data=data_npk, x="value", y="label", hue="element")
plt.ylabel("Crop")
plt.title("Values of N, P, K in soil")

# Predictions

In [None]:
# Replace the crop label with integer class codes. The fitted encoder is kept
# so the codes can later be mapped back via `encoder.inverse_transform`.
# NOTE(review): this overwrites `data["label"]` in place; re-running the cell
# re-fits the encoder on the integer codes.
encoder = LabelEncoder()
data["label"] = encoder.fit_transform(data["label"])

In [None]:
# Model inputs: every column except the encoded crop label.
features = data.drop(columns="label")
# Model output: the encoded crop label.
target = data["label"]

In [None]:
# Hold out a test split; 0.25 is scikit-learn's default test fraction, written
# out here for the reader. The fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

## Linear Regression

In [None]:
# Fit ordinary least squares on the integer-encoded label. For regressors
# `score` returns R^2, not accuracy.
# NOTE(review): the encoded label is a class code, so a regression fit on it
# is of limited meaning; it is excluded from the summary table further down.
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_train_score = lr.score(X_train, y_train)
lr_pred = lr.score(X_test, y_test)

print("Training score: {:.3f}".format(lr_train_score))
print("Test score: {:.3f}".format(lr_pred))

## Decision Tree Classifier

In [None]:
# Depth-limited decision tree; `score` returns mean accuracy for classifiers.
tree = DecisionTreeClassifier(max_depth=15, random_state=0)
tree.fit(X_train, y_train)

tree_train_score = tree.score(X_train, y_train)
tree_pred = tree.score(X_test, y_test)

print("Training score: {:.3f}".format(tree_train_score))
print("Test score: {:.3f}".format(tree_pred))

## Random Forests

In [None]:
# Random forest of 10 trees, each split considering 3 candidate features.
rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=0)
rf.fit(X_train, y_train)

rf_train_score = rf.score(X_train, y_train)
rf_pred = rf.score(X_test, y_test)

print("Training score: {:.3f}".format(rf_train_score))
print("Test score: {:.3f}".format(rf_pred))

## GradientBoostingClassifier

In [None]:
# Gradient-boosted trees: 20 boosting stages of depth-4 trees, each split
# considering 2 candidate features.
gbr = GradientBoostingClassifier(
    n_estimators=20,
    max_depth=4,
    max_features=2,
    random_state=0,
)
gbr.fit(X_train, y_train)

gbr_train_score = gbr.score(X_train, y_train)
gbr_pred = gbr.score(X_test, y_test)

print("Training score: {:.3f}".format(gbr_train_score))
print("Test score: {:.3f}".format(gbr_pred))

## Support Vector Classifier

In [None]:
# Support vector classifier trained on the raw (unscaled) features.
svm = SVC(C=100, gamma=0.001)
svm.fit(X_train, y_train)

svm_train_score = svm.score(X_train, y_train)
svm_pred = svm.score(X_test, y_test)

print("Training score: {:.3f}".format(svm_train_score))
print("Test score: {:.3f}".format(svm_pred))

## Logistic regression

In [None]:
# Regularised logistic regression (C=0.1) with a generous iteration budget.
log_reg = LogisticRegression(C=0.1, max_iter=100000)
log_reg.fit(X_train, y_train)

log_reg_train_score = log_reg.score(X_train, y_train)
log_reg_pred = log_reg.score(X_test, y_test)

print("Training score: {:.3f}".format(log_reg_train_score))
print("Test score: {:.3f}".format(log_reg_pred))

## MLPClassifier

In [None]:
# Standardise the features by hand. The per-feature mean and standard
# deviation come from the training split only and are reused unchanged for the
# test split, so no test-set statistics enter the preprocessing.
mean_on_train = X_train.mean(axis=0)
std_on_train = X_train.std(axis=0)  # pandas default: sample std (ddof=1)

# (x - mean) / std, aligned on column labels
X_train_scaled = X_train.sub(mean_on_train).div(std_on_train)
X_test_scaled = X_test.sub(mean_on_train).div(std_on_train)

# Two hidden layers (100 and 50 units) with L2 penalty alpha=0.01.
mlp = MLPClassifier(
    random_state=0,
    max_iter=10000,
    alpha=0.01,
    hidden_layer_sizes=[100,50],
).fit(X_train_scaled, y_train)

mlp_train_score = mlp.score(X_train_scaled, y_train)
mlp_pred = mlp.score(X_test_scaled, y_test)

print("Training score: {:.3f}".format(mlp_train_score))
print("Test score: {:.3f}".format(mlp_pred))

In [None]:
# Test-set accuracy of every classifier, in the order the models were trained.
# The linear regression is left out because its score is R^2, not accuracy.
predictions_acc = dict(
    Model=[
        'Decision Tree',
        'Random Forest',
        'Gradient Boosting',
        'SVC',
        'Logistic Regression',
        'MLP',
    ],
    Accuracy=[
        tree_pred,
        rf_pred,
        gbr_pred,
        svm_pred,
        log_reg_pred,
        mlp_pred,
    ],
)

In [None]:
# Tabulate the scores, one row per model, with a fixed column order.
model_acc = pd.DataFrame(data=predictions_acc, columns=["Model", "Accuracy"])

In [None]:
# Display the per-model accuracy table as the cell output.
model_acc

In [None]:
# Unweighted mean of the classifiers' test accuracies.
mean_accuracy = model_acc["Accuracy"].mean()
print("Average accuracy (Ex linear regression): {:.3f}".format(mean_accuracy))