In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path; force_remount=True requests a fresh mount on every run.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

In [None]:
# Import libraries for data manipulations.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Import libraries for data visualizations.
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Import libraries for algorithms.
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Import libraries to analyze system metrics.
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Folder on the mounted drive that the dataset is read from.
data_dir = "/content/drive/My Drive/DataScience"

In [None]:
# Read the crop-recommendation CSV into a DataFrame and report its
# (rows, columns) shape.
csv_path = f"{data_dir}/Crop_recommendation.csv"
df = pd.read_csv(csv_path)
print(df.shape)

**Data Analysis**

In [None]:
# A notebook cell only renders the value of its LAST expression, so every
# summary is passed to display() explicitly; otherwise head(), describe()
# and the null counts would be computed and silently discarded.

# First rows of the dataset.
display(df.head())

# Column dtypes and non-null counts (info() prints its report itself).
df.info()

# Summary statistics of the numeric columns.
display(df.describe())

# Number of missing values per column.
display(df.isnull().sum())

# Number of rows per crop label.
display(df['label'].value_counts())

In [None]:
# Analyse the correlation between the numeric feature columns.
# `label` has not been encoded yet at this point (that happens in the
# Data Preparation section), so only numeric columns are kept: recent
# pandas versions raise on DataFrame.corr() when a string column is present.
numeric_df = df.select_dtypes(include='number')
corr = numeric_df.corr()
ax = sns.heatmap(corr, annot=True, cbar=True, cmap='coolwarm')

plt.title("Correlation Matrix", fontsize=20);

**Data Preparation**

In [None]:
# Replace the values of the `label` column with integer codes.
# The fitted encoder stays in LE so its classes_ remain available later.
LE = LabelEncoder()
LE.fit(df['label'])
df['label'] = LE.transform(df['label'])

# Show the original class values in encoded order.
print(LE.classes_)


In [None]:
# Model inputs: all seven measurement columns.
features = df.loc[:, ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
# Model target: the encoded crop label.
labels = df.loc[:, 'label']

In [None]:
# Running lists of per-model scores, filled by the model cells below.
accuracies, f1 = [], []

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit the min-max scaler on the training rows only, then apply the same
# transformation to both the training and the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(X_train.shape)
print(X_train)

## Decision Tree Classifier

In [None]:
# Decision tree on the scaled features.
# random_state is fixed because the tree permutes features randomly when
# choosing splits; without a seed the scores change from run to run.
tree = DecisionTreeClassifier(random_state=42)

# Training
tree = tree.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred_tree = tree.predict(X_test)
accuracy_dt = accuracy_score(Y_test, y_pred_tree)
f1_dt = f1_score(Y_test, y_pred_tree, average='macro')

# Record the scores (accuracy as a percentage, F1 as a fraction).
accuracies.append(accuracy_dt*100)
f1.append(f1_dt)

print("Accuracy of Decision Tree", accuracy_dt*100)
print("f1_score of Decision Tree", f1_dt)

In [None]:
# Importance the fitted decision tree assigns to each input feature.
feature_importance = tree.feature_importances_

print(feature_importance)

# Print each importance next to its column name instead of an anonymous
# "Feature i" index; `features` holds the columns in the order the tree saw them.
for name, importance in zip(features.columns, feature_importance):
    print(f"{name}: {importance}")

In [None]:
# Confusion matrix of the decision tree's test-set predictions.
cm = confusion_matrix(Y_test, y_pred_tree)

ax = sns.heatmap(cm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Decision Tree Classifier", fontsize=17)
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; the heatmap draws rows on the y-axis and
# columns on the x-axis, so x is "predicted" and y is "actual".
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## KNN Classifier

In [None]:
# Try several neighbourhood sizes and keep the test accuracy and macro F1
# obtained for each of them (both stored as fractions).
k_values = [3, 5, 7, 9]
knn_accuracy, knn_f1_score = [], []

for k_value in k_values:
    # Train a KNN classifier with the current k and score it on the test rows.
    model = KNeighborsClassifier(n_neighbors=k_value).fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    accuracy_knn = accuracy_score(Y_test, y_pred)
    f1_knn = f1_score(Y_test, y_pred, average='macro')

    knn_accuracy.append(accuracy_knn)
    knn_f1_score.append(f1_knn)


In [None]:
# One row per k with the scores collected in the sweep above.
compare_knn = pd.DataFrame({
    'K Value': k_values,
    "Accuracy": knn_accuracy,
    "F1_Score": knn_f1_score,
})

In [None]:
# Bar chart of test accuracy for each k, with the value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare_knn, x='K Value', y='Accuracy')
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('Accuracy For Different K Value');

In [None]:
# Bar chart of macro F1 for each k, with the value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare_knn, x='K Value', y='F1_Score')
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('F1_Score For Different K Values');

In [None]:
# Final KNN model with k=3.
model = KNeighborsClassifier(n_neighbors=3)

# Training
model.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred = model.predict(X_test)
accuracy_knn = accuracy_score(Y_test, y_pred)
# The F1 score must be computed from the KNN predictions (y_pred); it was
# previously computed from the decision tree's predictions (y_pred_tree).
f1_knn = f1_score(Y_test, y_pred, average='macro')

# Record the scores (accuracy as a percentage, F1 as a fraction).
accuracies.append(accuracy_knn*100)
f1.append(f1_knn)

print("Accuracy of KNN", accuracy_knn*100)
print("f1_score of KNN", f1_knn)

In [None]:
# Confusion matrix of the KNN test-set predictions.
cpm = confusion_matrix(Y_test, y_pred)

ax = sns.heatmap(cpm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for KNN Classifier", fontsize=17)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Naive Bayes

In [None]:
# Gaussian Naive Bayes on the scaled features.
gb = GaussianNB()

# Training
gb.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred_gb = gb.predict(X_test)
accuracy_gb = accuracy_score(Y_test, y_pred_gb)
# Macro averaging, matching the F1 computed for the other classifiers so the
# model-comparison table is like-for-like (this cell previously used 'weighted').
f1_gb = f1_score(Y_test, y_pred_gb, average='macro')

# Record both scores, as the decision tree and KNN cells do.
accuracies.append(accuracy_gb*100)
f1.append(f1_gb)

print("Accuracy of Naive Bayes", accuracy_gb*100)
print("f1_score of Naive Bayes", f1_gb)

In [None]:
# Confusion matrix of the Naive Bayes test-set predictions.
cm_gb = confusion_matrix(Y_test, y_pred_gb)

ax = sns.heatmap(cm_gb, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Naive Bayes Classifier", fontsize=17)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Support Vector Classifier

In [None]:
# Sweep the SVC regularisation strength C and plot the test accuracy per value.
# The per-C accuracies go into their own list so the shared `accuracies`
# list filled by the other model cells is not wiped.
svc_accuracies = []

# Values of C to iterate over
C_values = [0.1, 1, 5, 10, 100, 1000, 10000]

for c in C_values:
    sv = SVC(kernel='rbf', gamma='scale', C=c)

    # Training
    sv.fit(X_train, Y_train)

    # Evaluate on the held-out test rows.
    y_pred_svc = sv.predict(X_test)
    accuracy_svc = accuracy_score(Y_test, y_pred_svc)
    f1_svc = f1_score(Y_test, y_pred_svc, average='macro')

    svc_accuracies.append(accuracy_svc * 100)

    print(f"Results for C = {c}:")
    print("Accuracy of Support Vector Classifier:", accuracy_svc * 100)
    print("F1 score of Support Vector Classifier:", f1_svc)
    print("--------")

# NOTE: after the loop, y_pred_svc, accuracy_svc and f1_svc hold the results
# for the LAST C value; the confusion-matrix and comparison cells below read them.

# One evenly spaced bar per C value. Placing the bars at log10(C) put C=1, 5
# and 10 at 0, 0.7 and 1, closer together than the bar width, so they overlapped.
positions = np.arange(len(C_values))
plt.bar(positions, svc_accuracies, color='blue')
plt.xticks(positions, [str(value) for value in C_values])
plt.xlabel('C value')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy of Support Vector Classifier for Different C values')
plt.ylim(95, 100)  # zoom in on the top of the range; bars below 95% are clipped
plt.show()


In [None]:
# Confusion matrix of the SVC test-set predictions (y_pred_svc comes from
# the last C value of the sweep above).
cm_svc = confusion_matrix(Y_test, y_pred_svc)

ax = sns.heatmap(cm_svc, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Support Vector Classifier", fontsize=17)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Model Performance Comparisons

In [None]:
# Summary table of the four models trained on all features: accuracy as a
# percentage, F1 as a fraction, plus a legend tag used when both experiments
# are plotted side by side.
compare = pd.DataFrame({
    'Model': ["DT", "KNN", "NB", "SVC"],
    "Accuracy": [accuracy_dt*100, accuracy_knn*100, accuracy_gb*100, accuracy_svc*100],
    "F1_Score": [f1_dt, f1_knn, f1_gb, f1_svc],
    "Legend": ['with phosphorous'] * 4,
})

In [None]:
# Bar chart of accuracy per model, with the value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare, x='Model', y='Accuracy')
bars = ax.containers[0]
ax.bar_label(bars);
ax.set_title('Accuracy For Different Machine Learning Models');

In [None]:
# Bar chart of F1 score per model, with the value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare, x='Model', y='F1_Score')
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('F1_Score For Different Machine Learning Model');

In [None]:
# Correlation matrix of every column of df, drawn as an annotated heatmap.
corr = df.corr()
ax = sns.heatmap(
    data=corr,
    cmap='coolwarm',
    annot=True,
    cbar=True,
)

ax.set_title("Correlation Matrix", fontsize=20);

**Phosphorus and potassium are highly correlated.**

**Evaluating model performance after dropping a feature (phosphorus)**

In [None]:
# Model inputs without the phosphorous column 'P'.
features = df.loc[:, ['N', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
# Model target: the encoded crop label.
labels = df.loc[:, 'label']

In [None]:
# Start a fresh accuracy list for the experiment without phosphorous.
accuracies = []

# Use the same random_state (42) as the split of the with-phosphorous
# experiment. With a different seed the two experiments are scored on
# different test rows, so score differences would mix the effect of dropping
# the feature with the effect of a different train/test partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.20, random_state=42
)

In [None]:
# Fit a new min-max scaler on the training rows of the reduced feature set,
# then apply it to both the training and the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(X_train.shape)
print(X_train)

## Decision Tree

In [None]:
# Decision tree on the reduced feature set.
# random_state is fixed because the tree permutes features randomly when
# choosing splits; without a seed the scores change from run to run.
tree = DecisionTreeClassifier(random_state=42)

# Training
tree = tree.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred_tree = tree.predict(X_test)
accuracy_dt = accuracy_score(Y_test, y_pred_tree)
f1_dt = f1_score(Y_test, y_pred_tree, average='macro')

# Record the scores (accuracy as a percentage, F1 as a fraction).
accuracies.append(accuracy_dt*100)
f1.append(f1_dt)

print("Accuracy of Decision Tree", accuracy_dt*100)
print("f1_score of Decision Tree", f1_dt)

In [None]:
# Confusion matrix of the decision tree's test-set predictions.
cm = confusion_matrix(Y_test, y_pred_tree)

ax = sns.heatmap(cm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Decision Tree Classifier without phosphorous", fontsize=15)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## KNN Classifier

In [None]:
# KNN model with k=3 on the reduced feature set.
model = KNeighborsClassifier(n_neighbors=3)

# Training
model.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred = model.predict(X_test)
accuracy_knn = accuracy_score(Y_test, y_pred)
# The F1 score must be computed from the KNN predictions (y_pred); it was
# previously computed from the decision tree's predictions (y_pred_tree).
f1_knn = f1_score(Y_test, y_pred, average='macro')

# Record the scores (accuracy as a percentage, F1 as a fraction).
accuracies.append(accuracy_knn*100)
f1.append(f1_knn)

print("Accuracy of KNN", accuracy_knn*100)
print("f1_score of KNN", f1_knn)

In [None]:
# Confusion matrix of the KNN test-set predictions.
cpm = confusion_matrix(Y_test, y_pred)

ax = sns.heatmap(cpm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for KNN Classifier without phosphorous", fontsize=15)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Naive Bayes

In [None]:
# Gaussian Naive Bayes on the reduced feature set.
gb = GaussianNB()

# Training
gb = gb.fit(X_train, Y_train)

# Evaluate on the held-out test rows.
y_pred_gb = gb.predict(X_test)
accuracy_gb = accuracy_score(Y_test, y_pred_gb)
# The F1 score must be computed from the Naive Bayes predictions (y_pred_gb);
# it was previously computed from the decision tree's predictions. Macro
# averaging matches the F1 used for the other classifiers in the comparison.
f1_gb = f1_score(Y_test, y_pred_gb, average='macro')

# Record both scores, as the decision tree and KNN cells do.
accuracies.append(accuracy_gb*100)
f1.append(f1_gb)

print("Accuracy of Naive Bayes", accuracy_gb*100)
print("f1_score of Naive Bayes", f1_gb)

In [None]:
# Confusion matrix of the Naive Bayes test-set predictions. It must be built
# from y_pred_gb; y_pred holds the KNN predictions, which would reproduce the
# KNN matrix under a Naive Bayes title.
cpm = confusion_matrix(Y_test, y_pred_gb)

ax = sns.heatmap(cpm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Naive Bayes Classifier without phosphorous", fontsize=15)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Support Vector Classifier

In [None]:
# Sweep the SVC regularisation strength C on the reduced feature set and plot
# the test accuracy per value. The per-C accuracies go into their own list so
# the shared `accuracies` list filled by the other model cells is not wiped.
svc_accuracies = []

# Values of C to iterate over
C_values = [0.1, 1, 5, 10, 100, 1000, 10000]

for c in C_values:
    sv = SVC(kernel='rbf', gamma='scale', C=c)

    # Training
    sv.fit(X_train, Y_train)

    # Evaluate on the held-out test rows.
    y_pred_svc = sv.predict(X_test)
    accuracy_svc = accuracy_score(Y_test, y_pred_svc)
    f1_svc = f1_score(Y_test, y_pred_svc, average='macro')

    svc_accuracies.append(accuracy_svc * 100)

    print(f"Results for C = {c}:")
    print("Accuracy of Support Vector Classifier:", accuracy_svc * 100)
    print("F1 score of Support Vector Classifier:", f1_svc)
    print("--------")

# NOTE: after the loop, y_pred_svc, accuracy_svc and f1_svc hold the results
# for the LAST C value; the comparison table below reads accuracy_svc and f1_svc.

# One evenly spaced bar per C value. Placing the bars at log10(C) put C=1, 5
# and 10 at 0, 0.7 and 1, closer together than the bar width, so they overlapped.
positions = np.arange(len(C_values))
plt.bar(positions, svc_accuracies, color='blue')
plt.xticks(positions, [str(value) for value in C_values])
plt.xlabel('C value')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy of Support Vector Classifier for Different C values')
plt.ylim(95, 100)  # zoom in on the top of the range; bars below 95% are clipped
plt.show()


In [None]:
# Confusion matrix of the SVC test-set predictions. It must be built from
# y_pred_svc (left by the C sweep above); y_pred holds the KNN predictions,
# which would reproduce the KNN matrix under an SVC title.
cpm = confusion_matrix(Y_test, y_pred_svc)

ax = sns.heatmap(cpm, annot=True, cmap='Blues')

plt.title("Confusion Matrix for Support Vector Classifier Without Phosphorous", fontsize=15)
# Rows of the matrix are the true classes (heatmap y-axis) and columns are
# the predicted classes (heatmap x-axis).
plt.xlabel("Predicted Values", fontsize=15)
plt.ylabel("Actual Values", fontsize=15);

## Model Performance Comparisons After Dropping a Feature

In [None]:
# Summary table of the four models trained without the phosphorous column:
# accuracy as a percentage, F1 as a fraction, plus a legend tag used when
# both experiments are plotted side by side.
compare_without_P = pd.DataFrame({
    'Model': ["DT", "KNN", "NB", "SVC"],
    "Accuracy": [accuracy_dt*100, accuracy_knn*100, accuracy_gb*100, accuracy_svc*100],
    "F1_Score": [f1_dt, f1_knn, f1_gb, f1_svc],
    "Legend": ['without phosphorous'] * 4,
})

In [None]:
# Bar chart of accuracy per model (without phosphorous), value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare_without_P, x='Model', y='Accuracy')
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('Accuracy For Different Machine Learning Models');

In [None]:
# Bar chart of F1 score per model (without phosphorous), value printed on each bar.
plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
ax = sns.barplot(data=compare_without_P, x='Model', y='F1_Score')
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('F1_Score For Different Machine Learning Model');

In [None]:
# Stack the two comparison tables (with / without phosphorous) into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined rows a unique 0..n-1 index
# instead of repeating 0..3 twice.
combine = pd.concat([compare, compare_without_P], ignore_index=True)

In [None]:
# Grouped bar chart: accuracy of each model with vs. without phosphorous.
g = sns.catplot(
    data=combine, kind="bar",
    x="Model", y="Accuracy", hue="Legend",
    errorbar="sd", palette="dark", alpha=.6, height=6
)
ax = g.facet_axis(0, 0)

# Annotate every bar with its height (two decimals). Dedicated loop names are
# used so the notebook-level `labels` (the training target) and `c` are not
# overwritten by this plotting cell.
for container in ax.containers:
    bar_text = [f'{(bar.get_height()):.2f}' for bar in container]
    ax.bar_label(container, labels=bar_text, label_type='edge')

ax.set_title('Comparing Accuracy Before And After Dropping Phosphorous Column');

In [None]:
# Grouped bar chart: F1 score of each model with vs. without phosphorous.
g = sns.catplot(
    data=combine, kind="bar",
    x="Model", y="F1_Score", hue="Legend",
    errorbar="sd", palette="dark", alpha=.6, height=6
)
ax = g.facet_axis(0, 0)

# Annotate every bar with its height (three decimals). Dedicated loop names
# are used so the notebook-level `labels` (the training target) and `c` are
# not overwritten by this plotting cell.
for container in ax.containers:
    bar_text = [f'{(bar.get_height()):.3f}' for bar in container]
    ax.bar_label(container, labels=bar_text, label_type='edge')

ax.set_title('Comparing F1_Score Before and After Dropping Phosphorous');

## Predictions

In [None]:
# One raw sample to classify, in the column order of the reduced feature set
# (N, K, temperature, humidity, ph, rainfall).
# The sample is wrapped in a DataFrame carrying the training column names:
# the scaler was fitted on a DataFrame, and passing a bare list makes
# scikit-learn warn about missing feature names and hides column-order mistakes.
values = pd.DataFrame(
    [[27, 200, 21.452787, 90.745319, 6.110219, 116.703658]],
    columns=features.columns,
)

# Apply the scaling fitted on the training rows.
values = scaler.transform(values)
print(values)

In [None]:
# Classify the scaled sample held in `values` (produced by the preceding
# cell) with the Naive Bayes model. The scaled numbers are no longer pasted
# in by hand: a hard-coded copy silently goes stale whenever the scaler is
# refitted on a different split or feature set.
y_pred = gb.predict(values)

# Encoded class predicted for the sample.
y_pred[0]

In [None]:
# Map the predicted integer code back to its crop name.
# The names are taken from the fitted LabelEncoder (LE.classes_ is indexed by
# encoded value) instead of a hand-copied list, so they cannot drift out of
# sync with the encoding actually used for training.
crops = LE.classes_.tolist()
print(crops[y_pred[0]])