# 1. Loading Libraries and importing dataset 💽

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the crop recommendation dataset and preview its first rows.
data_path = "../input/smart-agricultural-production-optimizing-engine/Crop_recommendation.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# 2. EDA - Exploratory Data Analysis 🔎

In [None]:
# Column names, non-null counts and dtypes.
df.info()


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Give the columns descriptive names. The assignment is positional, so the
# list must stay in the same order as the columns of the loaded file.
df.columns = [
    'Nitrogen',
    'Phosphorus',
    'Potassium',
    'Temp',
    'Humidity',
    'pH',
    'Rainfall',
    'Label',
]

In [None]:
# Pairwise relationships between the features, coloured by crop label.
sns.pairplot(data=df, hue='Label')
plt.show()

In [None]:
# Distribution of Nitrogen with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Nitrogen'].mean()
median = df['Nitrogen'].median()

sns.histplot(data=df, x='Nitrogen', bins=25, kde=True, color='#251188')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()


In [None]:
# Per-crop box plot of Nitrogen; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='Nitrogen', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("Nitrogen", size=20)
plt.show()

In [None]:
# Distribution of Phosphorus with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Phosphorus'].mean()
median = df['Phosphorus'].median()

sns.histplot(data=df, x='Phosphorus', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='orange')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()

In [None]:
# Per-crop box plot of Phosphorus; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='Phosphorus', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("Phosphorus", size=20)
plt.show()

In [None]:
# Distribution of Potassium with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Potassium'].mean()
median = df['Potassium'].median()

sns.histplot(data=df, x='Potassium', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='#f1b27b')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()




In [None]:
# Per-crop box plot of Potassium; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='Potassium', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("Potassium", size=20)
plt.show()

In [None]:
# Distribution of Temp with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Temp'].mean()
median = df['Temp'].median()

sns.histplot(data=df, x='Temp', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='#d2c6e2')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()


In [None]:
# Per-crop box plot of temperature.
sns.catplot(
    kind='box',
    data=df,
    x='Label',
    y='Temp',
    height=5,
    aspect=2.0,
)
plt.xticks(rotation='vertical')
plt.title("Temperature", size=20)
plt.show()

In [None]:
# Distribution of Humidity with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Humidity'].mean()
median = df['Humidity'].median()

sns.histplot(data=df, x='Humidity', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='#3d85c6')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()


In [None]:
# Per-crop box plot of Humidity; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='Humidity', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("Humidity", size=20)
plt.show()

In [None]:
# Distribution of pH with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['pH'].mean()
median = df['pH'].median()

sns.histplot(data=df, x='pH', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='#003366')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()


In [None]:
# Per-crop box plot of pH; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='pH', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("pH", size=20)
plt.show()

In [None]:
# Distribution of Rainfall with its mean and median marked.
# Series.median() averages the two middle values when the row count is even,
# which a sorted(...)[n // 2] lookup does not.
mean = df['Rainfall'].mean()
median = df['Rainfall'].median()

sns.histplot(data=df, x='Rainfall', bins=25, kde=True, edgecolor="black", color='#251188', facecolor='#00bfb8')

plt.axvline(mean, color='r', linestyle='--', label='mean')
plt.axvline(median, color='g', linestyle='--', label='median')

plt.legend()

plt.show()

In [None]:
# Per-crop box plot of Rainfall; the title names the plotted feature.
sns.catplot(data=df, x='Label', y='Rainfall', kind='box', height=5, aspect=2/1)
plt.xticks(rotation='vertical')
plt.title("Rainfall", size=20)
plt.show()

In [None]:
# Mean of every feature per crop label, with 'Label' kept as a regular column.
group = df.groupby('Label', as_index=False).mean()
group

Now we can look at each crop individually and the insights its average feature values provide.


In [None]:
# For every averaged feature, list the five crops with the highest mean value.
print('--------------------------------')
for i in group.columns[1:]:
    print(f'Top 5 Most {i} requiring crops:')
    print('--------------------------------')
    highest = group.sort_values(by=i, ascending=False).head(5)
    for j, k in zip(highest['Label'], highest[i]):
        print(f'{j} --> {k}')
    print('-------------------------------')

In [None]:
# For every averaged feature, list the five crops with the lowest mean value.
print('--------------------------------')
for i in group.columns[1:]:
    print(f'Top 5 Least {i} requiring crops:')
    print('--------------------------------')
    lowest = group.sort_values(by=i).head(5)
    for j, k in zip(lowest['Label'], lowest[i]):
        print(f'{j} --> {k}')
    print('-------------------------------')

In [None]:
# One bar chart per averaged feature, stacked vertically and sharing a title.
features = group.columns[1:]
# squeeze=False + ravel keeps `ax` a 1-D array whatever the feature count is.
fig, ax = plt.subplots(len(features), 1, figsize=(25, 25), squeeze=False)
ax = ax.ravel()
for index, i in enumerate(features):
    sns.barplot(data=group, x='Label', y=i, ax=ax[index])
    # Clear the label on this subplot's own axes; plt.xlabel would only ever
    # touch the current (last created) axes.
    ax[index].set_xlabel("")
# The figure title is set once, after all subplots are drawn.
fig.suptitle("Comparision of Mean Attributes of various classes", size=25)

In [None]:
def detect_outlier(x):
    """Report whether the extremes of ``x`` fall outside the 1.5 * IQR fences.

    Prints the lower and upper fences (Q1 - 1.5 * IQR and Q3 + 1.5 * IQR),
    the observed minimum and maximum, and then one verdict line for each end
    of the range.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to inspect; any object exposing ``quantile``, ``min``,
        ``max`` and ``count`` works.

    Returns
    -------
    None
        The findings are printed, not returned.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    IQR = q3 - q1
    lower_limit = q1 - (1.5 * IQR)
    upper_limit = q3 + (1.5 * IQR)
    minimum = x.min()
    maximum = x.max()
    print(f"Lower limit: {lower_limit} Upper limit: {upper_limit}")
    print(f"Minimum value: {minimum}   Maximum Value: {maximum}")

    # Without any non-missing value the fences are NaN and no verdict applies.
    if x.count() == 0:
        return

    # A value lying exactly on a fence is not an outlier, so equality passes.
    if minimum < lower_limit:
        print("Lower limit failed - Need to remove minimum value")
    else:
        print("Lower limit passed - No need to remove outlier")

    if maximum > upper_limit:
        print("Upper limit failed - Need to remove maximum value")
    else:
        print("Upper limit passed - No need to remove outlier")
                

In [None]:
# Run the IQR outlier report on the Nitrogen values of each crop, in order of
# the label's first appearance in the frame.
for i in df['Label'].unique():
    print(f"Label: {i}")
    rows_of_label = df['Label'] == i
    detect_outlier(df.loc[rows_of_label, 'Nitrogen'])

    print('---------------------------------------------')

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(12, 6))
# Restrict to numeric columns: calling corr() on a frame that still holds the
# string 'Label' column raises on recent pandas versions.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Correlation heatmap that also includes the crop label as integer codes.
plt.figure(figsize=(12, 6))

# Integer-code only the non-numeric columns. Factorizing a continuous column
# would replace each measurement with the order in which it first appears,
# and correlations computed on those codes carry no meaning.
encoded = df.copy()
for column in encoded.select_dtypes(exclude='number').columns:
    encoded[column] = pd.factorize(encoded[column])[0]

corr = encoded.corr()
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                 linewidths=.2, cmap="YlGnBu", annot=True)

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

# Project the feature columns onto two principal components and plot them,
# coloured by crop label.
pca = PCA(n_components=2)
feature_columns = df.drop(columns=['Label'])
df_pca = pd.DataFrame(pca.fit_transform(feature_columns))
fig = px.scatter(
    x=df_pca[0],
    y=df_pca[1],
    color=df['Label'],
    title="Decomposed using PCA",
)
fig.show()

# 3. Data Preprocessing 🥼

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the crop names in 'Label' with integer class codes; the fitted
# encoder is kept so the codes can be mapped back to names.
encoder = LabelEncoder().fit(df['Label'])
df['Label'] = encoder.transform(df['Label'])
df.head()

In [None]:
# Separate the feature matrix from the target column.
y = df['Label']
X = df.drop(columns=['Label'])

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 hold-out split with a fixed seed.
split_options = dict(test_size=0.3, shuffle=True, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then standardise them and
# restore the column names that the transform drops.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_train.head()

In [None]:
# Standardise the test features with the scaler fitted on the training data.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_test.head()

# 4. Model Building 🎯

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import warnings 
warnings.filterwarnings('ignore')

## Logistic Regression

In [None]:

# Randomized hyper-parameter search for logistic regression, evaluated on the
# hold-out set. The grid is defined here so the cell does not depend on a
# `param_grid` created by a later cell.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'max_iter': [50, 75, 100, 200, 300, 400, 500, 700]}
# random_state makes the sampled candidates reproducible between runs.
log = RandomizedSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=5, random_state=42)
log.fit(X_train, y_train)
y_pred_log = log.predict(X_test)
# Reuse the predictions instead of running predict a second time.
confusion_log = confusion_matrix(y_test, y_pred_log)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_log, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_log))

In [None]:
# Randomized hyper-parameter search for logistic regression, evaluated on the
# hold-out set.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'max_iter': [50, 75, 100, 200, 300, 400, 500, 700]}
# random_state makes the sampled candidates reproducible between runs.
log = RandomizedSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=5, random_state=42)
log.fit(X_train, y_train)
y_pred_log = log.predict(X_test)
# Reuse the predictions instead of running predict a second time.
confusion_log = confusion_matrix(y_test, y_pred_log)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_log, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_log))

## KNeighbors Classifier

In [None]:
# Mean 5-fold cross-validation score for every candidate number of neighbours.
# One k_values sequence drives the search, the x data and the tick marks, so
# the evaluated range and the axis cannot drift apart.
k_values = list(range(1, 21))
knn_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    knn_scores.append(scores.mean())

x_ticks = k_values
x_labels = x_ticks

plt.plot(k_values, knn_scores)
plt.xticks(ticks=x_ticks, labels=x_labels)
plt.xlabel("n_neighbors")
plt.ylabel("Mean CV score")
plt.grid()

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Fit a 1-nearest-neighbour classifier and evaluate it on the hold-out set.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

confusion_knn = confusion_matrix(y_test, knn.predict(X_test))
plt.figure(figsize=(8, 8))
sns.heatmap(data=confusion_knn, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")

print(classification_report(y_test, knn.predict(X_test)))

## Support Vector Classifier

In [None]:
# Randomized hyper-parameter search for the support vector classifier,
# evaluated on the hold-out set. The SVC grid is defined here; otherwise the
# search would silently reuse whatever `param_grid` an earlier cell left behind.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# random_state makes the sampled candidates reproducible between runs.
rcv = RandomizedSearchCV(SVC(), param_grid, cv=5, random_state=42)
rcv.fit(X_train, y_train)
y_pred_svc = rcv.predict(X_test)
# Reuse the predictions instead of running predict a second time.
confusion_svc = confusion_matrix(y_test, y_pred_svc)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_svc, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_svc))

In [None]:
# Randomized hyper-parameter search for the support vector classifier,
# evaluated on the hold-out set.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# random_state makes the sampled candidates reproducible between runs.
rcv = RandomizedSearchCV(SVC(), param_grid, cv=5, random_state=42)
rcv.fit(X_train, y_train)
y_pred_svc = rcv.predict(X_test)
# Reuse the predictions instead of running predict a second time.
confusion_svc = confusion_matrix(y_test, y_pred_svc)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_svc, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_svc))

## Random Forest Classifier 

In [None]:
# Hyper-parameter search over the forest size, evaluated on the hold-out set.
param_grid = {
    'n_estimators': [50, 75, 100, 150, 200, 300],
}
# random_state fixes the order in which candidates are tried, so the selected
# model is the same on every run even when several candidates tie.
rcv = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, random_state=42)
rcv.fit(X_train, y_train)
y_pred_rcv = rcv.predict(X_test)
# Reuse the predictions instead of running predict a second time.
confusion_rcv = confusion_matrix(y_test, y_pred_rcv)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_rcv, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_rcv))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Randomized hyper-parameter search for gradient boosting, evaluated on the
# hold-out set.
param_grid = {
    'learning_rate': [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1],
    'n_estimators': [50, 75, 100, 150, 200, 300],
}

# random_state makes the sampled candidates reproducible between runs.
gbc = RandomizedSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=5, random_state=42)
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
confusion_gbc = confusion_matrix(y_test, y_pred_gbc)
plt.figure(figsize=(8, 8))
sns.heatmap(confusion_gbc, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")
print(classification_report(y_test, y_pred_gbc))

## LightGBM Classifier

In [None]:
import lightgbm as lgb

# Train a LightGBM classifier with default settings and evaluate it on the
# hold-out set.
model = lgb.LGBMClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure(figsize=(8, 8))
sns.heatmap(data=confusion, annot=True)
plt.xlabel("Predicted")
plt.ylabel("Actual")

print(classification_report(y_true=y_test, y_pred=y_pred))