# Importing the required libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import seaborn as sns
import pickle

# Reading Dataset

In [2]:
#Reading the dataset
data = pd.read_csv('crop_recommendation.csv')

# Data Analysis

In [3]:
#Checking the Head of the Dataset
data .head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [4]:
# Used to display information about a dataset 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


# This dataset consists of **2200** rows in total.

**Each row has 8 columns representing Nitrogen, Phosphorous, Potassium, Temperature, Humidity, PH, Rainfall and Label**

**NPK(Nitrogen, Phosphorous and Potassium)** values represent the NPK values in the soil.
**Temperature**, **humidity** and **rainfall** are the average values of the sorroundings environment respectively.
**PH** is the PH value present in the soil.
**Label** is the value of the type of crop which is suitable to be grown in the above given variable conditions. **Label is the value we will be predicting**

In [5]:
#Displaying the columns present in the data
data.columns

Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

In [6]:
#Lets check if there is any missing values in the dataset
data.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [7]:
#Displaying different types of label present in dataset
data['label'].unique()

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [8]:
#Let's check Crops present in this Dataset
data['label'].value_counts()

rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: label, dtype: int64

# Seperating features,output labels, creating training and test data.

In [9]:
features = data[['N' ,'P','K','temperature','humidity','ph','rainfall']]
labels = data['label']

In [10]:
#spliting into training and test dataset(Train:Test = 4:1)
X_train,X_test,Y_train,Y_test = train_test_split(features,labels,test_size=0.2,random_state = 42)

In [11]:
#list of accuracy values typically used to store accuracy values or evaluation 
acc = []

#list of models used to store references or names of different models. 
models_list = []

# Using Logistic Regression 

In [18]:
# Scale the input data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Create and fit the Logistic Regression model
LogReg = LogisticRegression(random_state=42,max_iter=100).fit(X_train_scaled,Y_train)

In [21]:
# Make predictions on the test set
predicted_values = LogReg.predict(X_test_scaled)

# Compute accuracy
accuracy = metrics.accuracy_score(Y_test, predicted_values)
acc.append(accuracy)
models_list.append("Logistic Regression")

In [22]:
#printing the accuracy
print("Logistic Regression accuracy: ", accuracy)

Logistic Regression accuracy:  0.9636363636363636


In [23]:
#To print the classification report
print(metrics.classification_report(Y_test, predicted_values))

              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        23
      banana       1.00      1.00      1.00        21
   blackgram       0.90      0.95      0.93        20
    chickpea       1.00      1.00      1.00        26
     coconut       1.00      1.00      1.00        27
      coffee       0.94      1.00      0.97        17
      cotton       0.94      1.00      0.97        17
      grapes       1.00      1.00      1.00        14
        jute       0.83      0.87      0.85        23
 kidneybeans       0.95      0.95      0.95        20
      lentil       0.85      1.00      0.92        11
       maize       1.00      0.95      0.98        21
       mango       1.00      1.00      1.00        19
   mothbeans       1.00      0.92      0.96        24
    mungbean       1.00      1.00      1.00        19
   muskmelon       1.00      1.00      1.00        17
      orange       1.00      1.00      1.00        14
      papaya       0.96    