In [1]:
import warnings
warnings.filterwarnings('ignore')

import io
import pickle
import numpy as np
import pandas as pd 
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [3]:
# Plot styling: seaborn's "whitegrid" theme with the axes grid switched off.
sns.set_style("whitegrid", {'axes.grid' : False})

In [4]:
# Load the crop recommendation table from a CSV in the working directory
# and preview its first rows.
crop_data = pd.read_csv("./Crop_recommendation.csv")
crop_data.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [5]:
# Count how many rows each crop label has (class balance check).
crop_data['label'].value_counts()

mungbean       100
orange         100
mango          100
banana         100
pomegranate    100
grapes         100
watermelon     100
jute           100
kidneybeans    100
chickpea       100
muskmelon      100
maize          100
rice           100
lentil         100
cotton         100
pigeonpeas     100
mothbeans      100
papaya         100
coffee         100
blackgram      100
apple          100
coconut        100
Name: label, dtype: int64

In [6]:
def explore_dataset(df):
    """Print a short structural summary of ``df``.

    Emits, in order: the number of columns, the comma-separated column
    names, and the text produced by ``DataFrame.info``. Nothing is returned.
    """
    n_columns = df.shape[1]
    print(f"Number of Columns: {n_columns}\n")

    column_names = ', '.join(df.columns)
    print(f"Columns of Crop Dataset:\n{column_names}\n")

    # DataFrame.info writes to a stream rather than returning a string,
    # so capture it in memory to embed it in our own message.
    info_stream = io.StringIO()
    df.info(buf=info_stream)
    info_text = info_stream.getvalue()
    print(f"Information about dataset:\n{info_text}")

# Print the column count, column names and dtype/null summary of the loaded data.
explore_dataset(crop_data)

Number of Columns: 8

Columns of Crop Dataset:
N, P, K, temperature, humidity, ph, rainfall, label

Information about dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB



In [7]:
def remove_duplicates(df):
    """Report the number of fully duplicated rows in ``df`` and drop them.

    The frame is modified in place (later duplicates are removed, the index
    is left as-is) and a status message is printed. Nothing is returned.
    """
    n_duplicates = df.duplicated().sum()
    print("Number of Duplicates: ", n_duplicates)

    # Nothing to clean: report and leave the frame untouched.
    if n_duplicates < 1:
        print('No Duplicate values!')
        return

    df.drop_duplicates(inplace=True)
    print('Duplicate values removed!')
        
# Drop exact duplicate rows from crop_data in place (the function mutates its argument).
remove_duplicates(crop_data)

Number of Duplicates:  0
No Duplicate values!


In [8]:
def split_dataset(df, test_size=0.2, random_state=0, stratify=False):
    """Split ``df`` into standardized train/test features and their labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column; every other column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 0
        Seed forwarded to ``train_test_split`` so the split is reproducible.
    stratify : bool, default False
        When True, the split is stratified on the ``label`` column so each
        class keeps the same proportion in both parts. The default keeps the
        plain random split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, std)`` where the feature parts
        are the scaler's transformed output, the label parts are the split
        ``label`` column, and ``std`` is the fitted ``StandardScaler``.
    """
    X = df.drop(columns=['label'])
    Y = df['label']
    x_train, x_test, y_train, y_test = train_test_split(
        X,
        Y,
        test_size=test_size,
        random_state=random_state,
        stratify=Y if stratify else None,
    )

    # Fit the scaler on the training part only and reuse its statistics for
    # the test part, so no information from the test rows leaks into training.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    return x_train, x_test, y_train, y_test, std

# Split and standardize the data, then report how many samples landed in each part.
x_train, x_test, y_train, y_test, scalar = split_dataset(crop_data)
for split_name, split_part in (
    ("X_TRAIN", x_train),
    ("X_TEST", x_test),
    ("Y_TRAIN", y_train),
    ("Y_TEST", y_test),
):
    print(f"Data Samples In {split_name}: {split_part.shape[0]}")

Data Samples In X_TRAIN: 1760
Data Samples In X_TEST: 440
Data Samples In Y_TRAIN: 1760
Data Samples In Y_TEST: 440


In [9]:
def get_models():
    """Return fresh, unfitted ``(name, estimator)`` pairs for every candidate classifier.

    Each call builds new estimator instances with default settings; the SVM
    is created with ``probability=True``.
    """
    return [
        ('LogisticRegression', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('DecisionTree', DecisionTreeClassifier()),
        ('GaussianNB', GaussianNB()),
        ('SVM', SVC(probability=True)),
    ]

# Instantiate the candidate classifiers (still unfitted) and show the list.
models = get_models()
print(models)

[('LogisticRegression', LogisticRegression()), ('KNN', KNeighborsClassifier()), ('DecisionTree', DecisionTreeClassifier()), ('GaussianNB', GaussianNB()), ('SVM', SVC(probability=True))]


In [10]:
def fit_model(x_train, y_train,models):
    """Fit every estimator in ``models`` on the given training data.

    ``models`` is an iterable of ``(name, estimator)`` pairs. Each estimator
    is fitted in place and the same objects are returned, in the same order,
    as a new list of ``(name, estimator)`` tuples.
    """
    def _fit_pair(label, estimator):
        # Fit in place; keep the estimator object itself, not fit()'s return value.
        estimator.fit(x_train, y_train)
        return (label, estimator)

    return [_fit_pair(label, estimator) for label, estimator in models]

In [11]:
# Fit every candidate model on the standardized training split.
trained_models = fit_model(x_train,y_train,models)

In [12]:
def test_models(x_test, y_test, models):
    for name, model in models:
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{name} : {accuracy:0.4f}")

In [13]:
# Report held-out accuracy for each fitted model.
test_models(x_test, y_test, trained_models)

LogisticRegression : 0.9636
KNN : 0.9773
DecisionTree : 0.9886
GaussianNB : 0.9932
SVM : 0.9773


In [14]:
def get_prediction(data, models, scalar):
    """Return the majority-vote label of ``models`` for a single sample.

    Parameters
    ----------
    data : array-like of shape (1, n_features)
        One sample, ALREADY transformed the same way as the training data.
        This function does not scale it.
    models : iterable of (name, estimator)
        Fitted estimators; each must provide ``predict``.
    scalar : object
        Unused. Kept only so existing callers that pass the scaler keep working.

    Returns
    -------
    The label predicted by the largest number of models. When several labels
    tie, the smallest one in sorted order is returned.

    Raises
    ------
    ValueError
        If ``models`` is empty, since there is nothing to vote on.
    """
    # Only the first row's prediction is used: this helper votes on one sample.
    predictions = [model.predict(data)[0] for _, model in models]
    if not predictions:
        raise ValueError("get_prediction needs at least one model to vote")

    # Majority vote via np.unique instead of scipy.stats.mode: newer SciPy
    # releases reject non-numeric (string) input and no longer keep the
    # reduced axis, which breaks the old `stats.mode(...)[0][0]` idiom.
    # np.unique returns labels sorted, and argmax picks the first maximum,
    # so ties resolve to the smallest label.
    labels, counts = np.unique(predictions, return_counts=True)
    return labels[np.argmax(counts)].item()

In [15]:
# Inspect one test row (index 3); values are standardized because x_test came out of the scaler.
x_test[3]

array([ 0.03035926,  0.58727738,  0.5983019 , -1.65624513, -2.39295929,
        1.80265402, -0.17434891])

In [16]:
# Shape one already-standardized test row as a single-sample batch.
# -1 lets NumPy infer the feature count instead of hard-coding 7.
input_val = x_test[3].reshape(1, -1)
output = get_prediction(input_val, trained_models, scalar)
output

'chickpea'

In [17]:
# Persist the fitted models and the fitted scaler together as one pickled tuple.
with open('./model.pkl','wb') as f:
    pickle.dump((trained_models,scalar),f)

In [18]:
# Read the (models, scaler) tuple back from disk.
# NOTE: this rebinds the notebook-level names `models` and `scalar` to the loaded objects.
# pickle.load can execute arbitrary code, so only load files you created or trust.
with open('./model.pkl','rb') as f:
    models,scalar = pickle.load(f)