In [1]:
#importing the necessary modules for data preprocessing

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer  #used to fill missing data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder  #both used for encoding
from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split  #spliting data for train and test
from sklearn.preprocessing import StandardScaler  #used for feature scaling


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#imported dataset

dataset = pd.read_csv('./drive/MyDrive/AI enabled Leaf Detection /leaf_features.csv')
x = dataset.iloc[:, :-1].values  #created the matrix of features
y = dataset.iloc[:, -1].values   #created dependent variable vector


In [None]:
#Filled the missing data with mean values

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')   
imputer.fit(x[:, 1:])                                                                                           
x[:, 1:] = imputer.transform(x[:, 1:])


In [None]:
#encoded the dependent variable i.e. first column

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))   


In [None]:
print(x)

[[1.0 0.0 0.0 ... 1021.0 0.690386652 -0.41530749]
 [1.0 0.0 0.0 ... 624.0 0.721623922 -0.721623922]
 [1.0 0.0 0.0 ... 1459.0 0.581762223 -0.227531757]
 ...
 [0.0 0.0 0.0 ... 1100.0 0.479663901 -0.197560266]
 [0.0 0.0 0.0 ... 1550.0 0.463163762 -0.106349798]
 [0.0 0.0 0.0 ... 1117.0 0.543434381 -0.200151951]]


In [None]:
le = LabelEncoder()    
y = le.fit_transform(y)       #labelled the dependent variable

In [None]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 

In [None]:
#splitted data into training and testing set

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)


In [None]:
#feature scaling

sc = StandardScaler()        #standadize values
x_train[:, 1:] = sc.fit_transform(x_train[:, 1:])  #we dont apply the feature scaling on dummy variable
x_test[:, 1:] = sc.transform(x_test[:, 1:])


In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [None]:
x_test

array([[0.0, 1.709482406194161, -0.46175016319781886, ...,
        -0.003154159816022185, 0.7116430955482006, -0.028117131235538627],
       [0.0, 1.709482406194161, -0.46175016319781886, ...,
        -1.6573709745505523, 0.003511280651565153, -0.9549516150212783],
       [1.0, -0.5849723848438491, -0.46175016319781886, ...,
        -0.8185229639819455, 0.8655966424082526, -1.3183963393155111],
       ...,
       [1.0, -0.5849723848438491, -0.46175016319781886, ...,
        0.9317451496726542, -0.1407353296008395, 0.6883347273048789],
       [0.0, -0.5849723848438491, -0.46175016319781886, ...,
        1.0534101283047423, 0.9520207734820672, 0.3894063888393145],
       [0.0, -0.5849723848438491, -0.46175016319781886, ...,
        0.9573588293846728, 1.3282700222740058, 0.015113479771049468]],
      dtype=object)

In [None]:
y_pred = classifier.predict(x_test)
y_pred

array([2, 2, 0, 0, 2, 0, 2, 0, 1, 0, 4, 2, 0, 0, 2, 3, 2, 1, 1, 0, 2, 0,
       2, 2, 0, 3, 0, 2, 2, 3, 2, 1, 1, 3, 2, 4, 3, 0, 3, 4, 2, 2, 1, 4,
       4, 3, 1, 3, 1, 0, 4, 1, 0, 1, 4, 0, 3, 2, 0, 1, 3, 0, 3, 3, 1, 1,
       1, 2, 2, 1, 2, 3, 2, 0, 0, 4, 2, 2, 1, 2, 0, 0, 1, 3, 2, 2, 3, 2,
       4, 0, 4, 2, 4, 0, 3, 0, 4, 0, 1, 0, 3, 3])

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[26  0  0  0  0]
 [ 0 18  0  0  0]
 [ 0  0 28  0  0]
 [ 0  0  0 18  0]
 [ 0  0  0  0 12]]


1.0

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 10, random_state = 42)

model.fit(x_train, y_train)

pred_test = model.predict(x_test)
print(pred_test)

[2 2 0 0 2 0 2 0 1 0 4 2 0 0 2 3 2 1 1 0 2 0 2 2 0 3 0 2 2 3 2 1 1 3 2 4 3
 0 3 4 2 2 1 4 4 3 1 3 1 0 4 1 0 1 4 0 3 2 0 1 3 0 3 3 1 1 1 2 2 1 2 3 2 0
 0 4 2 2 1 2 0 0 1 3 2 2 3 2 4 0 4 2 4 0 3 0 4 0 1 0 3 3]


In [None]:
from sklearn import metrics
print('accuracy = ', metrics.accuracy_score(y_test, pred_test))

accuracy =  1.0
