In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('soybean.csv')

In [3]:
df.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [4]:
df.shape

(683, 36)

In [5]:
# Selecionando previsores e classificador
previsores = df.iloc[:, 0:-1].values
classe = df.iloc[:, -1].values

In [6]:
# Convertendo valores categóricos em numéricos
le = LabelEncoder()

for i in range(0,35):
    previsores[:,i] = le.fit_transform(previsores[:,i])

previsores

array([[6, 2, 1, ..., 2, 1, 2],
       [2, 2, 1, ..., 2, 1, 2],
       [3, 2, 1, ..., 2, 1, 2],
       ...,
       [4, 1, 0, ..., 0, 0, 3],
       [1, 1, 0, ..., 0, 0, 3],
       [4, 1, 0, ..., 0, 0, 3]], dtype=object)

In [7]:
# Separação dos dados de treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores, classe, test_size=0.3, random_state=0)
x_teste

array([[4, 2, 1, ..., 2, 1, 2],
       [2, 1, 3, ..., 0, 0, 3],
       [5, 1, 0, ..., 0, 0, 3],
       ...,
       [7, 2, 1, ..., 2, 1, 2],
       [4, 1, 1, ..., 2, 1, 3],
       [5, 1, 3, ..., 1, 2, 0]], dtype=object)

In [8]:
# Treinando nosso modelo
modelo = GaussianNB()
modelo.fit(x_treino, y_treino)
modelo

In [9]:
# Previsões usando dados de teste
previsoes = modelo.predict(x_teste)

In [10]:
previsoes

array(['alternarialeaf-spot', 'phytophthora-rot', 'herbicide-injury',
       'purple-seed-stain', 'cyst-nematode', 'bacterial-blight',
       'brown-stem-rot', 'bacterial-pustule', 'brown-spot',
       'alternarialeaf-spot', 'charcoal-rot', 'bacterial-blight',
       'anthracnose', 'anthracnose', 'alternarialeaf-spot', 'brown-spot',
       'alternarialeaf-spot', 'alternarialeaf-spot',
       'rhizoctonia-root-rot', 'phytophthora-rot', 'anthracnose',
       'brown-stem-rot', 'phyllosticta-leaf-spot',
       'phyllosticta-leaf-spot', 'diaporthe-stem-canker', 'brown-spot',
       'anthracnose', 'diaporthe-stem-canker', 'bacterial-pustule',
       'phytophthora-rot', 'frog-eye-leaf-spot', 'alternarialeaf-spot',
       'phyllosticta-leaf-spot', 'frog-eye-leaf-spot',
       'frog-eye-leaf-spot', 'bacterial-pustule', 'brown-stem-rot',
       'charcoal-rot', 'brown-spot', 'alternarialeaf-spot',
       'brown-stem-rot', 'alternarialeaf-spot', 'phytophthora-rot',
       'anthracnose', 'bacterial

In [11]:
# Matriz de confusão e validação do modelo
cm = confusion_matrix(y_teste, previsoes)
cm

array([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
         0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  7,  0,  0,  0, 10,  0,  0,  0,  0,  0,  0,  1,  0,  5,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0

In [12]:
accuracy_score(y_teste, previsoes)

0.8585365853658536