In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the dataset
import io

from google.colab import files

# Name the workbook is expected to have when it is already in the runtime.
DATA_FILENAME = 'Dry_Bean_Dataset.xlsx'

# files.upload() opens the browser picker and returns {filename: file bytes}.
uploaded = files.upload()

if uploaded:
    # Read the bytes that were just uploaded instead of a fixed path: when a
    # file with the same name already exists, Colab stores the new upload
    # under a suffixed name, so a fixed path would silently load the old copy.
    uploaded_bytes = next(iter(uploaded.values()))
    df = pd.read_excel(io.BytesIO(uploaded_bytes), engine='openpyxl')
else:
    # Nothing was uploaded (picker cancelled): use the copy already on disk.
    df = pd.read_excel(DATA_FILENAME, engine='openpyxl')


Saving Dry_Bean_Dataset.xlsx to Dry_Bean_Dataset.xlsx


In [None]:
# Exploratory analysis

# Show the first rows of the dataset
print("Primeiras linhas do dataset:")
print(df.head())

# General information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
print("\nInformações gerais do dataset:")
df.info()

# Descriptive statistics
print("\nEstatísticas descritivas:")
print(df.describe())

# Count missing values per column
print("\nValores ausentes em cada coluna:")
print(df.isnull().sum())

# Count fully duplicated rows
print("\nNúmero de duplicatas:")
print(df.duplicated().sum())

Primeiras linhas do dataset:
    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272750  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   0.947849   
3      0.498616       30724     195.467062  0.782681  0.976696   0.903936   
4      0.333680       30417     195.896503  0.773098  0.990893   0.984877   

   Compactness  ShapeFactor1  ShapeFactor2  ShapeFactor3  ShapeFactor4  Class  
0

Dividir os dados em características (X) e rótulo (y)


In [None]:
# Assumes the last column holds the label and every other column is a feature
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Hold out 30% of the rows for testing. Stratifying on the label keeps each
# class at the same proportion in the train and test splits, so the
# multi-class metrics are not skewed by an unlucky draw of a rare class.
# NOTE(review): this changes which rows land in each split, so previously
# recorded metric values will shift slightly when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


Padronizar os dados (média 0 e desvio padrão 1), ajustando o scaler apenas no conjunto de treino


In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


Importar os modelos necessários

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

 Treinar cada modelo

In [None]:
# Instantiate the five classifiers with the same hyper-parameters as before.
logreg = LogisticRegression(max_iter=1000)                       # 1. Logistic Regression
knn = KNeighborsClassifier()                                     # 2. K-Nearest Neighbors (KNN)
rf = RandomForestClassifier(n_estimators=100, random_state=42)   # 3. Random Forest
svm = SVC()                                                      # 4. Support Vector Machine (SVM)
mlp = MLPClassifier(max_iter=1000, random_state=42)              # 5. Neural network (MLP)

# Fit them on the training split, in the same order as listed above.
for classifier in (logreg, knn, rf, svm, mlp):
    classifier.fit(X_train, y_train)

# fit() returns the estimator itself, so ending the cell on the fitted MLP
# shows the same cell result as a trailing mlp.fit(...) call would.
mlp


Importar as funções para cálculo de métricas

In [None]:
from sklearn.metrics import accuracy_score, f1_score


Calcular as métricas para cada modelo

In [None]:
# Helper that computes the evaluation metrics for one model
def evaluate_model(model, X_train, y_train, X_test, y_test, average='weighted'):
    """Score a fitted classifier on both the training and the test split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X_train, y_train
        Training features and their true labels.
    X_test, y_test
        Held-out features and their true labels.
    average : str, default 'weighted'
        Averaging strategy forwarded to ``f1_score``. The default reproduces
        the previously hard-coded behaviour; pass another strategy accepted
        by ``f1_score`` (e.g. ``'macro'``) to change how per-class scores are
        combined.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    # Predict once per split and reuse the predictions for both metrics.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train, y_train_pred, average=average)
    test_f1 = f1_score(y_test, y_test_pred, average=average)

    return train_accuracy, test_accuracy, train_f1, test_f1

# Evaluate every fitted model on both splits
models = [logreg, knn, rf, svm, mlp]
model_names = ['Logistic Regression', 'KNN', 'Random Forest', 'SVM', 'MLP']

# Column labels, in the same order evaluate_model returns its four scores.
metric_labels = ('Train Accuracy', 'Test Accuracy', 'Train F1-Score', 'Test F1-Score')

results = {}
for name, model in zip(model_names, models):
    scores = evaluate_model(model, X_train, y_train, X_test, y_test)
    results[name] = dict(zip(metric_labels, scores))

# Show the results: one row per model, one column per metric
results_df = pd.DataFrame(results).T
print(results_df)


                     Train Accuracy  Test Accuracy  Train F1-Score  \
Logistic Regression        0.925475       0.922870        0.925558   
KNN                        0.941325       0.921645        0.941375   
Random Forest              1.000000       0.922625        1.000000   
SVM                        0.930198       0.932909        0.930314   
MLP                        0.939120       0.932664        0.939130   

                     Test F1-Score  
Logistic Regression       0.923256  
KNN                       0.921767  
Random Forest             0.922656  
SVM                       0.933080  
MLP                       0.932773  
