In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
def apply_algorithms(X, y):
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    
    # Impute missing values
    imputer = SimpleImputer(strategy='most_frequent')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    
    # Identify categorical columns
    categorical_cols = X.select_dtypes(include='object').columns
    encoder = LabelEncoder()
    
    # Encode categorical columns
    for col in categorical_cols:
        X[col] = encoder.fit_transform(X[col])
    
    # Convert X to a NumPy array after preprocessing
    X = X.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=50, max_depth=10),
        'SVM': SVC(kernel='sigmoid'),
        'Logistic Regression': LogisticRegression(max_iter=95)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        results[name] = {'Accuracy': accuracy, 'Precision': precision}

    return results

In [3]:
rice_data = pd.read_csv('rice_dataset.csv')
X_rice = rice_data.drop(' Class', axis=1)
y_rice = rice_data[' Class']
rice_results = apply_algorithms(X_rice, y_rice)

salary_data = pd.read_csv('adult1.csv')
X_salary = salary_data.drop(columns='income',axis=1)
y_salary = salary_data['income']
salary_results = apply_algorithms(X_salary, y_salary)

heart_data = pd.read_csv('echocardiogram.csv')
X_heart = heart_data.drop(columns='alive-at-1',axis=1)
y_heart = heart_data['alive-at-1']
heart_results = apply_algorithms(X_heart, y_heart)

for dataset, results in zip(['rice','Adult','echocardiogram'], [rice_results,salary_results,heart_results]):
    print(f"\nResults for {dataset} Dataset:")
    for model, metrics in results.items():
        print(f"{model}:")
        print(f"  Accuracy: {metrics['Accuracy']:.4f}")
        print(f"  Precision: {metrics['Precision']:.4f}")
    
    best_model = max(results, key=lambda x: results[x]['Accuracy'])
    best_accuracy = results[best_model]['Accuracy']
    print(f"\nBest model for {dataset} Dataset: {best_model} (Accuracy: {best_accuracy:.4f})")


Results for rice Dataset:
Naive Bayes:
  Accuracy: 0.9147
  Precision: 0.9148
Random Forest:
  Accuracy: 0.9226
  Precision: 0.9225
SVM:
  Accuracy: 0.1942
  Precision: 0.1789
Logistic Regression:
  Accuracy: 0.9291
  Precision: 0.9291

Best model for rice Dataset: Logistic Regression (Accuracy: 0.9291)

Results for Adult Dataset:
Naive Bayes:
  Accuracy: 0.8196
  Precision: 0.8072
Random Forest:
  Accuracy: 0.8592
  Precision: 0.8540
SVM:
  Accuracy: 0.6510
  Precision: 0.6473
Logistic Regression:
  Accuracy: 0.8012
  Precision: 0.7835

Best model for Adult Dataset: Random Forest (Accuracy: 0.8592)

Results for echocardiogram Dataset:
Naive Bayes:
  Accuracy: 0.5926
  Precision: 0.6557
Random Forest:
  Accuracy: 0.6296
  Precision: 0.6777
SVM:
  Accuracy: 0.7037
  Precision: 0.6905
Logistic Regression:
  Accuracy: 0.5926
  Precision: 0.6278

Best model for echocardiogram Dataset: SVM (Accuracy: 0.7037)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
