In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes_dataset.csv')

# Summarise the dataset: size, completeness, outlier count and column dtypes
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()

# Count outliers with the 3-sigma rule on standardised values. Comparing the
# raw values against 3 would flag every cell whose magnitude exceeds 3,
# regardless of the scale of its column. Only numeric feature columns are
# inspected; 'Outcome' is skipped when present because it is the label,
# not a measurement.
numeric_features = diabetes_data.select_dtypes(include=np.number).drop(
    columns='Outcome', errors='ignore'
)
z_scores = (numeric_features - numeric_features.mean()) / numeric_features.std()
# NaN z-scores (missing cells, zero-variance columns) compare False against
# the threshold, so they are never counted as outliers.
outliers = (z_scores.abs() > 3).sum().sum()
future_correlation = None  # You can calculate this based on the dataset if needed
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Future Correlation:", future_correlation)
print("Data Type:", data_type)

# Rows without a label cannot be used for supervised learning. Dropping them
# (instead of mean-filling the target) keeps 'Outcome' a valid class label.
labelled_data = diabetes_data.dropna(subset=['Outcome'])

# Split the dataset into features and target
X = labelled_data.drop('Outcome', axis=1)
y = labelled_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handling missing values: impute AFTER the split, using statistics from the
# training portion only, so no information from the test set leaks into the
# data the models are fitted on. numeric_only=True keeps this working if the
# frame ever contains non-numeric columns.
train_feature_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training data and score it on held-out data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_fit: Training features.
        y_fit: Training labels.
        X_eval: Held-out features to predict on.
        y_eval: True labels for ``X_eval``.

    Returns:
        A ``(predictions, accuracy)`` tuple, where ``accuracy`` is the value
        returned by ``accuracy_score(y_eval, predictions)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, accuracy_score(y_eval, predictions)


# Every classifier goes through the same fit -> predict -> score sequence, so
# that sequence lives in one helper instead of being repeated per model.

# Train and evaluate the decision tree classifier
# (seeded so the reported accuracy is reproducible between runs)
decision_tree = DecisionTreeClassifier(random_state=42)
y_pred_dt, accuracy_dt = _fit_and_score(decision_tree, X_train, y_train, X_test, y_test)

# Train and evaluate the random forest classifier
# (seeded for the same reason: bootstrap sampling is otherwise random)
random_forest = RandomForestClassifier(random_state=42)
y_pred_rf, accuracy_rf = _fit_and_score(random_forest, X_train, y_train, X_test, y_test)

# Train and evaluate the logistic regression classifier
# (the default cap of 100 iterations is low for unscaled features, so give
# the solver more room to converge)
logistic_regression = LogisticRegression(max_iter=1000)
y_pred_lr, accuracy_lr = _fit_and_score(logistic_regression, X_train, y_train, X_test, y_test)

# Train and evaluate the support vector machine classifier
svm = SVC()
y_pred_svm, accuracy_svm = _fit_and_score(svm, X_train, y_train, X_test, y_test)

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
y_pred_nb, accuracy_nb = _fit_and_score(naive_bayes, X_train, y_train, X_test, y_test)

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier()
y_pred_knn, accuracy_knn = _fit_and_score(knn, X_train, y_train, X_test, y_test)

# Collect each model's test accuracy under its display name. The order here
# is the order in which the scores are printed, and it decides which model
# wins when two accuracies tie (the earliest entry is kept by max()).
model_labels = (
    'Decision Tree',
    'Random Forest',
    'Logistic Regression',
    'Support Vector Machine',
    'Naive Bayes',
    'K-Nearest Neighbors',
)
model_scores = (
    accuracy_dt,
    accuracy_rf,
    accuracy_lr,
    accuracy_svm,
    accuracy_nb,
    accuracy_knn,
)
accuracies = dict(zip(model_labels, model_scores))

# The winner is the label whose stored accuracy is the largest
best_algorithm = max(accuracies, key=accuracies.__getitem__)

# Report every score, then the winner
print("\nAccuracy Scores:")
for label in accuracies:
    print(label, ":", accuracies[label])

print("\nBest Algorithm:", best_algorithm)

# Show the first rows of the diabetes dataset for reference
print("\nDiabetes Dataset:")
dataset_preview = diabetes_data.head()
print(dataset_preview)


FileNotFoundError: [Errno 2] No such file or directory: 'Dataset.csv'