In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load the dataset
dataset = pd.read_csv("diabetes_dataset.csv")

# Step 2: Perform exploratory data analysis (EDA) and preprocessing
missing_values = dataset.isnull().sum()
dataset[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = dataset[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.nan)
dataset["Glucose"].fillna(dataset["Glucose"].mean(), inplace=True)
dataset["BloodPressure"].fillna(dataset["BloodPressure"].mean(), inplace=True)
dataset["SkinThickness"].fillna(dataset["SkinThickness"].median(), inplace=True)
dataset["Insulin"].fillna(dataset["Insulin"].median(), inplace=True)
dataset["BMI"].fillna(dataset["BMI"].median(), inplace=True)

X = dataset.drop("Outcome", axis=1)
y = dataset["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Define and train machine learning models
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Step 4: Evaluate the algorithms
logreg_preds = logreg.predict(X_test)
dtree_preds = dtree.predict(X_test)
rf_preds = rf.predict(X_test)

logreg_accuracy = accuracy_score(y_test, logreg_preds)
dtree_accuracy = accuracy_score(y_test, dtree_preds)
rf_accuracy = accuracy_score(y_test, rf_preds)

# Step 5: Select the best algorithm
best_accuracy = max(logreg_accuracy, dtree_accuracy, rf_accuracy)
best_algorithm = ""

if best_accuracy == logreg_accuracy:
    best_algorithm = "Logistic Regression"
elif best_accuracy == dtree_accuracy:
    best_algorithm = "Decision Tree"
else:
    best_algorithm = "Random Forest"

print("Best Algorithm:", best_algorithm)


Best Algorithm: Logistic Regression


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load the dataset
dataset = pd.read_csv("diabetes_dataset.csv")

# Step 2: Data analysis
dataset_id = "XYZ"  # Assign an appropriate dataset ID
num_features = dataset.shape[1] - 1
num_instances = dataset.shape[0]
missing_values = dataset.isnull().sum().sum()  # No missing values in this dataset
outliers = None  # No information provided on outliers
future_correlation = None  # No information provided on future correlation
data_type = "Numerical"  # All features are numerical

# Step 3: Preprocess the data
X = dataset.drop("Outcome", axis=1)
y = dataset["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Train and evaluate algorithms
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_preds)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree_preds = dtree.predict(X_test)
dtree_accuracy = accuracy_score(y_test, dtree_preds)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)

# Step 5: Determine the best algorithm
best_accuracy = max(logreg_accuracy, dtree_accuracy, rf_accuracy)
if best_accuracy == logreg_accuracy:
    best_algorithm = "Logistic Regression"
elif best_accuracy == dtree_accuracy:
    best_algorithm = "Decision Tree"
else:
    best_algorithm = "Random Forest"

# Step 6: Predict the best algorithm for future datasets
future_data = {
    'Pregnancies': [6, 1, 8, 1, 0],
    'Glucose': [148, 85, 183, 89, 137],
    'BloodPressure': [72, 66, 64, 66, 40],
    'SkinThickness': [35, 29, 0, 23, 35],
    'Insulin': [0, 0, 0, 94, 168],
    'BMI': [33.6, 26.6, 23.3, 28.1, 43.1],
    'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288],
    'Age': [50, 31, 32, 21, 33]
}

future_dataset = pd.DataFrame(future_data)
future_dataset = future_dataset.drop("Unnamed: 0", axis=1)  # Drop the "Unnamed: 0" column

# Preprocess the future dataset using the same steps as above (Step 3)
future_dataset_scaled = scaler.transform(future_dataset)

# Make predictions using all three algorithms
logreg_predictions = logreg.predict(future_dataset_scaled)
dtree_predictions = dtree.predict(future_dataset_scaled)
rf_predictions = rf.predict(future_dataset_scaled)

# Determine the most frequent prediction
predictions = [logreg_predictions, dtree_predictions, rf_predictions]
best_prediction = max(set(predictions), key=predictions.count)
if best_prediction == logreg_predictions:
    best_algorithm_future = "Logistic Regression"
elif best_prediction == dtree_predictions:
    best_algorithm_future = "Decision Tree"
else:
    best_algorithm_future = "Random Forest"

# Print the analysis results
print("Dataset ID:", dataset_id)
print("No of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Future Correlation:", future_correlation)
print("Data Type:", data_type)
print("Best Algorithm:", best_algorithm)

# Print the prediction for future datasets
print("Best Algorithm (Future):", best_algorithm_future)


KeyError: "['Unnamed: 0'] not found in axis"

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load the dataset
dataset = pd.read_csv("diabetes_dataset.csv")

# Step 2: Data analysis
dataset_id = "XYZ"  # Assign an appropriate dataset ID
num_features = dataset.shape[1] - 1
num_instances = dataset.shape[0]
missing_values = dataset.isnull().sum().sum()  # No missing values in this dataset
outliers = None  # No information provided on outliers
future_correlation = None  # No information provided on future correlation
data_type = "Numerical"  # All features are numerical

# Step 3: Preprocess the data
X = dataset.drop("Outcome", axis=1)
y = dataset["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Train and evaluate algorithms
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_preds)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree_preds = dtree.predict(X_test)
dtree_accuracy = accuracy_score(y_test, dtree_preds)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)

# Step 5: Determine the best algorithm
best_accuracy = max(logreg_accuracy, dtree_accuracy, rf_accuracy)
if best_accuracy == logreg_accuracy:
    best_algorithm = "Logistic Regression"
elif best_accuracy == dtree_accuracy:
    best_algorithm = "Decision Tree"
else:
    best_algorithm = "Random Forest"

# Step 6: Predict the best algorithm for future datasets
future_data = {
    'Pregnancies': [6, 1, 8, 1, 0],
    'Glucose': [148, 85, 183, 89, 137],
    'BloodPressure': [72, 66, 64, 66, 40],
    'SkinThickness': [35, 29, 0, 23, 35],
    'Insulin': [0, 0, 0, 94, 168],
    'BMI': [33.6, 26.6, 23.3, 28.1, 43.1],
    'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288],
    'Age': [50, 31, 32, 21, 33]
}

future_dataset = pd.DataFrame(future_data)

# Preprocess the future dataset using the same steps as above (Step 3)
future_dataset_scaled = scaler.transform(future_dataset)

# Make predictions using all three algorithms
logreg_predictions = logreg.predict(future_dataset_scaled)
dtree_predictions = dtree.predict(future_dataset_scaled)
rf_predictions = rf.predict(future_dataset_scaled)

# Determine the most frequent prediction
predictions = [logreg_predictions, dtree_predictions, rf_predictions]
best_prediction = max(set(predictions), key=predictions.count)
if best_prediction == logreg_predictions:
    best_algorithm_future = "Logistic Regression"
elif best_prediction == dtree_predictions:
    best_algorithm_future = "Decision Tree"
else:
    best_algorithm_future = "Random Forest"

# Print the analysis results
print("Dataset ID:", dataset_id)
print("No of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Future Correlation:", future_correlation)
print("Data Type:", data_type)
print("Best Algorithm:", best_algorithm)

# Print the prediction for future datasets
print("Best Algorithm (Future):", best_algorithm_future)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Unnamed: 0


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load the dataset
dataset = pd.read_csv("diabetes_dataset.csv")

# Step 2: Data analysis
dataset_id = "XYZ"  # Assign an appropriate dataset ID
num_features = dataset.shape[1] - 1
num_instances = dataset.shape[0]
missing_values = dataset.isnull().sum().sum()  # No missing values in this dataset
outliers = None  # No information provided on outliers
future_correlation = None  # No information provided on future correlation
data_type = "Numerical"  # All features are numerical

# Step 3: Preprocess the data
X = dataset.drop("Outcome", axis=1)
y = dataset["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Train and evaluate algorithms
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_preds)

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree_preds = dtree.predict(X_test)
dtree_accuracy = accuracy_score(y_test, dtree_preds)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)

# Step 5: Determine the best algorithm
best_accuracy = max(logreg_accuracy, dtree_accuracy, rf_accuracy)
if best_accuracy == logreg_accuracy:
    best_algorithm = "Logistic Regression"
elif best_accuracy == dtree_accuracy:
    best_algorithm = "Decision Tree"
else:
    best_algorithm = "Random Forest"

# Step 6: Predict the best algorithm for future datasets
future_data = {
    'Pregnancies': [6, 1, 8, 1, 0],
    'Glucose': [148, 85, 183, 89, 137],
    'BloodPressure': [72, 66, 64, 66, 40],
    'SkinThickness': [35, 29, 0, 23, 35],
    'Insulin': [0, 0, 0, 94, 168],
    'BMI': [33.6, 26.6, 23.3, 28.1, 43.1],
    'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288],
    'Age': [50, 31, 32, 21, 33]
}

future_dataset = pd.DataFrame(future_data, columns=X.columns)  # Make sure the columns match X

# Preprocess the future dataset using the same scaler
future_dataset_scaled = scaler.transform(future_dataset)

# Make predictions using all three algorithms
logreg_predictions = logreg.predict(future_dataset_scaled)
dtree_predictions = dtree.predict(future_dataset_scaled)
rf_predictions = rf.predict(future_dataset_scaled)

# Determine the most frequent prediction
predictions = [logreg_predictions, dtree_predictions, rf_predictions]
best_prediction = max(set(predictions), key=predictions.count)
if best_prediction == logreg_predictions:
    best_algorithm_future = "Logistic Regression"
elif best_prediction == dtree_predictions:
    best_algorithm_future = "Decision Tree"
else:
    best_algorithm_future = "Random Forest"

# Print the analysis results
print("Dataset ID:", dataset_id)
print("No of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Future Correlation:", future_correlation)
print("Data Type:", data_type)
print("Best Algorithm:", best_algorithm)

# Print the prediction for future datasets
print("Best Algorithm (Future):", best_algorithm_future)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values