# Capstone Project

In [9]:
#Step 1: Setup and Import Necessary Libraries
# Import necessary libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#Additional libraries for machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Set plot style
sns.set(style="whitegrid")

#Step 2: Import the Dataset

# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv("Dentistry Dataset.csv")

# Display the first few rows of the dataset.
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print it explicitly to make sure it is shown.
print(df.head())


#Step 3: Data Preprocessing

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Drop rows with missing values (if any)
df.dropna(inplace=True)

# Encode the 'Gender' column as integer labels
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Display the unique encoded values in the 'Gender' column.
# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(df['Gender'].unique())

# Define the independent variables (features) and the dependent variable (target).
# 'Sl No' and 'Sample ID' are dropped from the features along with the target.
X = df.drop(['Sl No', 'Sample ID', 'Gender'], axis=1)
y = df['Gender']

# Normalize the independent variables
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm;
# it does not scale per feature. Confirm this is the intended normalization.
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)
                                        
#Step 4: Exploratory Data Analysis
# Compute the correlation matrix on numeric (and boolean) columns only.
# Calling DataFrame.corr() on a frame that still holds a non-numeric column
# raises in pandas >= 2.0, so select the numeric columns first.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()

# Generate a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
#Step 5: Model Building

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=42
)


def _fit_and_score(model, model_name):
    """Fit *model* on the training split and report its test accuracy.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Label used in the printed accuracy line.

    Returns:
        A ``(predictions, accuracy)`` tuple for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'{model_name} Accuracy: {accuracy}')
    return predictions, accuracy


# Train and evaluate the Logistic Regression model
logistic_model = LogisticRegression()
y_pred_logistic, logistic_accuracy = _fit_and_score(
    logistic_model, 'Logistic Regression'
)

# Train and evaluate the Decision Tree Classifier.
# random_state is fixed so repeated runs give the same tree.
decision_tree_model = DecisionTreeClassifier(random_state=42)
y_pred_tree, tree_accuracy = _fit_and_score(
    decision_tree_model, 'Decision Tree Classifier'
)

# Train and evaluate the Random Forest Classifier (seeded for reproducibility)
random_forest_model = RandomForestClassifier(random_state=42)
y_pred_forest, forest_accuracy = _fit_and_score(
    random_forest_model, 'Random Forest Classifier'
)

# Train and evaluate the XGBoost Classifier.
# The original trained and scored this model twice with identical code;
# a single pass is sufficient.
xgboost_model = XGBClassifier(random_state=42)
y_pred_xgboost, xgboost_accuracy = _fit_and_score(
    xgboost_model, 'XGBoost Classifier'
)

#Step 6: Model Evaluation

# Define a function to plot ROC curve
def plot_roc_curve(y_test, y_pred, model_name):
    """Draw and show the ROC curve for one model.

    Args:
        y_test: True labels, passed to ``roc_curve`` as its first argument.
        y_pred: Predicted labels or scores, passed to ``roc_curve`` as its
            second argument.
        model_name: Name appended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = f'ROC curve (area = {area_under_curve:0.2f})'

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Dashed diagonal: reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.05)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic - {model_name}')
    ax.legend(loc='lower right')
    plt.show()

# For each model: print the confusion matrix from the hard class predictions,
# then plot the ROC curve from the predicted probability of class 1.
# Hard 0/1 predictions give roc_curve only a single operating point, so the
# probability scores are used to trace the curve across thresholds.
# Assumes a binary target encoded as 0/1, which the ROC computation on these
# labels already requires.

# Evaluate Logistic Regression
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
print(f'Logistic Regression Confusion Matrix:\n{conf_matrix_logistic}')
y_score_logistic = logistic_model.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score_logistic, 'Logistic Regression')

# Evaluate Decision Tree
conf_matrix_tree = confusion_matrix(y_test, y_pred_tree)
print(f'Decision Tree Confusion Matrix:\n{conf_matrix_tree}')
y_score_tree = decision_tree_model.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score_tree, 'Decision Tree')

# Evaluate Random Forest
conf_matrix_forest = confusion_matrix(y_test, y_pred_forest)
print(f'Random Forest Confusion Matrix:\n{conf_matrix_forest}')
y_score_forest = random_forest_model.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score_forest, 'Random Forest')

# Evaluate XGBoost
conf_matrix_xgboost = confusion_matrix(y_test, y_pred_xgboost)
print(f'XGBoost Confusion Matrix:\n{conf_matrix_xgboost}')
y_score_xgboost = xgboost_model.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score_xgboost, 'XGBoost')


ModuleNotFoundError: No module named 'xgboost'

1. Project Title and Description: A brief overview of what the project is about and its goals.
2. Dataset Information: Information about the dataset, including the columns and how the data was prepared.
3. Exploratory Data Analysis: Important findings from analyzing the data, including a heatmap showing correlations.
4. Model Building: Details about the machine learning models used and how well they performed.
5. Model Evaluation: Metrics and visualizations to evaluate the models, like confusion matrices and ROC curves.
6. Conclusion: A summary of the results and suggestions for future improvements.
Summary
This project shows how dental metrics can be used to predict gender with different machine learning models. The steps include data preparation, analysis, model building, and evaluation. The best model is chosen based on accuracy and ROC-AUC scores.