In [1]:
"""
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn import tree

# Q1: Import the dataset and examine the variables
file_path = 'path_to_diabetes.csv'  # Replace with the actual path
diabetes_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(diabetes_data.head())

# Descriptive statistics
print(diabetes_data.describe())

# Visualizations
sns.pairplot(diabetes_data, hue='Outcome', diag_kind='kde')
plt.show()

# Q2: Preprocess the data
# Handle missing values
diabetes_data.fillna(diabetes_data.median(), inplace=True)

# Remove outliers (using z-score as an example)
z_scores = (diabetes_data - diabetes_data.mean()) / diabetes_data.std()
diabetes_data = diabetes_data[(z_scores < 3).all(axis=1)]

# Transform categorical variables into dummy variables if necessary

# Q3: Split the dataset
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Q4: Train the decision tree model
dt_model = DecisionTreeClassifier()
cross_val_accuracy = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy:", cross_val_accuracy.mean())

# Q5: Evaluate the performance of the model
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, dt_model.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label='Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# AUC Score
print("AUC Score:", roc_auc_score(y_test, dt_model.predict_proba(X_test)[:, 1]))

# Q6: Interpret the decision tree
plt.figure(figsize=(15, 10))
tree.plot_tree(dt_model, feature_names=X.columns, class_names=['Non-Diabetic', 'Diabetic'], filled=True, rounded=True)
plt.show()

# Q7: Validate the decision tree model (sensitivity analysis and scenario testing)
# Apply the model to new data or introduce changes to the dataset/environment
# Explore the impact of variations in input features on model predictions
"""

'\n# Import necessary libraries\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn import tree\n\n# Q1: Import the dataset and examine the variables\nfile_path = \'path_to_diabetes.csv\'  # Replace with the actual path\ndiabetes_data = pd.read_csv(file_path)\n\n# Display the first few rows of the dataset\nprint(diabetes_data.head())\n\n# Descriptive statistics\nprint(diabetes_data.describe())\n\n# Visualizations\nsns.pairplot(diabetes_data, hue=\'Outcome\', diag_kind=\'kde\')\nplt.show()\n\n# Q2: Preprocess the data\n# Handle missing values\ndiabetes_data.fillna(diabetes_data.median(), inplace=True)\n\n# Remove outliers (using z-score as an example)\nz_scores = (diab