In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fetch the diabetes dataset straight from the tutorial's GitHub repository.
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
df = pd.read_csv(url)

# "Outcome" is the prediction target; every remaining column is a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for testing. The split is stratified on the target
# (stratify=y) and seeded (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check: report the shapes of all four pieces of the split.
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Re-attach the target column to each feature frame so that every split can be
# handled as a single table.
train_data, test_data = (
    pd.concat([features, target], axis="columns")
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Sanity check: show the first rows of the combined training table.
print("Train data preview:\n", train_data.head())


import os

# Make sure the output directory is present; exist_ok=True keeps this from
# raising when the directory already exists (e.g. on a re-run).
os.makedirs(name="../data/processed", exist_ok=True)

# Write each split to its own CSV; index=False leaves the row index out.
train_data.to_csv(path_or_buf="../data/processed/clean_train.csv", index=False)
test_data.to_csv(path_or_buf="../data/processed/clean_test.csv", index=False)

print("✅ Train and test datasets saved successfully!")


X_train shape: (614, 8), y_train shape: (614,)
X_test shape: (154, 8), y_test shape: (154,)
Train data preview:
      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
353            1       90             62             12       43  27.2   
711            5      126             78             27       22  29.6   
373            2      105             58             40       94  34.9   
46             1      146             56              0        0  29.7   
682            0       95             64             39      105  44.6   

     DiabetesPedigreeFunction  Age  Outcome  
353                     0.580   24        0  
711                     0.439   40        0  
373                     0.225   25        0  
46                      0.564   29        0  
682                     0.366   22        0  
✅ Train and test datasets saved successfully!


In [12]:
# Split each combined table back into its feature frame and "Outcome" target.
X_train, y_train = train_data.drop(columns=["Outcome"]), train_data["Outcome"]
X_test, y_test = test_data.drop(columns=["Outcome"]), test_data["Outcome"]

In [19]:
from xgboost import XGBClassifier

# Boosted classifier configured with 200 estimators, a 0.05 learning rate and
# a fixed seed.
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)

# Fit on the training split, then predict labels for the held-out test rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Bare final expression so the notebook displays the predicted labels.
y_pred



array([1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])

In [20]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7597402597402597