In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib

# Load the dataset
data = pd.read_csv('diabetes.csv')  # Replace with your actual file path

# Step 1: Create BMI_category column
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal weight'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obesity'

data['BMI_category'] = data['BMI'].apply(categorize_bmi)

# Step 2: Split the data into train and validation sets
X = data.drop(columns=['BMI_category', 'Outcome'])  # Features
y = data['Outcome']  # Target

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Apply Standard Scaler on numeric features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()

# Step 4: Apply One-Hot Encoding on categorical features
categorical_features = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('cat', encoder, categorical_features)
    ])

# Step 5: KNN Classifier with different k values
knn_scores = {}
for k in [3, 5, 7]:
    knn = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=k))
    ])

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    knn_scores[k] = f1

best_k = max(knn_scores, key=knn_scores.get)
print(f'Best KNN k: {best_k}, F1 Score: {knn_scores[best_k]}')

# Step 6: Decision Tree Classifier with different max_depth values
dt_scores = {}
for max_depth in [3, 5, 7]:
    dt = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(max_depth=max_depth))
    ])

    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    dt_scores[max_depth] = f1

best_depth = max(dt_scores, key=dt_scores.get)
print(f'Best Decision Tree max_depth: {best_depth}, F1 Score: {dt_scores[best_depth]}')

# Step 7: Build the inference pipeline
best_model = KNeighborsClassifier(n_neighbors=best_k)  # Or DecisionTreeClassifier(max_depth=best_depth)
inference_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])

# Fit the best model on the training data
inference_pipeline.fit(X_train, y_train)

# Save the scaler, encoder, and model
joblib.dump(inference_pipeline.named_steps['preprocessor'], 'preprocessor.pkl')
joblib.dump(inference_pipeline.named_steps['classifier'], 'best_model.pkl')

# Inference script example:
# Load the saved components
scaler = joblib.load('preprocessor.pkl')
model = joblib.load('best_model.pkl')

# Apply the transformations and make predictions on new test data
sample = X_val.iloc[0:5]  # Replace with actual test samples
sample_transformed = scaler.transform(sample)
predictions = model.predict(sample_transformed)

print(f'Predictions: {predictions}')


Best KNN k: 3, F1 Score: 0.5544554455445545
Best Decision Tree max_depth: 5, F1 Score: 0.6862745098039216
Predictions: [0 0 0 1 1]


**To improve the model, we'll apply a few techniques that can help:**

**Handle Class Imbalance:** We'll use SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset by generating synthetic samples for the minority class.

**Hyperparameter Tuning:** We'll use GridSearchCV to tune the hyperparameters of the Decision Tree and KNN models.

**Cross-Validation:** Use cross-validation to evaluate the models more reliably.

**SMOTE:** Applied SMOTE to balance the dataset before training. This generates synthetic data for the minority class to reduce the class imbalance problem.

**GridSearchCV:** Used GridSearchCV for hyperparameter tuning to find the best parameters for both the KNN and Decision Tree classifiers. The cv=5 performs 5-fold cross-validation.

**Cross-Validation:** Evaluated both models with cross-validation to get a more reliable estimate of their performance.

**Next Steps:**

After running this, check the F1 scores for both models after hyperparameter tuning.

The one with the higher F1 score will be chosen as the best model.

The best model will be saved, and predictions will be made on the test samples.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
import joblib

# Load the diabetes dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data = pd.read_csv(url, names=columns)

# 1. Create BMI category
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

data['BMI_category'] = data['BMI'].apply(bmi_category)

# 2. Split data into train and val sets
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Apply Standard Scaler on numeric features
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# 4. Apply One-Hot Encoding on categorical features
categorical_features = ['BMI_category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 5. Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 6. Apply SMOTE after preprocessing
# Apply preprocessing to both training and validation sets
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Apply SMOTE to the transformed training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# 7. Hyperparameter Tuning using GridSearchCV for KNN and Decision Tree
# KNN GridSearch
knn_model = Pipeline(steps=[
    ('classifier', KNeighborsClassifier())
])

knn_param_grid = {'classifier__n_neighbors': [3, 5, 7, 9]}
knn_grid_search = GridSearchCV(knn_model, knn_param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search on the resampled data
knn_grid_search.fit(X_train_resampled, y_train_resampled)

# Decision Tree GridSearch
dt_model = Pipeline(steps=[
    ('classifier', DecisionTreeClassifier())
])

dt_param_grid = {'classifier__max_depth': [3, 5, 7, 9], 'classifier__min_samples_split': [2, 5, 10]}
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search on the resampled data
dt_grid_search.fit(X_train_resampled, y_train_resampled)

# 8. Evaluate both models using cross-validation and F1 score
knn_best_model = knn_grid_search.best_estimator_
dt_best_model = dt_grid_search.best_estimator_

knn_f1 = cross_val_score(knn_best_model, X_train_resampled, y_train_resampled, cv=5, scoring='f1').mean()
dt_f1 = cross_val_score(dt_best_model, X_train_resampled, y_train_resampled, cv=5, scoring='f1').mean()

print(f"Best KNN F1 score (after tuning): {knn_f1}")
print(f"Best Decision Tree F1 score (after tuning): {dt_f1}")

# 9. Save the best model and preprocessing pipeline
best_model = knn_best_model if knn_f1 > dt_f1 else dt_best_model
best_model.fit(X_train_resampled, y_train_resampled)

# Save the preprocessor, encoder, and model
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(best_model.named_steps['classifier'], 'best_model.pkl')

# 10. Load and apply the saved models on test sample
preprocessor = joblib.load('preprocessor.pkl')
model = joblib.load('best_model.pkl')

# Example: apply on 5 validation samples
test_samples = X_val.head(5)
test_samples_transformed = preprocessor.transform(test_samples)
predictions = model.predict(test_samples_transformed)

print("Predictions for 5 test samples:", predictions)


Best KNN F1 score (after tuning): 0.8233687898240785
Best Decision Tree F1 score (after tuning): 0.7767446727757633
Predictions for 5 test samples: [0 0 1 0 1]
