# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC

# Wine Quality Classification
#
# Stage 1: download the red-wine table, turn the quality score into a binary
# label, split and standardise the features, then chart a univariate score
# for every feature.

# Read the CSV directly from the repository URL
wine_url = "https://github.com/FlipRoboTechnologies/ML-Datasets/raw/main/Red%20Wine/winequality-red.csv"
wine_data = pd.read_csv(wine_url)

# Print the number of missing values per column
print(wine_data.isnull().sum())

# Binary target: 1 when the quality score is 7 or higher, otherwise 0
wine_data['quality'] = (wine_data['quality'] >= 7).astype(int)

# The label is the 'quality' column; every other column is a feature
y_wine = wine_data['quality']
X_wine = wine_data.drop(columns='quality')

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=42
)

# Standardise the features using statistics learned from the training rows only
scaler = StandardScaler().fit(X_wine_train)
X_wine_train = scaler.transform(X_wine_train)
X_wine_test = scaler.transform(X_wine_test)

# k='all' keeps every column; the selector is fitted only to read its per-feature scores
selector = SelectKBest(score_func=f_classif, k='all').fit(X_wine_train, y_wine_train)
scores = selector.scores_

# Bar chart with one bar per feature, labelled with the original column names
bar_positions = range(len(scores))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, scores)
plt.xticks(bar_positions, X_wine.columns, rotation=90)
plt.title('Feature Scores')
plt.show()

# Train different classification models
#
# Every estimator with a stochastic component gets a fixed seed so repeated
# runs pick the same hyperparameters and produce the same fitted models.
# LogisticRegression is left unseeded: its default solver is deterministic.
WINE_MODEL_SEED = 42

wine_models = {
    'Decision Tree': DecisionTreeClassifier(random_state=WINE_MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=WINE_MODEL_SEED),
    'Logistic Regression': LogisticRegression(),
    # probability=True enables predict_proba; the seed fixes the internal
    # shuffling used to calibrate those probabilities.
    'SVM': SVC(probability=True, random_state=WINE_MODEL_SEED),
}

# Perform hyperparameter tuning using GridSearchCV
# One parameter grid per model, keyed by the same names as wine_models.
wine_params = {
    'Decision Tree': {'max_depth': [3, 5, 7, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 10]},
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
}

# Maps model name -> estimator refitted on the full training set with the
# best parameters found by 5-fold cross-validation (scored by ROC AUC).
wine_best_estimators = {}
for model_name, model in wine_models.items():
    grid_search = GridSearchCV(model, wine_params[model_name], cv=5, scoring='roc_auc')
    grid_search.fit(X_wine_train, y_wine_train)
    wine_best_estimators[model_name] = grid_search.best_estimator_
    print(f'Best parameters for {model_name}: {grid_search.best_params_}')

# Evaluate the models
#
# For each tuned classifier: print the classification report and show its
# confusion matrix on a dedicated figure.  The ROC data is only *collected*
# inside the loop and drawn afterwards on one combined chart.  pyplot calls
# act on the current figure, so plotting a ROC line between the per-model
# figures would put it on the same axes as the next model's heatmap and leave
# the final chart with a single curve.
roc_results = []  # (model name, false-positive rates, true-positive rates, AUC)
for model_name, model in wine_best_estimators.items():
    y_wine_pred = model.predict(X_wine_test)
    # Column 1 holds the predicted probability of the positive class
    y_wine_proba = model.predict_proba(X_wine_test)[:, 1]

    print(f'Classification Report for {model_name}')
    print(classification_report(y_wine_test, y_wine_pred))

    # Confusion matrix, drawn on a fresh figure of its own
    cm = confusion_matrix(y_wine_test, y_wine_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    # ROC curve and AUC (plotted after the loop)
    fpr, tpr, _ = roc_curve(y_wine_test, y_wine_proba)
    roc_results.append((model_name, fpr, tpr, auc(fpr, tpr)))

# Combined ROC chart: one line per model plus the chance diagonal
plt.figure()
for model_name, fpr, tpr, roc_auc in roc_results:
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

# Medical Cost Prediction
#
# Stage 1: download the insurance table, separate the 'charges' target, split
# the rows and turn the mixed numeric/categorical columns into a numeric matrix.

# Load the dataset
medical_url = "https://github.com/FlipRoboTechnologies/ML-Datasets/raw/main/Medical%20Cost%20Insurance/medical_cost_insurance.csv"
medical_data = pd.read_csv(medical_url)

# Check for missing values
print(medical_data.isnull().sum())

# Separate features and target variable
X_medical = medical_data.drop('charges', axis=1)
y_medical = medical_data['charges']

# Encode categorical variables
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row for that feature instead of raising, so transforming the
# held-out rows cannot fail just because a rare category missed the training
# split.  Output is unchanged when every category was seen during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Split the data into training and testing sets (20% held out, fixed seed)
X_medical_train, X_medical_test, y_medical_train, y_medical_test = train_test_split(
    X_medical, y_medical, test_size=0.2, random_state=42
)

# Apply the preprocessor to the data: fit on the training rows only, then
# reuse the fitted transformers on the test rows
X_medical_train = preprocessor.fit_transform(X_medical_train)
X_medical_test = preprocessor.transform(X_medical_test)

# Train different regression models
#
# Every estimator with a stochastic component gets a fixed seed so repeated
# runs pick the same hyperparameters and produce the same fitted models.
MEDICAL_MODEL_SEED = 42

medical_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=MEDICAL_MODEL_SEED),
    'Random Forest': RandomForestRegressor(random_state=MEDICAL_MODEL_SEED),
    'Gradient Boosting': GradientBoostingRegressor(random_state=MEDICAL_MODEL_SEED),
}

# Perform hyperparameter tuning using GridSearchCV
# Parameter grids keyed by model name; a model without an entry here is
# fitted as-is with no search (currently only 'Linear Regression').
medical_params = {
    'Decision Tree': {'max_depth': [3, 5, 7, 10]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 10]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
}

# Maps model name -> fitted estimator (the best one found when a grid exists)
medical_best_estimators = {}
for model_name, model in medical_models.items():
    param_grid = medical_params.get(model_name)
    if not param_grid:
        # Nothing to tune: fit directly on the training data
        model.fit(X_medical_train, y_medical_train)
        medical_best_estimators[model_name] = model
        continue

    # 5-fold cross-validated search; scikit-learn scorers are maximised, hence
    # the negated mean squared error
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_medical_train, y_medical_train)
    medical_best_estimators[model_name] = grid_search.best_estimator_
    print(f'Best parameters for {model_name}: {grid_search.best_params_}')

# Evaluate the models
# For every fitted regressor, predict on the held-out rows and print the
# absolute, squared and root-squared errors together with R-squared.
for model_name, fitted_model in medical_best_estimators.items():
    y_medical_pred = fitted_model.predict(X_medical_test)

    mae = mean_absolute_error(y_medical_test, y_medical_pred)
    mse = mean_squared_error(y_medical_test, y_medical_pred)
    rmse = np.sqrt(mse)  # same units as the target
    r2 = r2_score(y_medical_test, y_medical_pred)

    print(f'Evaluation Report for {model_name}')
    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'R-squared: {r2}')
