In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import preprocessing 

In [None]:
# Read the semicolon-separated red-wine table from the UCI archive and keep
# the raw download apart from the working copy that later cells modify.
RED_WINE_CSV_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
red_wine_original = pd.read_csv(RED_WINE_CSV_URL, sep=';')
wine = red_wine_original.copy(deep=True)

In [None]:
def _quality_to_label(value):
    """Bucket a quality score: <= 5 is 'low', <= 7 is 'medium', otherwise 'high'."""
    if value <= 5:
        return 'low'
    if value <= 7:
        return 'medium'
    return 'high'


# Add the bucketed label next to the numeric score, then show the frame.
wine['quality_label'] = wine['quality'].map(_quality_to_label)
wine

In [None]:
# Remove exact duplicate rows and renumber the index from zero.
wine = wine.drop_duplicates(ignore_index=True)

In [None]:
# Working frame for modelling: `wine` without the raw score and two measurements.
excluded_columns = ["quality", "sulphates", "pH"]
red_wine = wine.drop(columns=excluded_columns)


In [None]:
# Display red_wine, the frame obtained by dropping columns from `wine`.
red_wine

In [None]:
# Print the pandas summary of red_wine (columns, non-null counts, dtypes).
red_wine.info()

In [None]:
# Integer code for each quality bucket, keyed by the column it applies to.
cleanup_nums = {"quality_label": {"low": 0, "medium": 1, "high": 2}}

# Encode with Series.map rather than DataFrame.replace: swapping strings for
# ints through replace() relies on an implicit object -> int downcast that
# pandas has deprecated (FutureWarning since 2.2). The string labels are read
# from `wine`, which shares red_wine's row index, so re-running this cell
# always starts from the original labels instead of already-encoded values.
quality_codes = wine["quality_label"].map(cleanup_nums["quality_label"])
assert quality_codes.notna().all(), "quality_label contains a value with no integer code"
red_wine = red_wine.assign(quality_label=quality_codes.astype("int64"))

In [None]:
# Show the last 100 rows of red_wine after the label encoding step.
red_wine.tail(100)

In [None]:
# Store the encoded target column with pandas' categorical dtype.
red_wine = red_wine.astype({'quality_label': 'category'})

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
target_column = "quality_label"
y = red_wine[target_column]
X = red_wine.drop(columns=[target_column])

In [None]:
# K Fold cross validation :
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split

# Shared 5-fold splitter; the model cells below pass it as `cv=kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Report the size of every fold. Nothing is assigned to X_train/X_test here:
# doing so inside the loop would leave only the last fold's split behind,
# and later cells would depend on that by accident.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Explicit, seeded 80/20 hold-out used by the cells that fit on X_train and
# evaluate on X_test. `stratify=y` keeps the class proportions of y the same
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [None]:
#Logisticregression
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

# Logistic regression with a raised iteration cap, scored on the shared kf folds.
model1 = LogisticRegression(max_iter=5000)
score = cross_val_score(estimator=model1, X=X, y=y, cv=kf)
print(f'Scores for each fold: {score}')

# Average the per-fold scores into a single figure for this model.
mean_accuracy_log = score.mean()
print("Mean Accuracy:", mean_accuracy_log)

In [None]:
#Decisiontree

from sklearn.tree import DecisionTreeClassifier

# Fixed seed: the tree permutes features at random while searching for splits,
# so an unseeded tree reports different fold scores on every run.
model2 = DecisionTreeClassifier(random_state=42)
score = cross_val_score(model2, X, y, cv=kf)
print(f'Scores for each fold: {score}')

# Calculate the mean accuracy across all folds
mean_accuracy_dt = np.mean(score)
print("Mean Accuracy:", mean_accuracy_dt)

In [None]:
#RandomForest

from sklearn.ensemble import RandomForestClassifier

# Fixed seed: bootstrap sampling and feature sub-sampling are random, so an
# unseeded forest reports different fold scores on every run.
model3 = RandomForestClassifier(random_state=42)
score = cross_val_score(model3, X, y, cv=kf)
print(f'Scores for each fold: {score}')

# Calculate the mean accuracy across all folds
mean_accuracy_rf = np.mean(score)
print("Mean Accuracy:", mean_accuracy_rf)

In [None]:
#Using KNeighborsClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# k-NN compares samples by distance, so columns with large numeric ranges
# would dominate the neighbour search. Standardising inside a pipeline fits
# the scaler on each training fold only, so nothing leaks from the fold being
# scored.
model4 = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier())
score = cross_val_score(model4, X, y, cv=kf)
print(f'Scores for each fold: {score}')

# Calculate the mean accuracy across all folds
mean_accuracy_kn = np.mean(score)
print("Mean Accuracy:", mean_accuracy_kn)

In [None]:
#Predicting the values for test
# Seeded so the predictions are the same after every kernel restart.
forest = RandomForestClassifier(random_state=42)
# Pass the target Series as-is: Series.ravel() is deprecated in pandas and
# scikit-learn accepts a one-dimensional Series directly.
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
y_pred

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report

# Seed the forest as well as the search: with an unseeded estimator the
# candidate scores, and therefore the selected hyperparameters and the report
# below, change from run to run.
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter search space
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Use RandomizedSearchCV to search for the best hyperparameters:
# 10 sampled settings, each scored by accuracy on the kf folds of X_train.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=kf,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the model with the best hyperparameters on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Hyperparameter tuning for RandomForestClassifier :

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report


# Define the Random Forest Classifier model.
# Seeded so that every grid point is scored with the same randomness and the
# winning combination is the same on every run.
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameters and their possible values to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to search for the best hyperparameters.
# The grid has 3 * 4 * 3 * 3 = 108 combinations, i.e. 540 fits over the five
# kf folds, so spread them across all available cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
# (best_params, best_model and y_pred replace the values left by the
# randomized search cell).
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the current y_pred and show it.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

In [None]:
# Build the per-class text report for y_test against y_pred and print it.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

In [None]:
# Example model comparison plot

# Pair each short model name with the mean accuracy computed for it earlier.
accuracy_by_model = {
    'LR': mean_accuracy_log,
    'DT': mean_accuracy_dt,
    'RF': mean_accuracy_rf,
    'KN': mean_accuracy_kn,
}
models = list(accuracy_by_model)
accuracies = list(accuracy_by_model.values())
bar_colors = ['lightsalmon', 'pink', 'bisque', 'lightsteelblue']

fig, ax = plt.subplots()
ax.bar(models, accuracies, color=bar_colors)
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison - Red wine')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for model1 (logistic regression). The shared kf splitter is
# reused so the curve is computed on the same folds as the mean accuracy
# reported for this model.
train_sizes, train_scores, test_scores = learning_curve(model1, X, y, cv=kf, scoring='accuracy')
# One row per training-set size; average over the folds (columns).
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training Accuracy')
ax.plot(train_sizes, test_scores_mean, label='Cross-Validation Accuracy')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
# Name the model in the title so this figure can be told apart from the other
# learning-curve figures in the notebook.
ax.set_title('Learning Curve Across Folds - Logistic Regression')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for model2 (decision tree). The shared kf splitter is reused
# so the curve is computed on the same folds as the mean accuracy reported for
# this model.
train_sizes, train_scores, test_scores = learning_curve(model2, X, y, cv=kf, scoring='accuracy')
# One row per training-set size; average over the folds (columns).
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training Accuracy')
ax.plot(train_sizes, test_scores_mean, label='Cross-Validation Accuracy')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
# Name the model in the title so this figure can be told apart from the other
# learning-curve figures in the notebook.
ax.set_title('Learning Curve Across Folds - Decision Tree')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for model3 (random forest). The shared kf splitter is reused
# so the curve is computed on the same folds as the mean accuracy reported for
# this model.
train_sizes, train_scores, test_scores = learning_curve(model3, X, y, cv=kf, scoring='accuracy')
# One row per training-set size; average over the folds (columns).
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training Accuracy')
ax.plot(train_sizes, test_scores_mean, label='Cross-Validation Accuracy')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
# Name the model in the title so this figure can be told apart from the other
# learning-curve figures in the notebook.
ax.set_title('Learning Curve Across Folds - Random Forest')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for model4 (k-nearest neighbours). The shared kf splitter is
# reused so the curve is computed on the same folds as the mean accuracy
# reported for this model.
train_sizes, train_scores, test_scores = learning_curve(model4, X, y, cv=kf, scoring='accuracy')
# One row per training-set size; average over the folds (columns).
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training Accuracy')
ax.plot(train_sizes, test_scores_mean, label='Cross-Validation Accuracy')
ax.set_xlabel('Training Examples')
ax.set_ylabel('Accuracy')
# Name the model in the title so this figure can be told apart from the other
# learning-curve figures in the notebook.
ax.set_title('Learning Curve Across Folds - K-Nearest Neighbors')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

def generate_confusion_matrix(model, X, y, cv=5):
    """
    Generate a confusion matrix from out-of-fold predictions.

    The data is split with a shuffled, seeded KFold. Every sample is predicted
    once, by a copy of `model` trained on the folds that do not contain it,
    and those predictions are compared with the true labels in one matrix.

    Parameters:
    - model: Unfitted scikit-learn classifier (e.g., RandomForestClassifier,
      DecisionTreeClassifier, LogisticRegression). cross_val_predict clones it
      for each fold, so the object passed in is not fitted or modified.
    - X: Feature matrix
    - y: Target variable
    - cv: Number of folds for cross-validation (default: 5)

    Returns:
    - Confusion matrix as returned by sklearn.metrics.confusion_matrix
    """
    # Same splitter settings as the rest of the notebook (shuffled, seed 42).
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Library routine instead of a hand-written fold loop: it handles the
    # per-fold fit/predict and returns predictions aligned with the rows of X.
    out_of_fold_pred = cross_val_predict(model, X, y, cv=kf)

    # np.asarray turns a pandas (possibly categorical) target into a plain
    # array of label values before it is compared with the predictions.
    return confusion_matrix(np.asarray(y), out_of_fold_pred)

# Cross-validated confusion matrix for each of the four classifiers.
# The randomised models are seeded so the printed matrices are repeatable.
from sklearn.pipeline import make_pipeline

# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_conf_matrix = generate_confusion_matrix(rf_model, X, y)
print("Random Forest Confusion Matrix:")
print(rf_conf_matrix)

# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_conf_matrix = generate_confusion_matrix(dt_model, X, y)
print("\nDecision Tree Confusion Matrix:")
print(dt_conf_matrix)

# Logistic Regression
lr_model = LogisticRegression(max_iter=5000)
lr_conf_matrix = generate_confusion_matrix(lr_model, X, y)
print("\nLogistic Regression Confusion Matrix:")
print(lr_conf_matrix)

#Using KNeighborsClassifier
# k-NN is distance based, so standardise the features first; the pipeline
# refits the scaler on the training part of every fold.
kn_model = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier())
kn_conf_matrix = generate_confusion_matrix(kn_model, X, y)
print("\nKNeighbors Confusion Matrix:")
print(kn_conf_matrix)
