Importing pandas and ydata_profiling libraries

In [1]:
import pandas as pd
from ydata_profiling import ProfileReport

Reading the csv file into a dataframe

In [2]:
data_frame = pd.read_csv("red_wine.csv")

Pandas profiling for the red_Wine dataset.

In [3]:
profile = ProfileReport(data_frame)

Saving the report as html file.

In [4]:
profile.to_file("red_wine_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Splitting the dataset into features and target

In [5]:
X = data_frame.drop('type', axis=1)
Y = data_frame['type']

DEFINING THE MODELS TO FIT THE DATA AND ANALYZE THE METRICS

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
classification_models = {
    'ZeroR': DummyClassifier(strategy='most_frequent'),
    'OneR': DecisionTreeClassifier(max_depth=1),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

Analyzing the performance metrics of 10-fold cross-validation for the dataset

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
metrics = []
for name, model in classification_models.items():
    scores = cross_val_score(model, X, Y, cv=10, scoring='accuracy')
    auc_scores = cross_val_score(model, X, Y, cv=10, scoring='roc_auc')
    mean_score = scores.mean()
    mean_auc = auc_scores.mean()
    metrics.append((name, mean_score, mean_auc))

Convert the results to a Pandas DataFrame and print it

In [8]:
final_metrics = pd.DataFrame(metrics, columns=['Model', 'Accuracy (mean)', 'AUC (mean)'])
print(final_metrics)

                    Model  Accuracy (mean)  AUC (mean)
0                   ZeroR         0.528887    0.500000
1                    OneR         0.798790    0.802581
2     Logistic Regression         0.784785    0.879902
3             Naive Bayes         0.821627    0.895408
4           Decision Tree         0.742710    0.751491
5  Support Vector Machine         0.535844    0.868920
6           Random Forest         0.800575    0.888833


Encoding the target variable type 0 and 1

In [9]:
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
data_frame['type'] = l.fit_transform(data_frame['type'])

Split the data into training and testing sets

In [10]:
from sklearn.model_selection import train_test_split
X_train, x_test, Y_train, y_test = train_test_split(data_frame.drop('type', axis=1), data_frame['type'], test_size=0.2, random_state=42)

Fit the model to the training data


In [11]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)

RandomForestClassifier()

Predicting the probabilities for the test data

In [14]:
y_pred = random_forest.predict_proba(x_test)

Computing false positive rate, true positive rate, threshold values and AUC score

In [15]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:,1])
roc_auc = auc(fpr, tpr)

Plotting the ROC CURVE

In [16]:
from matplotlib import get_backend

In [21]:
import matplotlib.pyplot as plt
plt.figure()
plt.plot(fpr, tpr, color='grey', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Reciever Charactersticks')
plt.legend(loc="lower right")
plt.show()

Loading white_wine dataset

In [22]:
data_frame2 = pd.read_csv("white_wine.csv")

Split the dataset into features and target

In [23]:
P = data_frame2.drop('type', axis=1)
Q = data_frame2['type']


classification_models = {
    'Naive Bayes': GaussianNB()
}


results2 = []
for name, model in classification_models.items():
    scores = cross_val_score(model, P, Q, cv=10, scoring='accuracy')
    auc_scores = cross_val_score(model, P, Q, cv=10, scoring='roc_auc')
    mean_score_end = scores.mean()
    mean_auc_end = auc_scores.mean()
    results2.append((name, mean_score_end, mean_auc_end))

Converting the results to a Pandas DataFrame and printing it

In [25]:
results_dataframe = pd.DataFrame(results2, columns=['Model', 'Accuracy (mean)', 'AUC(mean)'])
print(results_dataframe)

         Model  Accuracy (mean)  AUC(mean)
0  Naive Bayes         0.932143       0.95
