In [1]:
### Author : Prasad Meesala
# Importing the necessary modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, ensemble, svm, datasets

In [2]:
# Collecting the data
#
# Load the wine dataset bundled with scikit-learn and assemble a single
# DataFrame holding the features, the integer class label and its name.

data = datasets.load_wine()
print(dir(data))

df = pd.DataFrame(data.data, columns = data.feature_names).assign(
    # Integer class label for each row.
    target = data.target,
    # Class name looked up from the integer label added just above.
    target_names = lambda frame : frame['target'].map(lambda label : data.target_names[label]),
)
df.head()

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,target_names
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0,class_0


In [3]:
# Training the models
#
# Hold out 20% of the rows as a test set, fit three classifiers on the
# remaining rows and print each model's accuracy on the held-out rows.

# Fixed seed so that the split and the random forest give the same numbers
# on every Restart & Run All.
SEED = 42

X = df.drop(['target', 'target_names'], axis = 1).values
y = df['target'].values

# stratify = y keeps the class proportions equal in the train and test
# partitions, consistent with the stratified folds used for cross validation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size = 0.2, random_state = SEED, stratify = y
)

# NOTE(review): max_iter is presumably raised so the solver converges on the
# unscaled features -- confirm, or scale the features instead.
lg = linear_model.LogisticRegression(max_iter = 10000)
svmc = svm.SVC(kernel = "linear")
rf = ensemble.RandomForestClassifier(n_estimators = 50, random_state = SEED)

lg.fit(X_train, y_train)
svmc.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Checking the accuracy of the models
# Printed order: logistic regression, linear SVM, random forest.

print(lg.score(X_test, y_test), svmc.score(X_test, y_test), rf.score(X_test, y_test))

0.9722222222222222 1.0 1.0


In [4]:
# Evaluating the model performance using cross validation
#
# Score each classifier with 3-fold stratified cross validation over the full
# dataset and print the mean accuracy per model.

def avg(x):
    """Return the arithmetic mean of the values in the sequence x."""
    return sum(x) / len(x)

folds = model_selection.StratifiedKFold(n_splits = 3)

# cross_val_score fits a fresh clone of the estimator on every fold and scores
# it with the estimator's own .score method (accuracy for these classifiers).
# Unlike a manual loop, it leaves lg/svmc/rf and the X_train/X_test/y_train/
# y_test hold-out split from the training cell untouched.
# .tolist() keeps the per-fold scores as plain Python lists.
lg_scores = model_selection.cross_val_score(lg, X, y, cv = folds).tolist()
svm_scores = model_selection.cross_val_score(svmc, X, y, cv = folds).tolist()
rf_scores = model_selection.cross_val_score(rf, X, y, cv = folds).tolist()

# Printed order: logistic regression, linear SVM, random forest.
print(avg(lg_scores), avg(svm_scores), avg(rf_scores))

0.9551789077212806 0.9274952919020715 0.9326741996233522
