In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [2]:
# PCA transformation function
def pca_transform(indep_X, n):
    pca = PCA(n_components=n)
    pca_features = pca.fit_transform(indep_X)
    explained_variance = pca.explained_variance_ratio_
    return pca_features, explained_variance

In [3]:
# Data splitting and scaling function
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# Prediction and evaluation function
def cm_prediction(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, accuracy, report, X_test, y_test, cm


In [4]:
# Logistic regression classifier
def logistic(X_train, y_train, X_test, y_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Support Vector Machine - linear kernel
def svm_linear(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Support Vector Machine - non-linear kernel
def svm_nl(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Naive Bayes classifier
def naive(X_train, y_train, X_test, y_test):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# K-Nearest Neighbors classifier
def knn(X_train, y_train, X_test, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Decision Tree classifier
def decision(X_train, y_train, X_test, y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Random Forest classifier
def random_forest(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, accuracy, report, X_test, y_test, cm = cm_prediction(classifier, X_test, y_test)
    return classifier, accuracy, report, X_test, y_test, cm

# Classification results function
def select_pca_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    dataframe = pd.DataFrame(index=['PCA'], columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Naive', 'Decision', 'Random'])
    dataframe['Logistic']['PCA'] = acclog
    dataframe['SVMl']['PCA'] = accsvml
    dataframe['SVMnl']['PCA'] = accsvmnl
    dataframe['KNN']['PCA'] = accknn
    dataframe['Naive']['PCA'] = accnav
    dataframe['Decision']['PCA'] = accdes
    dataframe['Random']['PCA'] = accrf
    return dataframe

In [5]:
# Load dataset
dataset1 = pd.read_csv("Wine.csv", index_col=None)

# One-hot encode categorical variables, if any
df2 = pd.get_dummies(dataset1, drop_first=True)

# Split the independent and dependent variables
X = df2.iloc[:, :-1].values  # All columns except the last one (features)
y = df2.iloc[:, -1].values   # Last column (target)

In [23]:
# Apply PCA transformation (reduce to 6 components)
kbestPCA, explained_variance = pca_transform(X, 7)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = split_scalar(kbestPCA, y)

# Initialize lists to store accuracy results
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = [], [], [], [], [], [], []


In [24]:
kbestPCA

array([[ 3.18562979e+02,  2.14921307e+01, -3.13073470e+00, ...,
        -6.77078222e-01,  5.68081040e-01,  6.19641832e-01],
       [ 3.03097420e+02, -5.36471768e+00, -6.82283550e+00, ...,
         4.86095978e-01,  1.43398712e-02, -1.08865121e-01],
       [ 4.38061133e+02, -6.53730945e+00,  1.11322298e+00, ...,
        -3.80651426e-01,  6.72403748e-01, -7.85818858e-01],
       ...,
       [ 8.84580737e+01,  1.87762846e+01,  2.23757651e+00, ...,
         1.05733568e+00,  2.15000027e-01, -6.48488641e-01],
       [ 9.34562419e+01,  1.86708191e+01,  1.78839152e+00, ...,
         2.76956208e-01, -1.00922935e+00, -4.14948247e-01],
       [-1.86943190e+02, -2.13330803e-01,  5.63050984e+00, ...,
        -7.03463092e-03,  7.03635720e-01,  6.93760896e-01]])

In [25]:

# Apply classifiers
classifier, Accuracy, report, X_test, y_test, cm = logistic(X_train, y_train, X_test, y_test)
acclog.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = svm_nl(X_train, y_train, X_test, y_test)
accsvmnl.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = knn(X_train, y_train, X_test, y_test)
accknn.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = naive(X_train, y_train, X_test, y_test)
accnav.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = decision(X_train, y_train, X_test, y_test)
accdes.append(Accuracy)

classifier, Accuracy, report, X_test, y_test, cm = random_forest(X_train, y_train, X_test, y_test)
accrf.append(Accuracy)

# Collect results into a DataFrame
result = select_pca_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic']['PCA'] = acclog
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or S

In [14]:
result
#5

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
PCA,[0.9333333333333333],[0.9333333333333333],[0.9777777777777777],[0.9555555555555556],[0.9777777777777777],[0.9111111111111111],[0.9555555555555556]


In [22]:
result
#3

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
PCA,[0.9111111111111111],[0.8888888888888888],[0.8666666666666667],[0.8222222222222222],[0.8666666666666667],[0.8444444444444444],[0.8444444444444444]


In [18]:
result
#4

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
PCA,[0.9111111111111111],[0.9111111111111111],[0.9333333333333333],[0.9111111111111111],[0.9555555555555556],[0.9111111111111111],[0.9111111111111111]


In [10]:
result
#6

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
PCA,[0.9333333333333333],[0.9555555555555556],[1.0],[0.9333333333333333],[0.9777777777777777],[0.9333333333333333],[0.9777777777777777]


In [26]:
result
#7

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,Decision,Random
PCA,[0.9777777777777777],[0.9555555555555556],[1.0],[0.9777777777777777],[0.9555555555555556],[0.8888888888888888],[0.9111111111111111]
