In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
class DataPreprocessor:
    """Bundle of common cleaning / preparation steps around one pandas DataFrame.

    The frame is kept in ``self.data``. Most methods change it (in place or by
    rebinding ``self.data``), so the order in which they are called matters.
    """

    def __init__(self, data):
        """Store ``data`` by reference (no copy) as the frame every method works on."""
        self.data = data

    def find_null_values(self):
        """Print the null count of every column that has at least one null, largest first."""
        null_counts = self.data.isnull().sum()
        null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
        print(null_counts)

    def remove_duplicates(self):
        """Drop duplicated rows from ``self.data`` in place."""
        self.data.drop_duplicates(inplace=True)

    def describe_data(self):
        """Print the summary statistics returned by ``DataFrame.describe()``."""
        print(self.data.describe())

    def _impute(self, columns, strategy):
        """Fill NaNs in each of ``columns`` using a ``SimpleImputer`` with ``strategy``."""
        for column in columns:
            imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
            # Read from self.data (not a notebook-level global) so the method
            # works for whichever frame this instance wraps.
            values = self.data[column].to_numpy().reshape(-1, 1)
            # The imputer returns an (n, 1) array; flatten it back to one column.
            self.data[column] = imputer.fit_transform(values).ravel()

    def handle_missing_values_mean(self, columns):
        """Replace NaNs in each listed (numeric) column with that column's mean."""
        self._impute(columns, 'mean')

    def handle_missing_values_frequent(self, columns):
        """Replace NaNs in each listed column with that column's most frequent value."""
        self._impute(columns, 'most_frequent')

    def remove_outliers(self, columns, threshold=3):
        """Drop rows whose absolute z-score in any of ``columns`` is >= ``threshold``.

        Columns are processed one after another, so each column's mean and
        standard deviation are computed on the rows that survived the previous
        columns. Rows with a missing value in a processed column are dropped as
        well, because a NaN z-score never satisfies the ``<`` comparison.
        Columns whose standard deviation is zero or undefined are skipped:
        their z-scores would all be NaN/inf and every row would be removed.
        """
        frame = self.data
        for col in columns:
            spread = frame[col].std()
            if pd.isna(spread) or spread == 0:
                continue
            z_scores = (frame[col] - frame[col].mean()) / spread
            frame = frame[z_scores.abs() < threshold]
        if frame is not self.data:
            # Store an independent copy so later column assignments do not
            # write into a slice of the previous frame.
            self.data = frame.copy()

    def calculate_correlation(self):
        """Print the pairwise correlation matrix of the numeric/boolean columns."""
        # Non-numeric columns are excluded explicitly: DataFrame.corr() raises
        # on text columns in pandas 2.x.
        numeric_part = self.data.select_dtypes(include=["number", "bool"])
        correlation_matrix = numeric_part.corr()
        print(correlation_matrix)

    def label_encoding(self, column):
        """Replace ``column`` with integer codes from a ``LabelEncoder``; return ``self.data``."""
        label_encoder = LabelEncoder()
        self.data[column] = label_encoder.fit_transform(self.data[column])
        return self.data

    def data_normalization(self, columns):
        """Rescale ``columns`` in ``self.data`` with a default ``MinMaxScaler``."""
        scaler = MinMaxScaler()
        self.data[columns] = scaler.fit_transform(self.data[columns])

    def data_standardization(self, columns):
        """Rescale ``columns`` in ``self.data`` with a default ``StandardScaler``."""
        scaler = StandardScaler()
        self.data[columns] = scaler.fit_transform(self.data[columns])

    def apply_PCA(self, columns, n_components):
        """Run PCA on ``columns`` and return the components as a new DataFrame.

        ``self.data`` is not modified. The result has columns ``PC_1``,
        ``PC_2``, ... and carries the same row index as ``self.data`` so it can
        be joined back onto the source rows.
        """
        pca = PCA(n_components=n_components)
        components = pca.fit_transform(self.data[columns])
        # Name columns from the actual output width, so a non-integer
        # n_components accepted by PCA does not break the labelling.
        names = [f'PC_{i + 1}' for i in range(components.shape[1])]
        return pd.DataFrame(data=components, columns=names, index=self.data.index)

    def split_data(self, features, target, test_size=0.2, validation_size=0.25, random_state=42):
        """Split ``self.data`` into train / validation / test parts.

        ``test_size`` is taken from the whole frame first; ``validation_size``
        is then taken from the remaining training portion.

        Returns:
            ``(x_train, x_val, x_test, y_train, y_val, y_test)``
        """
        x = self.data[features]
        y = self.data[target]
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )
        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, test_size=validation_size, random_state=random_state
        )
        return x_train, x_val, x_test, y_train, y_val, y_test


In [None]:
class Classification:
    """Namespace of helpers that each fit one classifier and print its test accuracy.

    Every public method takes ``(X_train, y_train, X_test, y_test)``, fits a
    fresh model on the training data, prints the mean accuracy on the test
    data and returns ``None``. The methods are static, so they can be called
    on the class or on an instance.
    """

    @staticmethod
    def _fit_and_report(model, label, X_train, y_train, X_test, y_test):
        """Fit ``model`` and print its accuracy on the test set under ``label``."""
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        print(f"Accuracy of {label} on test set: {accuracy:.2f}")

    @staticmethod
    def knn(X_train, y_train, X_test, y_test):
        """k-nearest-neighbours classifier with k=3."""
        Classification._fit_and_report(
            KNeighborsClassifier(n_neighbors=3), "k-NN (k=3)",
            X_train, y_train, X_test, y_test,
        )

    @staticmethod
    def decision_tree(X_train, y_train, X_test, y_test):
        """Decision tree classifier (random_state=42)."""
        Classification._fit_and_report(
            DecisionTreeClassifier(random_state=42), "Decision Tree",
            X_train, y_train, X_test, y_test,
        )

    @staticmethod
    def random_forest(X_train, y_train, X_test, y_test):
        """Random forest classifier with 100 trees (random_state=42)."""
        Classification._fit_and_report(
            RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest",
            X_train, y_train, X_test, y_test,
        )

    @staticmethod
    def naive_bayes(X_train, y_train, X_test, y_test):
        """Gaussian naive Bayes classifier."""
        Classification._fit_and_report(
            GaussianNB(), "Gaussian Naive Bayes",
            X_train, y_train, X_test, y_test,
        )

    @staticmethod
    def svm(X_train, y_train, X_test, y_test):
        """Support vector classifier with a linear kernel (random_state=42)."""
        Classification._fit_and_report(
            SVC(kernel='linear', random_state=42), "SVM",
            X_train, y_train, X_test, y_test,
        )

In [None]:
class Clustering:
    """Namespace of clustering helpers.

    Each method fits one clustering algorithm on ``data`` (a DataFrame) and
    returns a copy of ``data`` with the assigned labels in a new ``'Cluster'``
    column; the input frame is left untouched. The methods are static, so they
    can be called on the class or on an instance.
    """

    @staticmethod
    def _with_labels(data, cluster_labels):
        """Return a copy of ``data`` with ``cluster_labels`` stored as 'Cluster'."""
        clustered_data = data.copy()
        clustered_data['Cluster'] = cluster_labels
        return clustered_data

    @staticmethod
    def kmeans(data, n_clusters, random_state=42):
        """K-means with ``n_clusters`` clusters.

        ``random_state`` seeds the centroid initialisation so repeated runs
        give the same labels; pass ``None`` for an unseeded run.
        """
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model.fit(data)
        return Clustering._with_labels(data, model.labels_)

    @staticmethod
    def dbscan(data, eps, min_samples):
        """DBSCAN with the given ``eps`` and ``min_samples``."""
        model = DBSCAN(eps=eps, min_samples=min_samples)
        return Clustering._with_labels(data, model.fit_predict(data))

    @staticmethod
    def agglomerative_clustering(data, n_clusters):
        """Agglomerative clustering into ``n_clusters`` clusters."""
        model = AgglomerativeClustering(n_clusters=n_clusters)
        return Clustering._with_labels(data, model.fit_predict(data))

In [None]:
class Rule_Mining:
    """Namespace for association-rule mining helpers."""

    @staticmethod
    def perform_apriori(data, min_support, min_threshold):
        """Mine association rules from ``data`` with the Apriori algorithm.

        Frequent itemsets are found with ``apriori`` (``min_support``, column
        names kept as item names) and then turned into rules with
        ``association_rules``, keeping rules whose confidence is at least
        ``min_threshold``.

        Returns:
            Whatever ``association_rules`` returns for those itemsets.
        """
        frequent_itemsets = apriori(data, min_support=min_support, use_colnames=True)
        return association_rules(
            frequent_itemsets, metric="confidence", min_threshold=min_threshold
        )

In [None]:
# Reading the data: load "Wines.csv" (path is relative to the working directory)
# into a DataFrame and display it
data = pd.read_csv("Wines.csv")
data

In [None]:
# Removing unnamed columns: keep only the columns at positions 1-13 (column 0 and
# anything from position 14 onward are dropped).
# NOTE(review): `data` is reassigned from itself, so re-running this cell slices
# again and drops one more leading column. Presumably column 0 is a saved index
# column - confirm against the CSV.
data = data.iloc[:,1:14]
data

In [None]:
# Loading the data in Preprocessor class. The frame is stored by reference (no copy),
# so in-place changes made through the preprocessor are also visible through `data`.
preprocessor = DataPreprocessor(data)

In [None]:
# Check for Null values: prints the null count of every column that contains any
preprocessor.find_null_values()

In [None]:
# Imputing the Null Values for numerical columns (mean strategy)
preprocessor.handle_missing_values_mean(['Chlorides', 'Volatile_Acidity', 'Sugar', 'Alcohol', 'Citric Acid'])

# Imputing the Null values for categorical columns (most-frequent strategy)
preprocessor.handle_missing_values_frequent(['Color'])

In [None]:
# Check for null values after imputing; the imputed columns should no longer be listed
preprocessor.find_null_values()

In [None]:
# Remove Duplicate Rows (done in place on the preprocessor's frame)
preprocessor.remove_duplicates()

In [None]:
# Details of data: prints the summary statistics from DataFrame.describe()
preprocessor.describe_data()

In [None]:
# Correlation Analysis: prints the pairwise correlation matrix of the data
preprocessor.calculate_correlation()

In [None]:
# Remove Outliers: use every column except 'Color' (categorical) and 'Quality' (target)
column_names_array = data.columns.tolist()
column_names_array.remove('Color')
column_names_array.remove('Quality')

# Uses the default z-score threshold of 3. The preprocessor rebinds its own frame to
# the filtered rows, so `data` still refers to the unfiltered frame after this call.
preprocessor.remove_outliers(column_names_array)

In [None]:
# Perform Label Encoding in the Categorical column named Color.
# label_encoding returns the preprocessor's own frame, so `data` is re-bound here to
# the frame the preprocessor is currently working on.
data = preprocessor.label_encoding('Color')

In [None]:

# Storing the Target column separately and removing it from the dataset for performing preprocessing.
# pop() removes the column in place, so re-running this cell on its own raises a KeyError.
Quality_column = data.pop('Quality')

In [None]:
# Performing Normalization (MinMaxScaler) and then Standardization (StandardScaler)
# on the same feature columns.
# NOTE(review): standardizing after min-max scaling should give the same result as
# standardizing alone - confirm that applying both steps is intended.
preprocessor.data_normalization(column_names_array)
preprocessor.data_standardization(column_names_array)

In [None]:
# Applying Principal Component Analysis (PCA) to 'Alcohol' and 'Volatile_Acidity' with
# 2 components; the result is a separate frame and is not written back into `data`
new_data = preprocessor.apply_PCA(['Alcohol', 'Volatile_Acidity'], 2)
new_data

In [None]:
# Adding back the Quality column in dataset (it is appended as the last column)
data['Quality'] = Quality_column

In [None]:
# Data Partitioning: every column except the target 'Quality' is used as a feature
column_names_array = data.columns.tolist()
column_names_array.remove('Quality')
features = column_names_array

# Default sizes apply: 20% of the rows go to the test set, then 25% of the remaining
# training rows go to the validation set (random_state=42 for both splits).
x_train, x_val, x_test, y_train, y_val, y_test = preprocessor.split_data(features=features, target='Quality')

In [None]:
# Performing KNN Classification (k=3); prints the accuracy on the test set.
# The validation split (x_val, y_val) is not used here.
Classification.knn(x_train, y_train, x_test, y_test)

In [None]:
# Performing Decision Tree Classifier; prints the accuracy on the test set
Classification.decision_tree(x_train, y_train, x_test, y_test)

In [None]:
# Performing Random Forest Classifier (100 trees); prints the accuracy on the test set
Classification.random_forest(x_train, y_train, x_test, y_test)

In [None]:
# Performing Naive Bayes Classifier (Gaussian); prints the accuracy on the test set
Classification.naive_bayes(x_train, y_train, x_test, y_test)

In [None]:
# Performing Support Vector Machines (SVM) Classifier with a linear kernel;
# prints the accuracy on the test set
Classification.svm(x_train, y_train, x_test, y_test)

In [None]:
# Performing K Means clustering with 3 clusters; the result is a copy of `data`
# with an added 'Cluster' column.
# NOTE(review): `data` includes the 'Quality' target again at this point, so it takes
# part in the clustering - confirm this is intended.
clustered_data = Clustering.kmeans(data,3)
clustered_data

In [None]:
# Performing DBSCAN with the parameters below; this overwrites `clustered_data`
# from the K Means cell
eps = 0.5
min_samples = 5
clustered_data = Clustering.dbscan(data, eps, min_samples)
clustered_data

In [None]:
# Performing agglomerative clustering into 3 clusters; the labelled copy is only
# displayed, not stored in a variable
n_clusters = 3
Clustering.agglomerative_clustering(data, n_clusters)

In [None]:
# Performing apriori algorithm (Association Rule Mining)

# One-hot encode any remaining categorical columns
data_encoded = pd.get_dummies(data)
# Binarize the dataset: an item counts as present (1) in a row when its value is >= 1,
# otherwise 0. The vectorized comparison gives the same 0/1 table as the element-wise
# applymap it replaces (applymap is deprecated in pandas 2.1).
data_binarized = (data_encoded >= 1).astype(int)

min_supp = 0.1
min_thresh = 0.7
# Keep the result in `rules`, not `association_rules`: that name is the imported mlxtend
# function used inside Rule_Mining.perform_apriori, and shadowing it with a DataFrame
# makes this cell fail when it is run a second time.
rules = Rule_Mining.perform_apriori(data_binarized, min_support=min_supp, min_threshold=min_thresh)
rules