In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
class DataPreprocessor:
    """Cleaning and feature-preparation helpers around one pandas DataFrame.

    The frame given to the constructor is stored by reference (it is not
    copied), so methods that modify it in place also modify the caller's
    object. ``remove_outliers`` is the exception: it rebinds ``self.data`` to
    a new, filtered frame.
    """

    def __init__(self, data):
        """Store ``data`` (a pandas DataFrame) as the working frame."""
        self.data = data

    def find_null_values(self):
        """Print the null count of each column that has nulls, largest first."""
        missing = self.data.isnull().sum()
        missing = missing[missing > 0].sort_values(ascending=False)
        print(missing)

    def remove_duplicates(self):
        """Drop fully duplicated rows from the working frame, in place."""
        self.data.drop_duplicates(inplace=True)

    def describe_data(self):
        """Print ``DataFrame.describe()`` for the working frame."""
        print(self.data.describe())

    def _impute(self, columns, strategy):
        """Fill NaNs in each of ``columns`` using a SimpleImputer ``strategy``."""
        for column in columns:
            imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
            # Read from self.data (not a notebook-level global) so the class
            # works on whichever frame it was constructed with.
            values = self.data[column].values.reshape(-1, 1)
            # The imputer returns shape (n, 1); flatten before assigning to a column.
            self.data[column] = imputer.fit_transform(values).ravel()

    def handle_missing_values_mean(self, columns):
        """Replace NaNs in each of ``columns`` with that column's mean."""
        self._impute(columns, 'mean')

    def handle_missing_values_frequent(self, columns):
        """Replace NaNs in each of ``columns`` with that column's most frequent value."""
        self._impute(columns, 'most_frequent')

    def remove_outliers(self, columns, threshold=3):
        """Keep only rows whose |z-score| is below ``threshold`` in every column.

        Columns are processed one after another, so each z-score is computed
        on the rows that survived the previous columns. Rows holding NaN in a
        processed column are dropped (their z-score is NaN). Columns with no
        spread (standard deviation of 0 or NaN) are skipped, because every
        z-score would be NaN and the whole frame would be discarded.
        """
        for col in columns:
            series = self.data[col]
            std = series.std()
            if not std > 0:
                continue
            z_scores = (series - series.mean()) / std
            # .copy() detaches the result from its parent so later column
            # assignments do not trigger SettingWithCopyWarning.
            self.data = self.data[z_scores.abs() < threshold].copy()

    def calculate_correlation(self):
        """Print the pairwise correlation matrix of the numeric columns."""
        # Non-numeric columns (e.g. unencoded categoricals) cannot be correlated;
        # selecting numeric/bool columns avoids the warning or error that
        # DataFrame.corr() gives on mixed frames.
        numeric = self.data.select_dtypes(include=["number", "bool"])
        correlation_matrix = numeric.corr()
        print(correlation_matrix)

    def label_encoding(self, column):
        """Replace ``column`` with integer codes from LabelEncoder.

        Returns the working frame itself (``self.data``), not a copy.
        """
        label_encoder = LabelEncoder()
        self.data[column] = label_encoder.fit_transform(self.data[column])
        return self.data

    def data_normalization(self, columns):
        """Rescale ``columns`` in place with MinMaxScaler."""
        scaler = MinMaxScaler()
        self.data[columns] = scaler.fit_transform(self.data[columns])

    def data_standardization(self, columns):
        """Rescale ``columns`` in place with StandardScaler."""
        scaler = StandardScaler()
        self.data[columns] = scaler.fit_transform(self.data[columns])

    def apply_PCA(self, columns, n_components):
        """Run PCA on ``columns`` and return the components as a new DataFrame.

        The result has columns ``PC_1`` .. ``PC_<n_components>`` and a fresh
        0..n-1 index; it does not carry the index of the working frame, and
        the working frame itself is left unchanged.
        """
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(self.data[columns])
        transformed_df = pd.DataFrame(data=pca_result, columns=[f'PC_{i + 1}' for i in range(n_components)])
        return transformed_df

    def split_data(self, features, target, test_size=0.2, validation_size=0.25, random_state=42):
        """Split the working frame into train, validation and test sets.

        ``test_size`` is the fraction of all rows held out for testing;
        ``validation_size`` is the fraction of the *remaining* rows used for
        validation (the defaults give a 60/20/20 split).

        Returns ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
        """
        x = self.data[features]
        y = self.data[target]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_size, random_state=random_state)
        return x_train, x_val, x_test, y_train, y_val, y_test


  and should_run_async(code)


In [3]:
class Classification:
    """Namespace of helpers that each fit one classifier and print its test accuracy.

    Every public method takes ``(X_train, y_train, X_test, y_test)``, fits a
    fresh model on the training data and prints the mean accuracy returned by
    the model's ``score`` on the test data. Nothing is returned.
    """

    @staticmethod
    def _fit_and_report(model, label, X_train, y_train, X_test, y_test):
        """Fit ``model`` and print its test-set accuracy under the name ``label``."""
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        print(f"Accuracy of {label} on test set: {accuracy:.2f}")

    @staticmethod
    def knn(X_train, y_train, X_test, y_test):
        """k-nearest neighbours with k=3."""
        Classification._fit_and_report(
            KNeighborsClassifier(n_neighbors=3), "k-NN (k=3)",
            X_train, y_train, X_test, y_test)

    @staticmethod
    def decision_tree(X_train, y_train, X_test, y_test):
        """Decision tree with a fixed random_state of 42."""
        Classification._fit_and_report(
            DecisionTreeClassifier(random_state=42), "Decision Tree",
            X_train, y_train, X_test, y_test)

    @staticmethod
    def random_forest(X_train, y_train, X_test, y_test):
        """Random forest of 100 trees with a fixed random_state of 42."""
        Classification._fit_and_report(
            RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest",
            X_train, y_train, X_test, y_test)

    @staticmethod
    def naive_bayes(X_train, y_train, X_test, y_test):
        """Gaussian naive Bayes with default settings."""
        Classification._fit_and_report(
            GaussianNB(), "Gaussian Naive Bayes",
            X_train, y_train, X_test, y_test)

    @staticmethod
    def svm(X_train, y_train, X_test, y_test):
        """Support vector classifier with a linear kernel and random_state of 42."""
        Classification._fit_and_report(
            SVC(kernel='linear', random_state=42), "SVM",
            X_train, y_train, X_test, y_test)

  and should_run_async(code)


In [4]:
class Clustering:
    """Namespace of clustering helpers.

    Each method fits one scikit-learn clusterer on ``data`` and returns a copy
    of ``data`` with the assigned labels in a new ``'Cluster'`` column; the
    input itself is not modified. Every column of ``data`` is used as a
    feature.
    """

    @staticmethod
    def kmeans(data, n_clusters, random_state=42):
        """Cluster ``data`` with K-Means into ``n_clusters`` groups.

        ``random_state`` seeds the centroid initialisation so repeated runs
        give the same labels; pass ``None`` for unseeded behaviour.
        """
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model.fit(data)
        clustered_data = data.copy()
        clustered_data['Cluster'] = model.labels_
        return clustered_data

    @staticmethod
    def dbscan(data, eps, min_samples):
        """Cluster ``data`` with DBSCAN using the given ``eps`` and ``min_samples``."""
        model = DBSCAN(eps=eps, min_samples=min_samples)
        cluster_labels = model.fit_predict(data)
        clustered_data = data.copy()
        clustered_data['Cluster'] = cluster_labels
        return clustered_data

    @staticmethod
    def agglomerative_clustering(data, n_clusters):
        """Cluster ``data`` with agglomerative clustering into ``n_clusters`` groups."""
        model = AgglomerativeClustering(n_clusters=n_clusters)
        cluster_labels = model.fit_predict(data)
        clustered_data = data.copy()
        clustered_data['Cluster'] = cluster_labels
        return clustered_data

  and should_run_async(code)


In [5]:
class Rule_Mining:
    """Namespace for association-rule mining helpers."""

    @staticmethod
    def perform_apriori(data, min_support, min_threshold):
        """Mine association rules from ``data`` with Apriori.

        Frequent itemsets are found with ``apriori`` at ``min_support`` (column
        names are used as item names), then turned into rules that reach at
        least ``min_threshold`` confidence.

        Returns the rules DataFrame produced by ``association_rules``.
        """
        frequent_itemsets = apriori(data, min_support=min_support, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_threshold)
        return rules

  and should_run_async(code)


In [6]:
# Reading the data: load the raw wine dataset from "Wines.csv" in the working directory
data = pd.read_csv("Wines.csv")
# Bare expression so the notebook renders the loaded frame
data

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,Fixed_Acidity,Volatile_Acidity,Citric Acid,Sugar,Chlorides,Fixed_Sulfur_dioxide,Total_Sulfur_dioxide,Density,pH,...,Color,Quality,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,1,3.8,0.310,0.02,11.1,0.036,20.0,114.0,0.99248,3.75,...,White,6,,,,,,,,
1,2,3.9,0.225,0.40,4.2,0.030,29.0,118.0,0.98900,3.57,...,White,8,,,,,,,,
2,3,4.2,0.170,0.36,1.8,0.029,93.0,161.0,0.98999,3.65,...,White,7,,,,,,,,
3,4,4.2,0.215,0.23,5.1,0.041,64.0,157.0,0.99688,3.42,...,White,3,,,,,,,,
4,5,4.4,0.460,0.10,2.8,0.024,31.0,111.0,0.98816,3.48,...,White,6,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6493,15.5,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,...,Red,5,,,,,,,,
6493,6494,15.5,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,...,Red,5,,,,,,,,
6494,6495,15.6,0.685,0.76,3.7,0.100,6.0,43.0,1.00320,2.95,...,Red,7,,,,,,,,
6495,6496,15.6,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,...,Red,5,,,,,,,,


In [7]:
# Removing unnamed columns: drop every auto-generated "Unnamed: N" column.
# Selecting by name rather than by position keeps this cell idempotent; the
# positional slice it replaces removed one more real column on every re-run.
# .copy() gives later in-place edits an independent frame to work on.
data = data.loc[:, ~data.columns.str.startswith("Unnamed")].copy()
data

  and should_run_async(code)


Unnamed: 0,Fixed_Acidity,Volatile_Acidity,Citric Acid,Sugar,Chlorides,Fixed_Sulfur_dioxide,Total_Sulfur_dioxide,Density,pH,Sulphates,Alcohol,Color,Quality
0,3.8,0.310,0.02,11.1,0.036,20.0,114.0,0.99248,3.75,0.44,12.4,White,6
1,3.9,0.225,0.40,4.2,0.030,29.0,118.0,0.98900,3.57,0.36,12.8,White,8
2,4.2,0.170,0.36,1.8,0.029,93.0,161.0,0.98999,3.65,0.89,12.0,White,7
3,4.2,0.215,0.23,5.1,0.041,64.0,157.0,0.99688,3.42,0.44,8.0,White,3
4,4.4,0.460,0.10,2.8,0.024,31.0,111.0,0.98816,3.48,0.34,13.1,White,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,15.5,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,0.74,11.1,Red,5
6493,15.5,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,0.74,11.1,Red,5
6494,15.6,0.685,0.76,3.7,0.100,6.0,43.0,1.00320,2.95,0.68,11.2,Red,7
6495,15.6,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,0.74,11.1,Red,5


In [8]:
# Loading the data in Preprocessor class.
# DataPreprocessor keeps a reference to `data` (no copy), so in-place changes
# made through the preprocessor are also visible through `data`.
preprocessor = DataPreprocessor(data)

  and should_run_async(code)


In [9]:
# Check for Null values: prints the null count of each column that has any, largest first
preprocessor.find_null_values()

Chlorides           16
Color               11
Volatile_Acidity    10
Sugar                9
Alcohol              9
Citric Acid          7
dtype: int64


  and should_run_async(code)


In [10]:
# Imputing the Null values for numerical columns with each column's mean
preprocessor.handle_missing_values_mean(['Chlorides', 'Volatile_Acidity', 'Sugar', 'Alcohol', 'Citric Acid'])

# Imputing the Null values for categorical columns with the most frequent value
preprocessor.handle_missing_values_frequent(['Color'])

  and should_run_async(code)


In [11]:
# Check for null values after imputing; an empty Series means no column has nulls left
preprocessor.find_null_values()

Series([], dtype: int64)


  and should_run_async(code)


In [12]:
# Remove duplicate rows (done in place on the preprocessor's frame)
preprocessor.remove_duplicates()

  and should_run_async(code)


In [13]:
# Details of data: prints DataFrame.describe() summary statistics
preprocessor.describe_data()

       Fixed_Acidity  Volatile_Acidity  Citric Acid        Sugar    Chlorides  \
count    5321.000000       5321.000000   5321.00000  5321.000000  5321.000000   
mean        7.214800          0.343975      0.31806     5.053267     0.056618   
std         1.319836          0.167988      0.14672     4.498275     0.036830   
min         3.800000          0.080000      0.00000     0.600000     0.009000   
25%         6.400000          0.230000      0.24000     1.800000     0.038000   
50%         7.000000          0.300000      0.31000     2.700000     0.047000   
75%         7.700000          0.410000      0.40000     7.500000     0.066000   
max        15.900000          1.580000      1.66000    65.800000     0.611000   

       Fixed_Sulfur_dioxide  Total_Sulfur_dioxide      Density           pH  \
count           5321.000000           5321.000000  5321.000000  5321.000000   
mean              30.039090            114.109942     0.994534     3.224689   
std               17.804258      

  and should_run_async(code)


In [14]:
# Correlation Analysis: prints the pairwise correlation matrix.
# NOTE: 'Color' has not been label-encoded yet at this point in the notebook.
preprocessor.calculate_correlation()

                      Fixed_Acidity  Volatile_Acidity  Citric Acid     Sugar  \
Fixed_Acidity              1.000000          0.216835     0.319500 -0.106100   
Volatile_Acidity           0.216835          1.000000    -0.384596 -0.162789   
Citric Acid                0.319500         -0.384596     1.000000  0.146652   
Sugar                     -0.106100         -0.162789     0.146652  1.000000   
Chlorides                  0.282624          0.368182     0.049799 -0.121940   
Fixed_Sulfur_dioxide      -0.281723         -0.348448     0.132752  0.398637   
Total_Sulfur_dioxide      -0.327424         -0.400936     0.198385  0.487126   
Density                    0.478437          0.310440     0.090311  0.520229   
pH                        -0.271346          0.245592    -0.343946 -0.232602   
Sulphates                  0.305115          0.226598     0.056773 -0.174438   
Alcohol                   -0.100216         -0.069864    -0.005436 -0.303413   
Quality                   -0.080746     

  and should_run_async(code)
  correlation_matrix = self.data.corr()


In [15]:
# Remove outliers with a z-score filter on every feature column except
# 'Color' (categorical) and 'Quality' (the target), which are not filtered on.
column_names_array = [name for name in data.columns.tolist() if name not in ('Color', 'Quality')]

preprocessor.remove_outliers(columns=column_names_array)

  and should_run_async(code)


In [16]:
# Perform Label Encoding in the Categorical column named Color.
# label_encoding returns the preprocessor's own frame, so `data` is rebound to
# the same object the preprocessor is working on.
data = preprocessor.label_encoding('Color')

  and should_run_async(code)


In [17]:

# Storing the target column separately and removing it from the dataset before scaling.
# pop() removes the column in place, so re-running this cell on its own raises KeyError.
Quality_column = data.pop('Quality')

  and should_run_async(code)


In [18]:
# Performing Normalization and Standardization.
# Both scalers are applied to the same columns one after the other, so the
# values left in the frame are the standardized ones.
# column_names_array was built without 'Color' and 'Quality', so neither is scaled.
preprocessor.data_normalization(column_names_array)
preprocessor.data_standardization(column_names_array)

  and should_run_async(code)


In [19]:
# Applying Principal Component Analysis (PCA) to two columns, keeping 2 components.
# The result is a separate frame with columns PC_1 and PC_2; the preprocessor's
# own frame is left unchanged.
new_data = preprocessor.apply_PCA(['Alcohol', 'Volatile_Acidity'], 2)
new_data

  and should_run_async(code)


Unnamed: 0,PC_1,PC_2
0,-1.829613,0.828870
1,0.988842,-2.084445
2,-0.894134,2.122585
3,-1.379031,1.279452
4,0.022657,1.964669
...,...,...
4862,0.192850,0.940743
4863,0.923273,-0.358836
4864,-0.486793,1.335807
4865,-0.292048,0.097609


In [20]:
# Adding back the Quality column in dataset (pandas aligns the Series on the row index)
data['Quality'] = Quality_column

  and should_run_async(code)


In [21]:
# Data partitioning: every column except the target 'Quality' is a feature.
# With split_data's defaults this gives a 60/20/20 train/validation/test split.
column_names_array = [name for name in data.columns.tolist() if name != 'Quality']
features = column_names_array

x_train, x_val, x_test, y_train, y_val, y_test = preprocessor.split_data(
    features=features,
    target='Quality',
)

  and should_run_async(code)


In [22]:
# Performing KNN Classification: fit on the training split, print accuracy on the test split.
# The validation split (x_val, y_val) is not used by this call.
Classification.knn(x_train, y_train, x_test, y_test)

  and should_run_async(code)


Accuracy of k-NN (k=3) on test set: 0.49


In [23]:
# Performing Decision Tree Classifier: fit on the training split, print accuracy on the test split
Classification.decision_tree(x_train, y_train, x_test, y_test)

Accuracy of Decision Tree on test set: 0.47


  and should_run_async(code)


In [24]:
# Performing Random Forest Classifier: fit on the training split, print accuracy on the test split
Classification.random_forest(x_train, y_train, x_test, y_test)

  and should_run_async(code)


Accuracy of Random Forest on test set: 0.57


In [25]:
# Performing Naive Bayes Classifier: fit on the training split, print accuracy on the test split
Classification.naive_bayes(x_train, y_train, x_test, y_test)

Accuracy of Gaussian Naive Bayes on test set: 0.44


  and should_run_async(code)


In [26]:
# Performing Support Vector Machines (SVM) Classifier: fit on the training split, print accuracy on the test split
Classification.svm(x_train, y_train, x_test, y_test)

  and should_run_async(code)


Accuracy of SVM on test set: 0.53


In [27]:
# K-Means clustering with three clusters. The helper returns a copy of `data`
# with the labels in a 'Cluster' column; every column of `data`, including
# 'Color' and 'Quality', is used as a clustering feature.
clustered_data = Clustering.kmeans(data, n_clusters=3)
clustered_data

  and should_run_async(code)


Unnamed: 0,Fixed_Acidity,Volatile_Acidity,Citric Acid,Sugar,Chlorides,Fixed_Sulfur_dioxide,Total_Sulfur_dioxide,Density,pH,Sulphates,Alcohol,Color,Quality,Cluster
1,-2.977395,-0.707632,0.660336,-0.196167,-1.015666,-0.069739,0.036637,-1.899166,2.256270,-1.274492,1.879832,1,8,2
3,-2.697191,-0.774708,-0.606986,0.014047,-0.501280,2.071708,0.753707,0.926394,1.273905,-0.635375,-2.173142,1,3,1
4,-2.510388,0.868647,-1.576115,-0.523167,-1.296240,0.052629,-0.092068,-2.200367,1.666851,-1.434271,2.133142,1,6,2
5,-2.510388,-0.070413,0.585788,-0.172810,-1.015666,0.052629,0.202115,-1.884823,1.535869,-1.274492,1.879832,1,8,2
6,-2.510388,1.405252,-1.650664,0.014047,-0.641567,1.337497,-0.349478,-1.461706,1.208414,-0.954933,1.373210,1,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362,3.747500,0.801571,3.120433,-0.429739,0.667778,-1.476975,-1.728460,1.543141,-0.363369,1.122197,0.528840,0,6,0
6364,3.747500,0.399117,1.704013,-0.546525,2.023885,-0.559212,-0.275932,2.081002,-0.625333,-0.076147,-0.906588,0,5,0
6365,3.747500,0.600344,1.182175,-0.558203,1.556262,-1.293423,-1.507823,1.098510,0.095068,2.000983,1.288773,0,7,0
6366,3.747500,-0.137489,1.629465,-0.663310,0.387204,-1.660528,-1.949097,0.510449,-1.345734,2.480321,0.275529,0,7,0


In [28]:
# DBSCAN clustering; a 'Cluster' label of -1 marks points DBSCAN treats as noise.
eps, min_samples = 0.5, 5
clustered_data = Clustering.dbscan(data, eps=eps, min_samples=min_samples)
clustered_data

  and should_run_async(code)


Unnamed: 0,Fixed_Acidity,Volatile_Acidity,Citric Acid,Sugar,Chlorides,Fixed_Sulfur_dioxide,Total_Sulfur_dioxide,Density,pH,Sulphates,Alcohol,Color,Quality,Cluster
1,-2.977395,-0.707632,0.660336,-0.196167,-1.015666,-0.069739,0.036637,-1.899166,2.256270,-1.274492,1.879832,1,8,-1
3,-2.697191,-0.774708,-0.606986,0.014047,-0.501280,2.071708,0.753707,0.926394,1.273905,-0.635375,-2.173142,1,3,-1
4,-2.510388,0.868647,-1.576115,-0.523167,-1.296240,0.052629,-0.092068,-2.200367,1.666851,-1.434271,2.133142,1,6,-1
5,-2.510388,-0.070413,0.585788,-0.172810,-1.015666,0.052629,0.202115,-1.884823,1.535869,-1.274492,1.879832,1,8,-1
6,-2.510388,1.405252,-1.650664,0.014047,-0.641567,1.337497,-0.349478,-1.461706,1.208414,-0.954933,1.373210,1,7,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362,3.747500,0.801571,3.120433,-0.429739,0.667778,-1.476975,-1.728460,1.543141,-0.363369,1.122197,0.528840,0,6,-1
6364,3.747500,0.399117,1.704013,-0.546525,2.023885,-0.559212,-0.275932,2.081002,-0.625333,-0.076147,-0.906588,0,5,-1
6365,3.747500,0.600344,1.182175,-0.558203,1.556262,-1.293423,-1.507823,1.098510,0.095068,2.000983,1.288773,0,7,-1
6366,3.747500,-0.137489,1.629465,-0.663310,0.387204,-1.660528,-1.949097,0.510449,-1.345734,2.480321,0.275529,0,7,-1


In [29]:
# Agglomerative (hierarchical) clustering into three clusters; the labelled
# copy is displayed but not stored in a variable.
n_clusters = 3
Clustering.agglomerative_clustering(data, n_clusters=n_clusters)

  and should_run_async(code)


Unnamed: 0,Fixed_Acidity,Volatile_Acidity,Citric Acid,Sugar,Chlorides,Fixed_Sulfur_dioxide,Total_Sulfur_dioxide,Density,pH,Sulphates,Alcohol,Color,Quality,Cluster
1,-2.977395,-0.707632,0.660336,-0.196167,-1.015666,-0.069739,0.036637,-1.899166,2.256270,-1.274492,1.879832,1,8,0
3,-2.697191,-0.774708,-0.606986,0.014047,-0.501280,2.071708,0.753707,0.926394,1.273905,-0.635375,-2.173142,1,3,2
4,-2.510388,0.868647,-1.576115,-0.523167,-1.296240,0.052629,-0.092068,-2.200367,1.666851,-1.434271,2.133142,1,6,0
5,-2.510388,-0.070413,0.585788,-0.172810,-1.015666,0.052629,0.202115,-1.884823,1.535869,-1.274492,1.879832,1,8,0
6,-2.510388,1.405252,-1.650664,0.014047,-0.641567,1.337497,-0.349478,-1.461706,1.208414,-0.954933,1.373210,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362,3.747500,0.801571,3.120433,-0.429739,0.667778,-1.476975,-1.728460,1.543141,-0.363369,1.122197,0.528840,0,6,1
6364,3.747500,0.399117,1.704013,-0.546525,2.023885,-0.559212,-0.275932,2.081002,-0.625333,-0.076147,-0.906588,0,5,1
6365,3.747500,0.600344,1.182175,-0.558203,1.556262,-1.293423,-1.507823,1.098510,0.095068,2.000983,1.288773,0,7,1
6366,3.747500,-0.137489,1.629465,-0.663310,0.387204,-1.660528,-1.949097,0.510449,-1.345734,2.480321,0.275529,0,7,1


In [30]:
# Performing apriori algorithm (Association Rule Mining)

# One-hot encode any non-numeric columns (numeric columns pass through unchanged)
data_encoded = pd.get_dummies(data)
# Binarize the dataset: 1 where the value is >= 1, otherwise 0.
# The vectorized comparison gives the same result as an element-wise lambda.
data_binarized = (data_encoded >= 1).astype(int)

min_supp = 0.1
min_thresh = 0.7
# The result must not be named `association_rules`: that would overwrite the
# mlxtend function of the same name that Rule_Mining.perform_apriori calls,
# so this cell would fail with "object is not callable" when run a second time.
mined_rules = Rule_Mining.perform_apriori(data_binarized, min_support=min_supp, min_threshold=min_thresh)
mined_rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Fixed_Acidity),(Quality),0.143209,1.0,0.143209,1.0,1.0,0.0,inf,0.0
1,(Volatile_Acidity),(Quality),0.163961,1.0,0.163961,1.0,1.0,0.0,inf,0.0
2,(Citric Acid),(Color),0.147524,0.778303,0.113417,0.768802,0.987793,-0.001402,0.958907,-0.014289
3,(Citric Acid),(Quality),0.147524,1.0,0.147524,1.0,1.0,0.0,inf,0.0
4,(Sugar),(Color),0.175057,0.778303,0.174029,0.994131,1.277307,0.037782,37.777193,0.263173
5,(Sugar),(Quality),0.175057,1.0,0.175057,1.0,1.0,0.0,inf,0.0
6,(Chlorides),(Quality),0.166221,1.0,0.166221,1.0,1.0,0.0,inf,0.0
7,(Fixed_Sulfur_dioxide),(Color),0.176906,0.778303,0.174646,0.987224,1.268432,0.036959,17.35284,0.257109
8,(Fixed_Sulfur_dioxide),(Quality),0.176906,1.0,0.176906,1.0,1.0,0.0,inf,0.0
9,(Total_Sulfur_dioxide),(Color),0.165811,0.778303,0.165605,0.998761,1.283255,0.036554,178.909595,0.264606
