# Group assignment IBM 322
# Karanjit Singh
# 21117058
# Gaussian Mixture Modelling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import random
from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
def load_images(folder_path, label, num_images):

    # Listing all files in the folder
    files = os.listdir(folder_path)

    # Filtering out only image files
    image_files = [file for file in files if file.lower().endswith(('.png', '.jpg'))]

    selected_files = random.sample(image_files,num_images)

    # lists to store images and labels
    images = []
    labels = []

    # Adding image files to the lists
    for image_file in selected_files:
        image_path = os.path.join(folder_path, image_file)

        try:
            # Opening and resizing the image to a common size
            img = Image.open(image_path).resize((32, 32))

            # Converting the image to a numpy array and flattening it
            img_array = np.array(img).flatten()

            # Append the flattened image to the list of images
            images.append(img_array)

            # Assign the label based on the provided label argument
            labels.append(label)

            # Closing the image
            img.close()
        except Exception as e:
            print(f"Error processing {image_file}: {str(e)}")

    return np.array(images), np.array(labels)

In [3]:
num_images_per_class=50000
real_images, real_labels = load_images(r"C:\Users\hp\OneDrive\Desktop\ibm\train\REAL", label=1,num_images=num_images_per_class)
fake_images, fake_labels = load_images(r"C:\Users\hp\OneDrive\Desktop\ibm\train\FAKE", label=0,num_images=num_images_per_class)

In [4]:
all_images = np.concatenate((real_images, fake_images), axis=0)
all_labels = np.concatenate((real_labels, fake_labels), axis=0)

In [6]:
print(all_images)
print(all_labels)

[[  7   0   1 ...  10  21  27]
 [144 143  89 ... 141 137  50]
 [168 192 228 ...  98 137 154]
 ...
 [  0  46  68 ... 173 155 155]
 [149 146 141 ...  89  98 115]
 [ 18   0   0 ... 123  64  32]]
[1 1 1 ... 0 0 0]


In [20]:
# Shuffling the indices
indices = np.arange(all_images.shape[0])
np.random.shuffle(indices)

# Using the shuffled indices to shuffle the images and labels
shuffled_images = all_images[indices]
shuffled_labels = all_labels[indices]

In [23]:
scaler = StandardScaler()
scaled_images = scaler.fit_transform(shuffled_images)
print(scaled_images)

[[-0.59800536 -0.40321443 -0.51464464 ...  1.75928582  0.3408114
  -0.30047951]
 [ 2.00142726  1.9556149   1.70896529 ...  1.3507758   1.19553184
   0.53476329]
 [ 0.234962    0.31158234  0.58480694 ...  0.99128699  1.27932796
   1.44593726]
 ...
 [-1.41661122 -1.50400144 -1.03348695 ...  1.97171104  2.08377073
   2.00782787]
 [ 1.1684599   1.05497098  1.11600265 ...  0.45205376  0.57544054
   0.73218432]
 [-0.56928235 -0.61765345 -0.62582513 ... -0.08717947 -0.98316733
  -0.92311505]]


In [24]:
print(shuffled_labels)

[0 1 1 ... 1 1 0]


In [25]:
# Splitting the training data into training and testing sets 60/40 split
X_train, X_test, y_train, y_test = train_test_split(scaled_images, shuffled_labels, test_size=0.4, random_state=42)


In [27]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
gmm.fit(X_train)

In [28]:
z=gmm.predict(X_test)
z

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [41]:
print(gmm.means_)


[[-0.21117905 -0.22053417 -0.3045239  ... -0.07160053 -0.15377584
  -0.28818915]
 [ 0.09495767  0.09802551  0.13751986 ...  0.03557691  0.07243379
   0.135802  ]]


In [56]:
import csv

In [72]:
# with open('Mean for fake.csv', 'w') as csvfile:
#     writer = csv.writer(csvfile, lineterminator='', escapechar='\\')
#     q=[gmm.means_[0]]
#     for row in q:
#         writer.writerow(row)


In [74]:
# with open('Mean for real.csv', 'w') as csvfile:
#     writer = csv.writer(csvfile, lineterminator='', escapechar='\\')
#     q=[gmm.means_[1]]
#     for row in q:
#         writer.writerow(row)

In [29]:
accuracy_score(y_test,z)

0.638725

In [75]:
print(confusion_matrix(y_test,z))

[[ 9492 10400]
 [ 4051 16057]]


In [33]:
print(classification_report(y_test,z))

              precision    recall  f1-score   support

           0       0.70      0.48      0.57     19892
           1       0.61      0.80      0.69     20108

    accuracy                           0.64     40000
   macro avg       0.65      0.64      0.63     40000
weighted avg       0.65      0.64      0.63     40000

