In [2]:
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import tensorflow as tf
from IPython.display import display, Image
import shutil
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

In [3]:
# CNN for binary image classification on 180x180 RGB inputs:
# four Conv2D(3x3, ReLU) + 2x2 max-pooling stages, a Flatten, then a stack of
# Dense/Dropout pairs feeding a single sigmoid unit.
keras_layers = tf.keras.layers

# Only the first convolution declares the input shape.
layer_stack = [
    keras_layers.Conv2D(16, (3, 3), activation="relu", input_shape=(180, 180, 3)),
    keras_layers.MaxPooling2D(2, 2),
]
# The remaining convolutional stages differ only in their filter count.
for n_filters in (32, 64, 128):
    layer_stack.append(keras_layers.Conv2D(n_filters, (3, 3), activation="relu"))
    layer_stack.append(keras_layers.MaxPooling2D(2, 2))

layer_stack.append(keras_layers.Flatten())

# (units, dropout rate) of each fully connected stage; every Dropout uses seed 2019.
for n_units, drop_rate in ((550, 0.1), (400, 0.3), (300, 0.4), (200, 0.2)):
    layer_stack.append(keras_layers.Dense(n_units, activation="relu"))
    layer_stack.append(keras_layers.Dropout(drop_rate, seed=2019))

layer_stack.append(keras_layers.Dense(1, activation="sigmoid"))  # Output layer for binary classification

model = tf.keras.models.Sequential(layer_stack)

In [4]:
# Build the test-split file names: one id per line of test_ids.txt, '.jpg' appended
with open('test_ids.txt', 'r') as file:
    lines = file.readlines()

# strip() drops the trailing newline (and surrounding whitespace) of each line
test_ids = [raw_id.strip() + '.jpg' for raw_id in lines]

In [5]:
# Build the training-split file names: one id per line of train_ids.txt, '.jpg' appended
with open('train_ids.txt', 'r') as file:
    lines = file.readlines()

# strip() drops the trailing newline (and surrounding whitespace) of each line
train_ids = [raw_id.strip() + '.jpg' for raw_id in lines]

In [6]:
# Build the validation-split file names: one id per line of val_ids.txt, '.jpg' appended
with open('val_ids.txt', 'r') as file:
    lines = file.readlines()

# strip() drops the trailing newline (and surrounding whitespace) of each line
val_ids = [raw_id.strip() + '.jpg' for raw_id in lines]

In [7]:
# Report how many image names each split list contains
for split_label, split_ids in (
    ("training", train_ids),
    ("validation", val_ids),
    ("testing", test_ids),
):
    print(f"Number of images in {split_label} dataset: ", len(split_ids))

Number of images in training dataset:  134823
Number of images in validation dataset:  5000
Number of images in testing dataset:  10000


In [8]:
#get image files
def get_image_files(folder_path):
    """Return the names of the entries in ``folder_path`` that end in '.jpg'.

    The extension check is case-insensitive. Names are returned without the
    folder prefix, in the order produced by ``os.listdir``.
    """
    return [
        entry
        for entry in os.listdir(folder_path)
        if entry.lower().endswith('.jpg')
    ]
        

In [9]:
# Project folder that holds one sub-folder per dataset split
project_root = '/Users/raghavraahul/Downloads/Machine Learning under a Modern Optimization Lens/ML Project'
train_path = project_root + '/train'
test_path = project_root + '/test'
val_path = project_root + '/valid'

In [10]:
def copy_files(source_path, destination_path, titles):
    """Copy the named files from ``source_path`` into ``destination_path``.

    Args:
        source_path: Folder that currently contains the files.
        destination_path: Folder to copy into; created if it does not exist.
        titles: Iterable of file names (relative to ``source_path``) to copy.

    Raises:
        FileNotFoundError: If one of the named files is missing from
            ``source_path`` (raised by ``shutil.copy``).
    """
    # Without the folder, shutil.copy would fail on the first file.
    os.makedirs(destination_path, exist_ok=True)
    for title in titles:
        # Read from the folder that was passed in, not from a notebook-level global.
        source_file = os.path.join(source_path, title)
        destination_file = os.path.join(destination_path, title)
        shutil.copy(source_file, destination_file)

In [11]:
#data_directory = "/Users/raghavraahul/Downloads/Machine Learning under a Modern Optimization Lens/ML Project/img_resized/"

In [12]:
#already done
#copy_files(data_directory, train_path, train_ids)
#copy_files(data_directory, val_path, val_ids)
#copy_files(data_directory, test_path, test_ids)

In [13]:
# Number of .jpg files found directly inside the training folder
train_image_count = len(get_image_files(train_path))
train_image_count

134823

In [14]:
import json

# Location of the JSON file to load
json_file_path = '/Users/raghavraahul/Downloads/Machine Learning under a Modern Optimization Lens/ML Project/MMHS150K_GT.json'

# Read the whole file as text, then parse it into Python objects
with open(json_file_path, 'r') as json_file:
    raw_json = json_file.read()
data = json.loads(raw_json)

In [15]:
# One row per top-level key of `data`; the keys start out as the index and are
# then moved into an ordinary column that is renamed from "index" to "id"
newColumns = {"index": "id"}
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()
    .rename(columns=newColumns)
)

In [16]:
#0 for nothate, #1 for hate
def labels_to_binary(labels):
    """Collapse a sequence of numeric labels into a single binary class.

    Returns 0 (not hate) when the labels sum to at most 1, otherwise 1 (hate).
    """
    # NOTE(review): summing the labels only counts "hate" votes if every label
    # is a 0/1 flag. If the labels are category codes greater than 1, a single
    # vote can push the sum past the threshold. Confirm against the dataset spec.
    return 0 if sum(labels) <= 1 else 1

In [17]:
# Derive the binary target from each row's "labels" entry, then show how many rows are class 1
label_column = "Hate(1)_vs_NotHate(0)"
df[label_column] = df["labels"].apply(labels_to_binary)
(df[label_column] == 1).sum()

61547

In [18]:
# Ids of all rows whose binary target is 0 (not hate)
nothate_ids = df.loc[df["Hate(1)_vs_NotHate(0)"] == 0, "id"].tolist()

In [19]:
# Ids of all rows whose binary target is 1 (hate)
hate_ids = df.loc[df["Hate(1)_vs_NotHate(0)"] == 1, "id"].tolist()

In [20]:
# Turn the ids into image file names so they match the entries of the split lists.
# NOTE: this rebinds both lists in place, so re-running the cell appends '.jpg' again.
nothate_ids = [image_id + '.jpg' for image_id in nothate_ids]
hate_ids = [image_id + '.jpg' for image_id in hate_ids]

In [21]:
# Split the hate image names by dataset partition.
# Membership is tested against sets: `in` on the id lists would rescan up to
# ~135k names for every hate id, which makes this loop quadratic.
train_id_lookup = set(train_ids)
test_id_lookup = set(test_ids)

hate_train_ids = []
hate_val_ids = []
hate_test_ids = []
for id_ in hate_ids:
    if id_ in train_id_lookup:
        hate_train_ids.append(id_)
    elif id_ in test_id_lookup:
        hate_test_ids.append(id_)
    else:
        # Anything in neither the train nor the test list is treated as validation
        hate_val_ids.append(id_)

In [22]:
# Split the not-hate image names by dataset partition.
# Membership is tested against sets: `in` on the id lists would rescan up to
# ~135k names for every not-hate id, which makes this loop quadratic.
train_id_lookup = set(train_ids)
test_id_lookup = set(test_ids)

nothate_train_ids = []
nothate_val_ids = []
nothate_test_ids = []
for id_ in nothate_ids:
    if id_ in train_id_lookup:
        nothate_train_ids.append(id_)
    elif id_ in test_id_lookup:
        nothate_test_ids.append(id_)
    else:
        # Anything in neither the train nor the test list is treated as validation
        nothate_val_ids.append(id_)

In [23]:
# Project folder that holds one sub-folder per dataset split
project_root = '/Users/raghavraahul/Downloads/Machine Learning under a Modern Optimization Lens/ML Project'

# Split folders
train_path = project_root + '/train'
test_path = project_root + '/test'
val_path = project_root + '/valid'

# Each split folder has one sub-folder per class: 'notHate' and 'hate'
train_nothate_path = train_path + '/notHate'
train_hate_path = train_path + '/hate'
test_nothate_path = test_path + '/notHate'
test_hate_path = test_path + '/hate'
val_nothate_path = val_path + '/notHate'
val_hate_path = val_path + '/hate'

In [24]:
# Count the .jpg files in each class folder, one split at a time
val_nothate, val_hate = (
    len(get_image_files(folder)) for folder in (val_nothate_path, val_hate_path)
)
test_nothate, test_hate = (
    len(get_image_files(folder)) for folder in (test_nothate_path, test_hate_path)
)
train_nothate, train_hate = (
    len(get_image_files(folder)) for folder in (train_nothate_path, train_hate_path)
)

In [25]:
# Directory aliases handed to the data generators (same folders as the *_path variables)
train_dir, val_dir, test_dir = train_path, val_path, test_path

In [26]:
# Training images: pixel values are multiplied by 1/255 and random augmentation
# (rotation, shifts, shear, zoom, horizontal flip) is applied
augmentation_options = {
    "rescale": 1./255,
    "rotation_range": 40,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "horizontal_flip": True,
    "fill_mode": 'nearest',
}
train_datagen = ImageDataGenerator(**augmentation_options)

# Validation and test images are only rescaled; no augmentation is applied
val_test_datagen = ImageDataGenerator(rescale=1./255)

In [27]:
# Specify batch size
batch_size = 32

# Options shared by all three generators.
# class_mode='binary' yields one 0/1 label per image, which is what the model's
# single sigmoid output and the binary_crossentropy loss expect. 'categorical'
# would produce one-hot targets of shape (batch, 2) that do not match the
# (batch, 1) model output.
flow_options = {
    "target_size": (180, 180),  # must match the model's input_shape
    "batch_size": batch_size,
    "class_mode": 'binary',
}

# Create data generators; classes are inferred from the sub-folders of each directory
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

val_generator = val_test_datagen.flow_from_directory(val_dir, **flow_options)

test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    shuffle=False,  # Important to keep the order of predictions
    **flow_options
)

Found 134823 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
Found 10000 images belonging to 2 classes.


In [28]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
compile_options = {
    "optimizer": 'adam',
    "loss": 'binary_crossentropy',
    "metrics": ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Train for 10 epochs; each epoch makes one full pass over the training
# generator and then one full pass over the validation generator
train_steps = len(train_generator)
val_steps = len(val_generator)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=val_generator,
    validation_steps=val_steps,
)

Epoch 1/10

In [None]:
# Evaluate on every batch of the test generator; evaluate() returns the loss
# followed by the compiled metric (accuracy)
evaluation = model.evaluate(test_generator, steps=len(test_generator))
test_loss, test_acc = evaluation
print(f'Test accuracy: {test_acc}')