In [1]:
# Question 1

import nltk
from nltk.corpus import gutenberg, stopwords
import re
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# --- Preprocessing -----------------------------------------------------------
# Each sentence of the novel becomes one "transaction": the lower-cased words
# that are purely alphabetic (ASCII letters) and are not English stop words.
MIN_SUPPORT = 0.02
ALPHA_ONLY = re.compile(r'^[a-zA-Z]+$')

sentences = gutenberg.sents('carroll-alice.txt')
stop_words = set(stopwords.words('english'))


def keep_content_words(tokens):
    """Return the lower-cased alphabetic, non-stop-word tokens of one sentence."""
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in stop_words and ALPHA_ONLY.search(token):
            kept.append(lowered)
    return kept


# Sentences that are empty before filtering are dropped; sentences that become
# empty after filtering are kept as empty transactions.
processed_sentences = [keep_content_words(sentence) for sentence in sentences if sentence]

# --- One-hot transaction matrix ---------------------------------------------
encoder = TransactionEncoder()
transformed_data = encoder.fit_transform(processed_sentences)
df = pd.DataFrame(data=transformed_data, columns=encoder.columns_)

# --- Frequent itemsets -------------------------------------------------------
frequent_itemsets = apriori(df, min_support=MIN_SUPPORT, use_colnames=True)

# Persist the result so later cells can reload it from disk.
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)

# Preview the first few itemsets.
print(frequent_itemsets.head())


    support itemsets
0  0.228420  (alice)
1  0.022314   (back)
2  0.033470  (began)
3  0.022314   (came)
4  0.021139    (cat)


In [2]:
# Question 1

import pandas as pd

# Load the CSV file containing the frequent itemsets.
# The earlier cell writes 'frequent_itemsets.csv' relative to the notebook's
# working directory, so read it back from the same relative location instead
# of a machine-specific absolute path (which breaks on any other computer).
new_file_path = 'frequent_itemsets.csv'
new_frequent_itemsets = pd.read_csv(new_file_path)

# Display the content of the dataset to understand its structure and contents
new_frequent_itemsets.head()


Unnamed: 0,support,itemsets
0,0.22842,frozenset({'alice'})
1,0.022314,frozenset({'back'})
2,0.03347,frozenset({'began'})
3,0.022314,frozenset({'came'})
4,0.021139,frozenset({'cat'})


In [3]:
# Question 1

def _parse_itemset(value):
    """Turn the CSV text form of an itemset back into a ``frozenset``.

    The CSV stores each itemset as text such as ``"frozenset({'alice', 'said'})"``.
    The text is parsed with ``ast`` and only literal contents are evaluated, so
    nothing read from the file is ever executed (unlike ``eval``).

    Values that are not strings are assumed to be already-parsed collections and
    are returned as a ``frozenset``; this keeps the cell safe to re-run.

    Raises:
        ValueError: if the text is neither ``frozenset(<literal>)`` nor a plain
            literal collection.
    """
    import ast

    if not isinstance(value, str):
        return frozenset(value)

    node = ast.parse(value.strip(), mode='eval').body
    is_frozenset_call = (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == 'frozenset'
        and not node.keywords
    )
    if is_frozenset_call:
        if not node.args:
            return frozenset()
        if len(node.args) == 1:
            return frozenset(ast.literal_eval(node.args[0]))
        raise ValueError(f'Cannot parse itemset: {value!r}')
    if isinstance(node, ast.Call):
        # Any other function call is rejected rather than executed.
        raise ValueError(f'Cannot parse itemset: {value!r}')
    return frozenset(ast.literal_eval(node))


# Convert the string representation of each frozenset to an actual frozenset,
# then keep only the itemsets with more than one element.
new_frequent_itemsets['itemsets'] = new_frequent_itemsets['itemsets'].apply(_parse_itemset)
new_frequent_itemsets['length'] = new_frequent_itemsets['itemsets'].apply(len)
multi_word_itemsets = new_frequent_itemsets[new_frequent_itemsets['length'] > 1]

# Display itemsets with more than one element
multi_word_itemsets.sort_values(by='support', ascending=False).head(10)


Unnamed: 0,support,itemsets,length
49,0.095713,"(alice, said)",2
50,0.034058,"(thought, alice)",2
52,0.032883,"(mock, turtle)",2
51,0.02525,"(king, said)",2
48,0.021726,"(alice, little)",2


**Report Analysis**

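Here are the most interesting patterns — the five multi-word frequent itemsets — identified in "Alice's Adventures in Wonderland" by Lewis Carroll, using a minimum support of 2%. Each itemset is an unordered set of words that occur together in the same sentence (after stop-word removal), and its support is the fraction of sentences that contain all of its words: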

1. **(said, alice)**: This pair has a support of approximately 9.57%, meaning the two words occur together in roughly one sentence in ten; it reflects how frequently dialogue involving Alice is narrated throughout the novel.
2. **(thought, alice)**: With a support of about 3.41%, this pair suggests recurring passages in which Alice's thoughts or inner reflections are narrated.
3. **(turtle, mock)**: With a support of around 3.29%, this pair most likely refers to the character "Mock Turtle," a prominent figure in the story.
4. **(king, said)**: With a support of about 2.53%, this pair reflects frequent dialogue or statements involving the King character.
5. **(little, alice)**: With a support of approximately 2.17%, this pair might describe Alice, emphasizing her size or her perceived insignificance in various scenarios within the novel; note that co-occurrence in a sentence does not by itself show that "little" modifies "Alice".

In [4]:
# Question 2

def load_mnist(path, kind='train'):
    """Load an MNIST-style (IDX format) image/label pair from ``path``.

    Reads ``<kind>-labels-idx1-ubyte`` and ``<kind>-images-idx3-ubyte`` from the
    given directory.

    Args:
        path: Directory containing the two uncompressed IDX files.
        kind: File-name prefix, e.g. ``'train'`` or ``'t10k'``.

    Returns:
        ``(images, labels)`` where ``images`` is a float array of shape
        ``(n_labels, rows * cols)`` scaled from the byte range [0, 255] to
        [-1, 1], and ``labels`` is a ``uint8`` array of length ``n_labels``.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if the pixel data cannot be reshaped to
            ``(n_labels, rows * cols)``.
    """
    from numpy import fromfile, uint8
    import os
    import struct

    labels_path = os.path.join(path, '%s-labels-idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte' % kind)

    with open(labels_path, 'rb') as lbpath:
        # 8-byte big-endian header (two unsigned ints), then one byte per label.
        lbpath.read(8)
        labels = fromfile(lbpath, dtype=uint8)

    with open(images_path, 'rb') as imgpath:
        # 16-byte big-endian header: magic, image count, rows, cols.
        _magic, _num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        # Use the image size from the header instead of a hard-coded 784 so
        # files with other image dimensions load as well.
        images = fromfile(imgpath, dtype=uint8).reshape(len(labels), rows * cols)

    # Rescale pixel bytes from [0, 255] to [-1, 1].
    images = ((images / 255.) - .5) * 2
    return images, labels
    

# Directory holding the uncompressed MNIST IDX files, and the number of
# leading training samples used for fitting (the rest become validation data).
MNIST_DIR = '/Users/shatakshishewale/Desktop/JHU/Spring 24/Applied ML/jupyter codes/datasets/'
N_TRAIN = 55000

# Training set
X_train_mnist, y_train_mnist = load_mnist(MNIST_DIR, kind='train')
print(f'Rows= {X_train_mnist.shape[0]}, columns= {X_train_mnist.shape[1]}')

# Test set
X_test_mnist, y_test_mnist = load_mnist(MNIST_DIR, kind='t10k')
print(f'Rows= {X_test_mnist.shape[0]}, columns= {X_test_mnist.shape[1]}')

# Split the training data for training and validation
X_train, y_train = X_train_mnist[:N_TRAIN], y_train_mnist[:N_TRAIN]
X_valid, y_valid = X_train_mnist[N_TRAIN:], y_train_mnist[N_TRAIN:]

Rows= 60000, columns= 784
Rows= 10000, columns= 784


In [5]:
# Question 2

import numpy as np
import sys

class NeuralNetMLP:
    """Feed-forward classifier with two sigmoid hidden layers and a sigmoid output layer.

    The network has no bias units: every layer is a plain matrix product
    followed by a sigmoid. It is trained by minibatch gradient descent on the
    summed logistic cost against one-hot targets.

    Parameters
    ----------
    n_hidden1, n_hidden2 : int
        Number of units in the first and second hidden layer.
    epochs : int
        Number of passes over the training set.
    eta : float
        Learning rate.
    minibatch_size : int
        Number of samples per gradient step.
    seed : int or None
        Seed for the ``RandomState`` used for weight initialization (and for
        shuffling when enabled).
    shuffle : bool
        If True, the sample order is reshuffled at the start of every epoch.
        Defaults to False, in which case samples are visited in their given
        order each epoch.
    """

    def __init__(self, n_hidden1=30, n_hidden2=30, epochs=100, eta=0.001,
                 minibatch_size=1, seed=None, shuffle=False):
        self.random = np.random.RandomState(seed)
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.epochs = epochs
        self.eta = eta
        self.minibatch_size = minibatch_size
        self.shuffle = shuffle
        # Weight matrices are created in fit().
        self.w_out, self.w_h2, self.w_h1 = None, None, None

    @staticmethod
    def onehot(y, n_classes):
        """Return a ``(len(y), n_classes)`` float array with a 1.0 at each sample's label column."""
        labels = y.astype(int)
        encoded = np.zeros((labels.shape[0], n_classes))
        encoded[np.arange(labels.shape[0]), labels] = 1.0
        return encoded

    @staticmethod
    def sigmoid(z):
        """Logistic function; the input is clipped to [-250, 250] before ``np.exp``."""
        return 1.0 / (1.0 + np.exp(-np.clip(z, -250, 250)))

    def _forward(self, X):
        """Run a forward pass and return ``(z_h1, a_h1, z_h2, a_h2, z_out, a_out)``."""
        z_h1 = np.dot(X, self.w_h1)
        a_h1 = self.sigmoid(z_h1)
        z_h2 = np.dot(a_h1, self.w_h2)
        a_h2 = self.sigmoid(z_h2)
        z_out = np.dot(a_h2, self.w_out)
        a_out = self.sigmoid(z_out)
        return z_h1, a_h1, z_h2, a_h2, z_out, a_out

    @staticmethod
    def compute_cost(y_enc, output):
        """Summed logistic cost between one-hot targets and sigmoid outputs.

        Outputs are clipped to ``[eps, 1 - eps]`` (machine epsilon) before the
        logarithms are taken. Without the clip, an output that saturates to
        exactly 0.0 or 1.0 produces ``0 * log(0)`` = NaN (or an infinite cost).
        Values strictly inside that interval are unaffected.
        """
        eps = np.finfo(float).eps
        output = np.clip(np.asarray(output, dtype=float), eps, 1.0 - eps)
        term1 = -y_enc * np.log(output)
        term2 = (1 - y_enc) * np.log(1 - output)
        return np.sum(term1 - term2)

    def predict(self, X):
        """Return, for each row of ``X``, the index of the largest output activation."""
        *_, a_out = self._forward(X)
        return np.argmax(a_out, axis=1)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Initialize the weights and train on ``X_train`` / ``y_train``.

        After every epoch the training cost and the training/validation
        accuracy are written to ``sys.stderr`` on a single, overwritten line.

        The number of output units is the number of distinct values in
        ``y_train``; labels are used directly as column indices of the one-hot
        encoding. Returns ``self``.
        """
        n_output = np.unique(y_train).shape[0]  # number of class labels
        n_features = X_train.shape[1]
        n_samples = X_train.shape[0]

        # The draw order (w_out, w_h2, w_h1) is kept so that a given seed
        # yields the same initial weights as before.
        self.w_out = self.random.normal(loc=0.0, scale=0.1, size=(self.n_hidden2, n_output))
        self.w_h2 = self.random.normal(loc=0.0, scale=0.1, size=(self.n_hidden1, self.n_hidden2))
        self.w_h1 = self.random.normal(loc=0.0, scale=0.1, size=(n_features, self.n_hidden1))

        y_train_enc = self.onehot(y_train, n_output)

        for epoch in range(self.epochs):
            indices = np.arange(n_samples)
            if self.shuffle:
                self.random.shuffle(indices)

            # NOTE(review): only full minibatches are processed, so the last
            # n_samples % minibatch_size entries of `indices` are skipped in
            # each epoch. With shuffle=False those are always the same samples.
            for start_idx in range(0, n_samples - self.minibatch_size + 1, self.minibatch_size):
                batch_idx = indices[start_idx:start_idx + self.minibatch_size]
                X_batch = X_train[batch_idx]
                _, a_h1, _, a_h2, _, a_out = self._forward(X_batch)

                # Backpropagation: error terms from the output layer backwards.
                sigma_out = a_out - y_train_enc[batch_idx]
                sigma_h2 = np.dot(sigma_out, self.w_out.T) * (a_h2 * (1 - a_h2))
                sigma_h1 = np.dot(sigma_h2, self.w_h2.T) * (a_h1 * (1 - a_h1))

                # Gradients are summed (not averaged) over the minibatch.
                grad_w_out = np.dot(a_h2.T, sigma_out)
                grad_w_h2 = np.dot(a_h1.T, sigma_h2)
                grad_w_h1 = np.dot(X_batch.T, sigma_h1)

                self.w_out -= self.eta * grad_w_out
                self.w_h2 -= self.eta * grad_w_h2
                self.w_h1 -= self.eta * grad_w_h1

            # Evaluation after each epoch
            *_, a_out = self._forward(X_train)
            cost = self.compute_cost(y_enc=y_train_enc, output=a_out)
            y_train_pred = self.predict(X_train)
            y_valid_pred = self.predict(X_valid)
            train_acc = ((np.sum(y_train == y_train_pred)).astype(float) / X_train.shape[0])
            valid_acc = ((np.sum(y_valid == y_valid_pred)).astype(float) / X_valid.shape[0])
            sys.stderr.write(f'\r{epoch+1}/{self.epochs} | Cost: {cost:.2f} | Train/Valid Acc.: {train_acc*100:.2f}%/{valid_acc*100:.2f}% ')
            sys.stderr.flush()

        return self

# Build the two-hidden-layer network with the chosen hyperparameters
nn = NeuralNetMLP(
    n_hidden1=50,
    n_hidden2=50,
    epochs=50,
    eta=0.001,
    minibatch_size=64,
    seed=1,
)

# Train on the training split; the validation split is only used for reporting
nn.fit(X_train, y_train, X_valid, y_valid)

# Score the held-out test set: fraction of labels predicted correctly
y_pred = nn.predict(X_test_mnist)
n_correct = np.sum(y_test_mnist == y_pred)
accuracy = n_correct.astype(float) / X_test_mnist.shape[0]
print(f'Accuracy on test set: {accuracy*100:.2f}%')


50/50 | Cost: 6671.35 | Train/Valid Acc.: 98.49%/97.38%  

Accuracy on test set: 96.99%
