In [2]:
# Load the Colab Drive helper and mount Google Drive at /content/drive,
# so that files under /content/drive/MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import gzip
import shutil
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Dropout, LSTM, GRU, Bidirectional
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

In [3]:
# Copy the gzipped Gift Cards review file from the mounted Drive into /content.
# shutil.copyfile returns the destination path, which the cell displays.
shutil.copyfile("/content/drive/MyDrive/NN/hw5/Gift_Cards_5.json.gz", "/content/Gift_Cards_5.json.gz")

'/content/Gift_Cards_5.json.gz'

In [5]:
# Stream-decompress the gzip archive into a plain file next to it.
with gzip.open('/content/Gift_Cards_5.json.gz', 'rb') as f_in, \
        open('/content/Gift_Cards_5.json', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [6]:
# Parse the decompressed file line by line: each non-empty line is expected to
# hold one JSON record (a dict describing a single review).
# The file is opened through the same absolute path the decompression cell
# wrote to, so this cell does not depend on the kernel's working directory,
# and it is decoded explicitly as UTF-8.
data = []
with open('/content/Gift_Cards_5.json', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines carry no record; skip them instead of reporting an error.
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Malformed lines are reported and skipped so one bad record
            # does not abort the whole load.
            print(f"Error decoding JSON: {e}")

# Now 'data' contains a list of dictionaries

In [7]:
# Collect review text and its 'overall' rating as two parallel lists.
# Records without a 'reviewText' field are skipped entirely.
text, labels = [], []
for record in data:
    if 'reviewText' not in record:
        continue
    text.append(record['reviewText'])
    labels.append(record['overall'])

In [None]:
# Sanity check: show the first review text together with its rating.
print(text[0],labels[0])

Another great gift. 5.0


In [8]:
# Largest len() over all reviews. Because len() is applied to the raw review
# text, this is a character count, not a token count. 0 when `text` is empty.
max_length = max(map(len, text), default=0)

In [9]:
# Tokenize the text (vocabulary capped through num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# Pad sequences to ensure uniform input size.
# `max_length` from the previous cell counts characters, which is far longer
# than any token sequence and only adds leading zeros the recurrent layers
# must step through. With no `maxlen`, pad_sequences pads to the longest
# token sequence, which loses nothing and keeps the input as short as possible.
X = pad_sequences(sequences)
max_sequence_length = X.shape[1]

# Encode labels as consecutive integer class ids (0..n_classes-1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

In [None]:
# Inspect the first padded token-id sequence.
X[0]

array([  0,   0,   0, ..., 224,  10,   1], dtype=int32)

In [None]:
# Define the RNN model: embedding -> SimpleRNN -> dropout -> 5-way softmax.
# The embedding table is sized from the tokenizer (it only emits token ids
# below tokenizer.num_words) instead of a hard-coded 10000 that did not match
# the tokenizer's num_words and doubled the embedding parameters.
model = Sequential()
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_sequence_length))
model.add(SimpleRNN(units=128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the samples for validation
model.fit(X, y, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x78e589273190>

In [None]:
# Define the model: embedding -> single LSTM (with input and recurrent dropout)
# -> 5-way softmax.
# The embedding table is sized from the tokenizer (it only emits token ids
# below tokenizer.num_words) instead of a hard-coded 10000 that did not match it.
model = Sequential()
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=5, activation='softmax'))

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the samples for validation
model.fit(X, y, epochs=2, batch_size=32, validation_split=0.2)



Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x793796fe3a00>

In [10]:
# Two stacked LSTM layers: the first returns the full sequence so the second
# can consume it; the second returns only its final state for the classifier.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(units=64),
    Dense(units=5, activation='softmax'),
])

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for two epochs with a 20% validation hold-out
model.fit(X, y, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7cba94e11450>

In [11]:
# Same pipeline as the LSTM variant, with a GRU as the recurrent layer.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length),
    GRU(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=5, activation='softmax'),
])

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for two epochs with a 20% validation hold-out
model.fit(X, y, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7cba95830730>

2

In [None]:
# Hold out every sample whose encoded label is 1 or 3 as the test set
# (presumably the 2-star and 4-star reviews; this assumes all five ratings
# occur in `labels` so the encoder maps them to 0..4 in order).
# NOTE: these classes are removed from training entirely, so a model fitted on
# X_train never sees labels 1 or 3.
class_2_indices = np.flatnonzero(y == 1)
class_4_indices = np.flatnonzero(y == 3)
test_indices = np.concatenate([class_2_indices, class_4_indices])

# Training set: every row that is not held out (original order preserved)
train_mask = np.ones(X.shape[0], dtype=bool)
train_mask[test_indices] = False
X_train, y_train = X[train_mask], y[train_mask]

# Test set: the held-out rows, class-2 samples first, then class-4 samples
X_test, y_test = X[test_indices], y[test_indices]

In [None]:
# Define the RNN model: embedding -> SimpleRNN -> dropout -> 5-way softmax.
# The embedding table is sized from the tokenizer (it only emits token ids
# below tokenizer.num_words) instead of a hard-coded 10000 that did not match it.
model = Sequential()
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_sequence_length))
model.add(SimpleRNN(units=128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the samples that were not held out; 20% of them are used for validation
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x78e58951bfd0>

In [None]:
# Predict a class for each held-out sample and report the exact-match rate.
# NOTE(review): X_test only contains classes that the split cell removed from
# training, so exact-match accuracy is expected to stay near 0%; the
# distribution of predicted classes is the more informative result here.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=-1)
exact_match_rate = (y_pred == y_test).mean()
print('acc: {:.2%}'.format(exact_match_rate))

acc: 0.00%


In [None]:
# Define the model: embedding -> single LSTM (with input and recurrent dropout)
# -> 5-way softmax.
# The embedding table is sized from the tokenizer (it only emits token ids
# below tokenizer.num_words) instead of a hard-coded 10000 that did not match it.
model = Sequential()
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=5, activation='softmax'))

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the samples that were not held out; 20% of them are used for validation
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_split=0.2)

# Predict a class for each held-out sample and report the exact-match rate.
# NOTE(review): X_test only contains classes that were removed from training,
# so exact-match accuracy is expected to stay near 0%.
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
print('acc: {:.2%}'.format(np.mean(y_pred == y_test)))



Epoch 1/2
Epoch 2/2
acc: 0.00%


In [None]:
# Download the gzipped Musical Instruments review file from the UCSD
# Amazon v2 data repository into the current working directory.
!wget 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Musical_Instruments.json.gz'

--2024-05-26 20:10:35--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Musical_Instruments.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 232750177 (222M) [application/x-gzip]
Saving to: ‘Musical_Instruments.json.gz’


2024-05-26 20:11:33 (3.88 MB/s) - ‘Musical_Instruments.json.gz’ saved [232750177/232750177]



In [None]:
# Stream-decompress the downloaded gzip archive into a plain file next to it.
with gzip.open('/content/Musical_Instruments.json.gz', 'rb') as f_in, \
        open('/content/Musical_Instruments.json', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
# Parse the decompressed file line by line: each non-empty line is expected to
# hold one JSON record (a dict describing a single review).
# The file is opened through the same absolute path the decompression cell
# wrote to, so this cell does not depend on the kernel's working directory,
# and it is decoded explicitly as UTF-8.
data = []
with open('/content/Musical_Instruments.json', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines carry no record; skip them instead of reporting an error.
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Malformed lines are reported and skipped so one bad record
            # does not abort the whole load.
            print(f"Error decoding JSON: {e}")

# Collect review text and its 'overall' rating as two parallel lists;
# records without a 'reviewText' field are skipped entirely.
text = []
labels = []
for record in data:
    if 'reviewText' not in record:
        continue
    text.append(record['reviewText'])
    labels.append(record['overall'])

In [None]:
# Sanity check: show the first review text together with its rating.
print(text[0],labels[0])

Crocheting for Dummies by Karen Manthey & Susan Brittain is a wonderfully thorough and very informative book for anyone wanting to learn to crochet and or wanting to freshen up their skills.

The book reads like a storybook in paragraph form.  Everything is explained in great detail from choosing yarns and hooks, to how to work a large array of crochet stitches, to how to read a pattern, right down to how to care for ones crocheted items.

The stitch drawings are clear and expertly done making learning new stitches so much easier.

The book has both a contents page and an index for easy referral.  I especially liked the fact that an index was included.  So many crochet books do not include this.  The index makes it very easy to find information on a particular topic quickly.

The recommendations for people just learning to crochet are fantastic.  This book wasn't out when I learned to crochet and I learned the hard way about many of the pit falls this book helps one to avoid.  For inst

In [None]:
# Number of records that had a 'reviewText' field.
len(text)

1511675

In [None]:
# Keep references to the full lists, then continue with only the first 5000
# reviews and ratings.
# NOTE: re-running this cell after `text` has been sliced rebinds
# text_total/labels_total to the already-sliced lists.
text_total, labels_total = text, labels
text, labels = text_total[:5000], labels_total[:5000]

In [None]:
# Number of reviews kept after subsampling.
# NOTE(review): the recorded output (10000) does not match the [:5000] slice
# in the preceding cell; the output looks stale and should be re-run.
len(text)

10000

In [None]:
# Largest len() over all reviews. Because len() is applied to the raw review
# text, this is a character count, not a token count. 0 when `text` is empty.
max_length = max(map(len, text), default=0)

In [None]:
# Tokenize the text (vocabulary capped through num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# Pad sequences to ensure uniform input size.
# `max_length` from the previous cell counts characters, which is far longer
# than any token sequence and only adds leading zeros the recurrent layers
# must step through. With no `maxlen`, pad_sequences pads to the longest
# token sequence, which loses nothing and keeps the input as short as possible.
X = pad_sequences(sequences)
max_sequence_length = X.shape[1]

In [None]:
# Map the ratings to consecutive integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Hold out every sample whose encoded label is 1 or 3 as the test set
# (presumably the 2-star and 4-star reviews; this assumes all five ratings
# occur in `labels` so the encoder maps them to 0..4 in order).
# NOTE: these classes are removed from training entirely, so a model fitted on
# X_train never sees labels 1 or 3.
class_2_indices = np.flatnonzero(y == 1)
class_4_indices = np.flatnonzero(y == 3)
test_indices = np.concatenate([class_2_indices, class_4_indices])

# Training set: every row that is not held out (original order preserved)
train_mask = np.ones(X.shape[0], dtype=bool)
train_mask[test_indices] = False
X_train, y_train = X[train_mask], y[train_mask]

# Test set: the held-out rows, class-2 samples first, then class-4 samples
X_test, y_test = X[test_indices], y[test_indices]

In [None]:
# Size of the held-out test set (samples with encoded label 1 or 3).
len(y_test)

1102

In [None]:
# Largest len() over the review texts, as computed by the max_length cell
# (a character count, since len() is applied to the raw text).
max_length

5369

In [None]:
# Define the model: embedding -> single LSTM (with input and recurrent dropout)
# -> 5-way softmax.
# The embedding table is sized from the tokenizer (it only emits token ids
# below tokenizer.num_words) instead of a hard-coded 7000 that did not match it.
model = Sequential()
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=64, input_length=max_sequence_length))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=5, activation='softmax'))

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the samples that were not held out; 20% of them are used for validation
model.fit(X_train, y_train, epochs=1, batch_size=16, validation_split=0.2)

# Predict a class for each held-out sample and report the exact-match rate.
# NOTE(review): X_test only contains classes that were removed from training,
# so exact-match accuracy is expected to stay near 0%.
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
print('acc: {:.2%}'.format(np.mean(y_pred == y_test)))



acc: 0.00%


In [None]:
# First 30 predicted class ids for the held-out samples.
y_pred[:30]

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 0, 4, 4, 4, 4, 4, 0, 4, 4, 0,
       0, 4, 4, 4, 0, 0, 4, 4])

In [None]:
# Embedding -> single LSTM (input and recurrent dropout) -> 5-way softmax
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_sequence_length),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=5, activation='softmax'),
])

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# One epoch on the non-held-out samples, 20% of them used for validation
model.fit(X_train, y_train, epochs=1, batch_size=64, validation_split=0.2)

# Exact-match rate on the held-out samples.
# NOTE(review): X_test only contains classes that were removed from training,
# so this is expected to stay near 0%.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=-1)
exact_match_rate = (y_pred == y_test).mean()
print('acc: {:.2%}'.format(exact_match_rate))

acc: 0.00%


In [13]:
# Hold out every sample whose encoded label is 0 as the test set
# (presumably the lowest rating, since the encoder assigns ids in sorted order).
# NOTE: this class is removed from training entirely, so a model fitted on
# X_train never sees label 0.
class_1_indices = np.flatnonzero(y == 0)
test_indices = class_1_indices

# Training set: every row that is not held out (original order preserved)
train_mask = np.ones(X.shape[0], dtype=bool)
train_mask[test_indices] = False
X_train, y_train = X[train_mask], y[train_mask]

# Test set: the held-out class-1 rows
X_test, y_test = X[test_indices], y[test_indices]

In [15]:
# Bidirectional LSTM classifier with L2 weight penalties and heavy dropout:
# embedding -> BiLSTM (final state only) -> dropout -> 5-way softmax.
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_sequence_length),
    Bidirectional(LSTM(units=64, return_sequences=False, kernel_regularizer=l2(0.01))),
    Dropout(0.5),
    Dense(5, activation='softmax', kernel_regularizer=l2(0.01)),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


<keras.src.callbacks.History at 0x7cba8595c400>

In [16]:
# Predict a class for each held-out sample and report the exact-match rate.
# NOTE(review): X_test only contains the class that the split cell removed
# from training, so exact-match accuracy is expected to stay near 0%; the
# distribution of predicted classes is the more informative result here.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=-1)
exact_match_rate = (y_pred == y_test).mean()
print('acc: {:.2%}'.format(exact_match_rate))

acc: 0.00%


In [None]:
# Download the IMDB review CSV from the Hugging Face scikit-learn/imdb dataset
# repository (read in the next cell as '/content/IMDB Dataset.csv').
!wget 'https://huggingface.co/datasets/scikit-learn/imdb/resolve/main/IMDB%20Dataset.csv'

--2024-06-01 18:28:33--  https://huggingface.co/datasets/scikit-learn/imdb/resolve/main/IMDB%20Dataset.csv
Resolving huggingface.co (huggingface.co)... 3.163.189.37, 3.163.189.74, 3.163.189.114, ...
Connecting to huggingface.co (huggingface.co)|3.163.189.37|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/77/fa/77fa70b48eef1c98bf08d7b3e43b710623c24c69b4f78d4484f43c3361e9d2af/dfc447764f82be365fa9c2beef4e8df89d3919e3da95f5088004797d79695aa2?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27IMDB%2520Dataset.csv%3B+filename%3D%22IMDB+Dataset.csv%22%3B&response-content-type=text%2Fcsv&Expires=1717525713&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNzUyNTcxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy83Ny9mYS83N2ZhNzBiNDhlZWYxYzk4YmYwOGQ3YjNlNDNiNzEwNjIzYzI0YzY5YjRmNzhkNDQ4NGY0M2MzMzYxZTlkMmFmL2RmYzQ0Nzc2NGY4MmJlMzY1ZmE5YzJiZWVmNGU4ZGY4OWQz

In [None]:
# Load the IMDB reviews into a DataFrame
csv_path = '/content/IMDB Dataset.csv'
df = pd.read_csv(csv_path)

# Print the first five rows as plain text
print(df.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Show the full text of the review with index label 0.
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [None]:
# Show the full text of the review with index label 3.
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [None]:
# Number of rows in the IMDB DataFrame.
len(df)

50000

In [None]:
# Keep only the first 3000 rows (all columns).
# NOTE: this rebinds `data`, which earlier cells used for the list of parsed
# JSON records, to a DataFrame.
data = df.iloc[:3000,:]

In [None]:
# Build parallel lists of review texts and binary sentiment labels
# (1 = 'positive', 0 = 'negative').
# Rows whose sentiment is neither value are dropped from BOTH lists: the
# previous loop appended the text but no label for such rows, which would
# silently misalign `text` and `labels`. Selecting columns directly also
# removes the reliance on the index labels being exactly 0..len(data)-1.
sentiment_to_label = {'positive': 1, 'negative': 0}
labelled = data[data['sentiment'].isin(list(sentiment_to_label))]
text = labelled['review'].tolist()
labels = labelled['sentiment'].map(sentiment_to_label).tolist()

In [None]:
# Length of the first review as given by len() on its text.
len(text[0])

1761

In [None]:
# Number of labels collected; should equal the number of reviews in `text`.
len(labels)

3000

In [None]:
# Sanity check: show the first label (1 = positive, 0 = negative) and its review text.
print(labels[0], text[0])

1 One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [None]:
# Largest len() over all reviews (a character count, since len() is applied to
# the raw text). 0 when `text` is empty.
# NOTE: the tokenization cell that follows hard-codes a sequence length of 200
# and does not read this value.
max_length = max(map(len, text), default=0)

In [None]:
# Labels are already 0/1 integers; the encoder turns them into the integer
# array `y` used for training.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Fit the vocabulary on the reviews (capped through num_words=5000) and
# convert each review to a list of token ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# Force every sequence to exactly 200 token ids: shorter ones are zero-padded,
# longer ones are truncated.
max_sequence_length = 200
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [None]:
# Sequence length used for padding.
# NOTE(review): the recorded output (8180) does not match the value 200 that
# the tokenization cell assigns; the output looks stale and should be re-run.
max_sequence_length

8180

In [None]:
# Length of the first review as given by len() on its text.
len(text[0])

1761

In [None]:
# Binary sentiment classifier: embedding -> BiLSTM (final state only)
# -> dropout -> single sigmoid unit, with L2 weight penalties.
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_sequence_length),
    Bidirectional(LSTM(units=64, return_sequences=False, kernel_regularizer=l2(0.01))),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)),
])

# 0/1 targets with a sigmoid output -> binary cross-entropy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(
    X,
    y,
    epochs=20,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.src.callbacks.History at 0x78f902250cd0>