In [286]:
#Current imports
import pandas as pd

##Tensorflow
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Input, Flatten, Dropout



In [287]:
##Read in CSV
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [288]:
##do this to map each categorical variable as a 1 or 0
y=df['sentiment'].map({'positive': 1, 'negative': 0})
print(y.head())

reviews=df['review']
print(reviews.head())

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64
0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


In [289]:
##Preprocessing libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [290]:
##Preprocessing code
##remove stop words?
##lowercase or standardize the format or each word? 
##tokenizer calls .lower()

##standard tokenizing code
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences)


In [291]:
print(padded_sequences)
print(padded_sequences.shape)
# 50,000 sequences padded to a length of 2493

[[    0     0     0 ...   125  4103   486]
 [    0     0     0 ...  1977    69   221]
 [    0     0     0 ...    63    16   350]
 ...
 [    0     0     0 ... 22840     2  6050]
 [    0     0     0 ...    67   739    42]
 [    0     0     0 ...   794    11    17]]
(50000, 2493)


In [292]:
# After experimentation, cross validation is important!!!
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.4, random_state=22)

In [293]:
# Experiment and change this
EMBEDDING_DIM = 50

In [294]:
# Testing environment
print(Input(shape=(padded_sequences.shape[1],)))
print(Input(shape=(X_train.shape[1],)))


KerasTensor(type_spec=TensorSpec(shape=(None, 2493), dtype=tf.float32, name='input_46'), name='input_46', description="created by layer 'input_46'")
KerasTensor(type_spec=TensorSpec(shape=(None, 2493), dtype=tf.float32, name='input_47'), name='input_47', description="created by layer 'input_47'")


In [295]:
##initial doc2vec model only on IMDB_Dataset.csv
def doc2vec_model():
    
    ##We have a 2D representation for the tensor, reduce to 1D which is what we need
    ##first dimension will have variable size and second dimension is 2493

    input_layer = Input(shape=(X_train.shape[1],))


    #Embedding layer
    ##word_index is a dictionary for our vocabulary, add +1 for the padding token
    embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM)(input_layer)

    #GlobalAveragePooling1D layer
    ##average_pooling = GlobalAveragePooling1D()(embedding_layer)

    #Flattening
    flatten = Flatten()(embedding_layer)

    #Dense layers and dropout to help with overfitting
    dense_1 = Dense(64, activation='relu')(flatten)
    dropout_1 = Dropout(0.2)(dense_1)

    dense_2 = Dense(32, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.1)(dense_2)


    output_layer = Dense(1, activation='sigmoid')(dropout_2)


    model = Model(inputs=input_layer, outputs=output_layer)

    #binary or multi-class compilation
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'mae'])

    return model

In [296]:
##Train actual model
model = doc2vec_model()
model.fit(X_train, y_train, batch_size=10, epochs=2, verbose=1)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2b930e0d1b0>

In [297]:
# Evaluate the model on the test set
results = model.evaluate(X_test, y_test, batch_size=10)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Test Loss: 0.30932852625846863
Test Accuracy: 0.8873500227928162


In [298]:
##Use new dataset of movie reviews and see how it performs
import os

def read_txt_files(path):
    file_contents = []
    # Iterate over each file in the directory
    for file in os.listdir(path):
        if file.endswith(".txt"):
            file_path = os.path.join(path, file)
            
            # Read the content of the file and append it to the list
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                file_contents.append(content)

    return file_contents

##iterate and get reviews from each .txt file
pos_list = read_txt_files('pos')
neg_list = read_txt_files('neg')



In [299]:
##line up 1's and 0's with txt file reviews
pos_list_bin = [1 for i in list(range(12500))]
neg_list_bin = [0 for i in list(range(12500))]

##line up labels with txt file sentiment
list_of_strings = pos_list + neg_list
bin_list = pos_list_bin + neg_list_bin


In [300]:
##tokenizer to process data into same format that model had
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(list_of_strings)
sequences_test = tokenizer_test.texts_to_sequences(list_of_strings)
word_index_test = tokenizer_test.word_index
padded_test = pad_sequences(sequences_test, maxlen=2493)


In [301]:
# Convert this to a pd series
bin_list = pd.Series(bin_list)

In [302]:
##Testing environment
print(type(padded_sequences))
print(type(y))
print(type(padded_test))
print(type(bin_list))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [303]:
#Evaluate on the test set (txt files from directories pos and neg
#Assuming X_test is the input data
results = model.evaluate(padded_test, bin_list, batch_size=10)

#Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Test Loss: 1.2814308404922485
Test Accuracy: 0.5649200081825256


In [304]:
##Preprocessing for new model
##we concatenate our already padded sequences of the same length along rows
combined_sequences = np.concatenate((padded_sequences, padded_test), axis=0)
print(combined_sequences)

[[    0     0     0 ...   125  4103   486]
 [    0     0     0 ...  1977    69   221]
 [    0     0     0 ...    63    16   350]
 ...
 [    0     0     0 ...   125   332   154]
 [    0     0     0 ...    62   177     5]
 [    0     0     0 ...    39 50192  1103]]


In [305]:
##Experiemnt with this, higher seems better but not too high
EMBEDDING_DIM = 100

In [306]:
##combine the "vocabularies" of each padded reviews
combined_index = {**word_index, **word_index_test}

In [308]:
##lets define a new model here that combines the two datasets
def doc2vec_combined_model():
    
    ##We have a 2D representation for the tensor, reduce to 1D
    ##first dimension will have variable size and second dimension is 2493
    input_layer = Input(shape=(X_train.shape[1],))


    #Embedding layer
    ##word_index is a dictionary, add +1 for the padding token
    embedding_layer = Embedding(input_dim=len(combined_index) + 1, output_dim=EMBEDDING_DIM)(input_layer)

    #GlobalAveragePooling1D layer
    average_pooling = GlobalAveragePooling1D()(embedding_layer)

    #Flattening
    ##flatten = Flatten()(embedding_layer)

    #Dense Layers and Dropout
    dense_1 = Dense(64, activation='relu')(average_pooling)
    dropout_1 = Dropout(0.2)(dense_1)

    dense_2 = Dense(32, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.1)(dense_2)

    ##Sigmoid as its a binary problem (pos or negative)
    output_layer = Dense(1, activation='sigmoid')(dropout_2)


    model = Model(inputs=input_layer, outputs=output_layer)

    #binary or multi-class compilation
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'mae'])

    return model

In [309]:
##split dataset, so this includes moview reviews from directories pos and neg and IMDB_Dataset
from sklearn.model_selection import train_test_split
print(type(y))
print(type(bin_list))
print(y.shape)
print(bin_list.shape)

combined_label = pd.concat([y, bin_list], ignore_index=True, axis=0)
print(combined_label.shape)

X_train, X_test, y_train, y_test = train_test_split(combined_sequences, combined_label, test_size=0.4, random_state=22)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
(50000,)
(25000,)
(75000,)


In [310]:
# Run IT
model = doc2vec_combined_model()
model.fit(X_train, y_train, batch_size=10, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2b900c01240>

In [311]:
##evaluate new model
# Evaluate the model on the test set
results = model.evaluate(X_test, y_test, batch_size=10)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Test Loss: 0.32474032044410706
Test Accuracy: 0.861466646194458


In [312]:
##preprocessing on third dataset
##Read in CSV and drop empty rows
df = pd.read_csv('moviereviews.csv')
df = df.dropna()
df['label'] = df['label'].astype(str)
df2 = pd.read_csv('moviereviews2.csv')
df2 = df2.dropna()
df['label'] = df['label'].astype(str)

df_res = pd.concat([df, df2], ignore_index=True)
fin_label =df_res['label'].map({'pos': 1, 'neg': 0})
fin_reviews = df_res['review']
print(fin_reviews)
print(fin_label)



0       how do films like mouse hunt get into theatres...
1       some talented actresses are blessed with a dem...
2       this has been an extraordinary year for austra...
3       according to hollywood movies made in last few...
4       my first press screening of 1998 and already i...
                              ...                        
4741    The man who directed 'The Third Man' also dire...
4742    Kevin Spacey is very talented, but unfortunate...
4743    Poor Whoopi Goldberg. Imagine her at a friend'...
4744    This movie is essentially shot on a hand held ...
4745    It has singing. It has drama. It has comedy. I...
Name: review, Length: 4746, dtype: object
0       0
1       0
2       1
3       1
4       0
       ..
4741    1
4742    0
4743    0
4744    1
4745    1
Name: label, Length: 4746, dtype: int64


In [313]:
##same tokenization as before
fin_tokenizer = Tokenizer()
fin_tokenizer.fit_on_texts(fin_reviews)
fin_sequences = tokenizer.texts_to_sequences(fin_reviews)
fin_word_index = fin_tokenizer.word_index
fin_padded_sequences = pad_sequences(fin_sequences, maxlen=2493)

In [316]:
fin_res = model.evaluate(fin_padded_sequences, fin_label, batch_size=10)
# Print the evaluation results
print("Test Loss:", fin_res[0])
print("Test Accuracy:", fin_res[1])

Test Loss: 0.24318796396255493
Test Accuracy: 0.9068689346313477
