In [1]:
#Current imports
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler

##Tensorflow
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Input, Flatten






In [2]:
##Read in CSV
df = pd.read_csv('IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
y=df['sentiment'].map({'positive': 1, 'negative': 0})
##turn string negative to positive into binary classification
print(y.head())

reviews=df['review']
print(reviews.head())




0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64
0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


In [28]:
##Preprocessing libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [111]:
##Preprocessing code
##remove stop words? ##negligible
##lowercase or standardize the format or each word? ## case sensitivity might matter in sentiment analysis


tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences)


In [112]:
print(padded_sequences)
print(padded_sequences.shape)
##50,000 sequences padded to a length of 2493

[[    0     0     0 ...   125  4103   486]
 [    0     0     0 ...  1977    69   221]
 [    0     0     0 ...    63    16   350]
 ...
 [    0     0     0 ... 22840     2  6050]
 [    0     0     0 ...    67   739    42]
 [    0     0     0 ...   794    11    17]]
(50000, 2493)


In [52]:
##Experiment with this
EMBEDDING_DIM = 50

In [113]:
##Testing environment
print(Input(shape=(padded_sequences.shape[1],)))



KerasTensor(type_spec=TensorSpec(shape=(None, 2493), dtype=tf.float32, name='input_11'), name='input_11', description="created by layer 'input_11'")


In [65]:
def doc2vec_model():
    
    ##We have a 2D representation for the tensor, reduce to 1D which is what we need
    ##first dimension will have variable size and second dimension is 2493

    input_layer = Input(shape=(padded_sequences.shape[1],))


    #Embedding layer
    ##word_index is a dictionary, add +1 for the padding token
    embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM)(input_layer)

    #GlobalAveragePooling1D layer
    ##average_pooling = GlobalAveragePooling1D()(embedding_layer)

    #Flattening
    flatten = Flatten()(embedding_layer)


    #Dense layers
    #Optimization such as dropout??

    output_layer = Dense(1, activation='sigmoid')(flatten)


    model = Model(inputs=input_layer, outputs=output_layer)

    #binary or multi-class compilation
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [55]:
##Train actual model


model = doc2vec_model()
model.fit(padded_sequences, y, batch_size=10, epochs=2, verbose=1)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2b91ab9c040>

In [114]:
# Evaluate the model on the test set
results = model.evaluate(padded_sequences, y, batch_size=10)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Test Loss: 0.013695603236556053
Test Accuracy: 0.9985600113868713


In [115]:
import os

def read_txt_files(directory_path):
    file_contents = []

    # Iterate over each file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            
            # Read the content of the file and append it to the list
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                file_contents.append(content)

    return file_contents

pos_list = read_txt_files('pos')
neg_list = read_txt_files('neg')

# Replace 'your_directory_path' with the path to your directory containing .txt files
list_of_strings = pos_list + neg_list




In [116]:
pos_list_bin = [1 for i in list(range(12500))]
neg_list_bin = [0 for i in list(range(12500))]


# Replace 'your_directory_path' with the path to your directory containing .txt files
list_of_strings = pos_list + neg_list
bin_list = pos_list_bin + neg_list_bin


In [126]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list_of_strings)
sequences = tokenizer.texts_to_sequences(list_of_strings)
word_index = tokenizer.word_index
padded_test = pad_sequences(sequences, maxlen=2493)


In [127]:
bin_list = pd.Series(bin_list)

In [128]:
print(type(padded_sequences))
print(type(y))

print(type(padded_test))
print(type(bin_list))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [129]:
# Evaluate the model on the test set
# Assuming X_test is the input data
print("Shape of X_test:", padded_test.shape)
results = model.evaluate(padded_test, bin_list, batch_size=10)

# Print the evaluation results
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

Shape of X_test: (25000, 2493)
Test Loss: 1.211595058441162
Test Accuracy: 0.5656399726867676
