In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, GRU, RNN, Dense, BatchNormalization, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
def preprocess_data(df):
    """Encode labels, split the data, and tokenize and pad the review texts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'review' column (texts) and a 'sentiment' column
        (labels). A 'sentiment_encoded' column is added to it in place.

    Returns
    -------
    tuple
        ``(X_train_pad, X_test_pad, y_train, y_test, label_encoder, maxlen)``

        * ``X_train_pad`` / ``X_test_pad`` -- padded token-id sequences, all
          as long as the longest tokenized training review.
        * ``y_train`` / ``y_test`` -- encoded sentiment labels for each split.
        * ``label_encoder`` -- the fitted ``LabelEncoder``.
        * ``maxlen`` -- the largest token id found in the training sequences.
          Despite its name this is NOT a sequence length; the name and value
          are kept because ``model_def_and_compile`` uses ``maxlen + 1`` as
          the embedding vocabulary size.

    Raises
    ------
    ValueError
        From ``max()`` if the training split contains no tokens at all.
    """
    label_encoder = LabelEncoder()
    df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])
    X_train, X_test, y_train, y_test = train_test_split(
        df['review'], df['sentiment_encoded'], test_size=0.2, random_state=42
    )

    # The tokenizer is fitted on the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train.values)
    X_train_seq = tokenizer.texts_to_sequences(X_train.values)
    X_test_seq = tokenizer.texts_to_sequences(X_test.values)

    # Largest token id: returned to the caller, which sizes the embedding
    # from it.
    maxlen = max(token for seq in X_train_seq for token in seq)

    # Padding length is the longest training sequence. Previously the largest
    # token id was used here, which padded every review out to roughly the
    # vocabulary size.
    pad_len = max(len(seq) for seq in X_train_seq)

    # Pad the sequences to the same length
    X_train_pad = pad_sequences(X_train_seq, maxlen=pad_len)
    X_test_pad = pad_sequences(X_test_seq, maxlen=pad_len)
    return X_train_pad, X_test_pad, y_train, y_test, label_encoder, maxlen

In [8]:
def model_def_and_compile(model_name, maxlen):
    """Build and compile a binary classifier with a single recurrent layer.

    The stack is Embedding -> BatchNormalization -> recurrent layer ->
    BatchNormalization -> Dense(1, sigmoid), compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    model_name : str
        Which recurrent layer to use: "LSTM", "GRU" or "RNN" (SimpleRNN).
    maxlen : int
        Used to size the embedding vocabulary: ``input_dim = maxlen + 1``.
        It must therefore be at least the largest token id fed to the model.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Fail fast: an unknown name used to fall through every branch and yield
    # a model with no recurrent layer at all.
    supported = ("LSTM", "GRU", "RNN")
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {supported}"
        )

    recurrent_layers = {"LSTM": LSTM, "GRU": GRU, "RNN": SimpleRNN}
    recurrent_cls = recurrent_layers[model_name]

    # Define the model
    model = Sequential()
    model.add(Embedding(input_dim=maxlen+1, output_dim=100))
    model.add(BatchNormalization())
    model.add(recurrent_cls(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [9]:
def predict_sentiment(model, X_test_pad, y_test):
    """Score a trained model on the test split.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``. The models built in this notebook end
        in a single sigmoid unit, so ``predict`` yields probabilities rather
        than class labels.
    X_test_pad
        Padded test sequences passed straight to ``model.predict``.
    y_test
        True binary labels (0/1) for the test split.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the thresholded
        predictions.
    """
    y_prob = model.predict(X_test_pad)

    # The classification metrics need discrete labels, so turn probabilities
    # into 0/1 with a 0.5 cut-off and flatten to one dimension to line up
    # with y_test.
    y_pred = (y_prob.ravel() > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1

In [10]:

# Recurrent architectures to train and compare, in run order.
model_names = ['LSTM', 'RNN', 'GRU']

if __name__ == '__main__':

    # Location of the preprocessed review data.
    processed_file_path = "../Data/Process/sample_preprocessed_data.xlsx"

    # Read the spreadsheet, then split / tokenize / pad it.
    df = pd.read_excel(processed_file_path)
    (X_train_pad, X_test_pad, y_train, y_test,
     label_encoder, maxlen) = preprocess_data(df)

    # One row of test-set metrics is collected per architecture.
    results = []
    for model_name in model_names:
        model = model_def_and_compile(model_name, maxlen)
        print(maxlen)

        # Fit on the training split.
        model.fit(X_train_pad, y_train, epochs=5, batch_size=32)

        # Score on the held-out split and record the row.
        accuracy, precision, recall, f1 = predict_sentiment(model, X_test_pad, y_test)
        results.append([model_name, accuracy, precision, recall, f1])

    # Write the comparison table next to the input data.
    summary_columns = ['model_name', 'accuracy', 'precision', 'recall', 'f1']
    df_summary = pd.DataFrame(results, columns=summary_columns)
    summary_file_path = '../Data/Process/deep_learning.xlsx'
    df_summary.to_excel(summary_file_path, index=False)
        
    

36918 36378


In [38]:
# Path to the preprocessed review data.
processed_file_path = "../Data/Process/sample_preprocessed_data.xlsx"

# Read the spreadsheet into a DataFrame.
df = pd.read_excel(processed_file_path)

# Encode the sentiment labels and store them in a new column.
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
)

# Fit the tokenizer on the training texts only, then index both splits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.values)
X_train_seq = tokenizer.texts_to_sequences(X_train.values)
X_test_seq = tokenizer.texts_to_sequences(X_test.values)

# The longest training sequence sets the common padded length.
maxlen = max(map(len, X_train_seq))

# Pad both splits to that length.
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

In [37]:
# Show the padding length (the cell above sets it to the longest tokenized training review).
maxlen

1504

In [35]:
# Inspect the tokenized training reviews (one list of token ids per review).
# NOTE(review): this displays the entire nested list; a slice such as
# X_train_seq[:2] would keep the rendered output short.
X_train_seq

[[10,
  3701,
  6,
  7,
  21235,
  4885,
  10771,
  19,
  26,
  1169,
  128,
  3,
  211,
  256,
  519,
  142,
  7,
  172,
  167,
  3,
  167,
  1,
  222,
  6,
  37,
  132,
  26,
  58,
  192,
  52,
  15,
  377,
  122,
  414,
  95,
  108,
  4212,
  3,
  1,
  341,
  1241,
  936,
  5936,
  28,
  1056,
  5,
  25,
  2,
  2727,
  14,
  17,
  855,
  2,
  3702,
  3,
  960,
  1822,
  5937,
  124,
  1,
  293,
  985,
  44,
  2,
  503,
  4,
  293,
  466,
  15895,
  66,
  31,
  1,
  78,
  146,
  3,
  8,
  7,
  882,
  44,
  1,
  161,
  4,
  102,
  747,
  7,
  3860,
  6885,
  1,
  4213,
  3703,
  383,
  855,
  55,
  51,
  73,
  271,
  2458,
  1858,
  1291,
  6364,
  7,
  1,
  1065,
  8,
  53,
  30,
  7543,
  52,
  15,
  2,
  334,
  1726,
  43,
  610,
  122,
  707,
  50,
  1,
  61,
  5,
  520,
  1727,
  324,
  3446,
  1,
  274,
  21236,
  128,
  183,
  25,
  388,
  1,
  1107,
  38,
  1,
  1759,
  4,
  1,
  1858,
  34,
  1,
  3703,
  383],
 [83,
  381,
  80,
  20,
  62,
  2,
  18,
  4,
  96,
  261,
  11,