In [1]:
# Read in the data and clean up column names
import gensim
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


In [2]:
df = pd.read_csv('../content/drive/MyDrive/Reviews.csv')

df = df[['Text','Score']]
df['Score'].isnull().sum()
df['Score'].isnull().sum()
df.drop_duplicates(subset=['Text','Score'],keep='first',inplace=True)

def set_sent(score):
    if score<=2:
        return 0
    elif score==3:
        return 1
    else:
        return 2

df['sentiment']=df['Score'].apply(set_sent)
df = df[['Text','sentiment']]

# Separate into three sentiment groups
df_neg1 = df[df['sentiment']==0]
df_0 = df[df['sentiment']==1]
df_1 = df[df['sentiment']==2]

n = 15000

df_neg1 = df_neg1.sample(n=n, random_state=42, replace=False)
df_0 = df_0.sample(n=n, random_state=42, replace=False)
df_1 = df_1.sample(n=n, random_state=42, replace=False)
# print("df_neg1: ", df_neg1)
# print("df_0: ", df_0)
# print("df_1: ", df_1)

sub_df = pd.concat([df_neg1, df_0, df_1], axis=0)
X = sub_df['Text']

# Tokenize using gensim
# X = X.apply(lambda x: gensim.utils.simple_preprocess(x))
y = sub_df['sentiment']

In [3]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2)


In [4]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


vocab_size = len(tokenizer.word_index) + 1

In [5]:
# Pad the sequences to a fixed length
max_length = 100
print(X_train[0])
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

print(X_train.shape)
print(X_train[0])

[497, 32, 8032, 14093, 372, 9, 78, 133, 1525, 292, 9, 712, 79, 63, 1727, 1195, 2778, 2472, 928, 977, 14, 9, 611, 32, 8032, 301, 372, 789, 9, 266, 9, 1040, 731, 9, 21, 860, 8, 60, 1270, 54, 84, 25, 100, 9371, 199, 372, 789, 8394, 834, 311, 7, 849, 82, 404]
(36000, 100)
[  497    32  8032 14093   372     9    78   133  1525   292     9   712
    79    63  1727  1195  2778  2472   928   977    14     9   611    32
  8032   301   372   789     9   266     9  1040   731     9    21   860
     8    60  1270    54    84    25   100  9371   199   372   789  8394
   834   311     7   849    82   404     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [6]:
sen = []
for i in X_train:
    sen.append(i.split())
print(sen[:2])

AttributeError: ignored

In [None]:
# Train the Word2Vec model
# sentences = [sentence for sentence in X_train]
w2v_model = Word2Vec(sen, window=5, min_count=5, workers=4)

# Create a weight matrix for the embedding layer
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [None]:
# Define the CNN model
model = Sequential()
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))