# Sentiment Analysis Yujie Zhang, Yunzhe Wei

## (1) Data Loading

In [1]:
import pandas as pd

In [2]:
train_size = 9000
test_size = 1000

### Training set

In [3]:
train = pd.read_csv("amazon/train.csv", header=None)
columns = ['sentiment', 'title', 'content']
train.rename(columns=dict(zip(train.columns, columns)), inplace=True)
train.head()

Unnamed: 0,sentiment,title,content
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [4]:
train['sentiment'].value_counts()

sentiment
2    1800000
1    1800000
Name: count, dtype: int64

In [5]:
positive_df = train[train['sentiment'] == 2]
negative_df = train[train['sentiment'] == 1]

In [6]:
positive_sample = positive_df.sample(n=train_size, replace=False, random_state=10)
negative_sample = negative_df.sample(n=train_size, replace=False, random_state=10)

In [7]:
train = pd.concat([positive_sample, negative_sample])
train = train.sample(frac=1, random_state=10)
train['sentiment'] = train['sentiment'].map({1:0, 2:1})
train['text'] = train['title'] + train['content']
train = train.drop(columns=['title', 'content'])
train = train.reset_index(drop=True)
train.shape

(18000, 2)

In [8]:
train.head()

Unnamed: 0,sentiment,text
0,0,Bad research?I started to read Games of State ...
1,1,A great readWe Should Never Meet is one of the...
2,1,Rodgers & Hammerstein's CINDERELLALOVE LOVE LO...
3,0,Pathetic.The only reason i rated this item at ...
4,0,"MisleadingThis was supposed to be about going,..."


### Testing set

In [9]:
test = pd.read_csv("amazon/test.csv", header = None)
columns = ['sentiment', 'title', 'content']
test.rename(columns=dict(zip(test.columns, columns)), inplace=True)
test.head()

Unnamed: 0,sentiment,title,content
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [10]:
test['sentiment'].value_counts()

sentiment
2    200000
1    200000
Name: count, dtype: int64

In [11]:
positive_df = test[test['sentiment'] == 2]
negative_df = test[test['sentiment'] == 1]

In [12]:
positive_sample = positive_df.sample(n=test_size, replace=False, random_state=10)
negative_sample = negative_df.sample(n=test_size, replace=False, random_state=10)

In [13]:
test = pd.concat([positive_sample, negative_sample])
test = test.sample(frac=1, random_state=10)
test['sentiment'] = test['sentiment'].map({1:0, 2:1})
test['text'] = test['title'] + test['content']
test = test.drop(columns=['title', 'content'])
test = test.reset_index(drop=True)
test.shape

(2000, 2)

In [14]:
test.head()

Unnamed: 0,sentiment,text
0,1,I think it works...I can't really say for sure...
1,0,Not up to the hypeI got this as a gold box dea...
2,0,Not The BestAfter trying unsuccessfully to get...
3,1,Just discovered this group -- glad I didSaw a ...
4,0,"1 star is generous...Like many others, I watch..."


## (2) Data Cleaning

### Remove special characters

In [15]:
import re

In [16]:
def remove_special_characters(text, remove_digit=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digit else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

### Remove stopwords

In [17]:
import nltk
from nltk.corpus import stopwords

In [18]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    return " ".join([token for token in tokens if token not in stop_words])

### Stemmer

In [19]:
from nltk.stem import SnowballStemmer

In [20]:
def snowball_stemmer(text):
    stemmer = SnowballStemmer("english")
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

### Putting all together

In [21]:
def normalize_corpus(text, 
                     special_char_removal=True,
                     remove_digit=False,
                     text_lower_case=True, 
                     stopword_removal=True,
                     text_stem=True):
    # remove special characters and\or digits 
    if special_char_removal:
        if remove_digit:
            text = remove_special_characters(text, remove_digit=True)
        else:
            text = remove_special_characters(text)
    # lowercase the text    
    if text_lower_case:
        text = text.lower()
    # remove stopwords
    if stopword_removal:
        text = remove_stopwords(text)
    # stem text
    if text_stem:
        text = snowball_stemmer(text)
    return text

In [22]:
train['text'] = train['text'].apply(normalize_corpus, remove_digit=True)
test['text'] = test['text'].apply(normalize_corpus, remove_digit=True)

In [23]:
train.head()

Unnamed: 0,sentiment,text
0,0,bad researchi start read game state day ago mu...
1,1,great readw never meet one best short stori co...
2,1,rodger hammerstein cinderellalov love love ama...
3,0,patheticth reason rate item star cant give isn...
4,0,misleadingthi suppos go realli noth thin veil ...


In [24]:
test.head()

Unnamed: 0,sentiment,text
0,1,think worksi cant realli say sure actual reduc...
1,0,hypei got gold box deal ok cheek fine doesnt n...
2,0,bestaft tri unsuccess get ear piec stay ear ga...
3,1,discov group glad didsaw video album last mont...
4,0,star generouslik mani other watch movi stori s...


## (3) Text Processing

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
max_words = 10000
max_len = 500

In [27]:
tokenizer = Tokenizer(num_words = max_words)

In [28]:
tokenizer.fit_on_texts(train['text']) 

In [29]:
seq_train = tokenizer.texts_to_sequences(train['text']) 
seq_test = tokenizer.texts_to_sequences(test['text'])

In [30]:
word_index = tokenizer.word_index 

In [31]:
data_train = pad_sequences(seq_train, maxlen = max_len)
data_test = pad_sequences(seq_test, maxlen = max_len)

In [32]:
data_train[0:5]

array([[   0,    0,    0, ...,  175,  633,    1],
       [   0,    0,    0, ...,  889, 5673,  575],
       [   0,    0,    0, ...,   12,  141, 5674],
       [   0,    0,    0, ...,   13,   17,  483],
       [   0,    0,    0, ..., 1402,  624,  353]], dtype=int32)

In [33]:
data_test[0:5]

array([[   0,    0,    0, ...,  481, 5112, 2468],
       [   0,    0,    0, ...,  484,  206,  116],
       [   0,    0,    0, ...,  130,  200,  132],
       [   0,    0,    0, ...,    3,   44,  201],
       [   0,    0,    0, ...,   52,  135, 9690]], dtype=int32)

## (4) Apply tensorflow model

In [34]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, BatchNormalization, Dropout, Dense

In [35]:
model = tf.keras.models.Sequential([
    Embedding(10000,16),
    Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)) ,
    BatchNormalization(),
    Bidirectional(tf.keras.layers.LSTM(32)),
    Dropout(0.2),
    Dense(512, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

Metal device set to: Apple M1 Pro


2023-05-04 10:47:15.469489: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-04 10:47:15.469589: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 32)         4224      
 l)                                                              
                                                                 
 batch_normalization (BatchN  (None, None, 32)         128       
 ormalization)                                                   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                        

In [37]:
model.compile(loss = tf.keras.losses.binary_crossentropy, 
              optimizer = tf.keras.optimizers.Adam(), 
              metrics = ['accuracy'])

In [38]:
# history = model.fit(data_train, 
#                     train['sentiment'], 
#                     validation_split = 0.1, 
#                     epochs = 10)