In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
import xgboost
from sklearn import svm
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
stop_words = stopwords.words('english')

import re
import time


2022-12-20 14:59:52.204678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# load excel
data = pd.read_csv("/Users/hansangjun/Desktop/Springboard/Capstone3/Data/IMDB_Dataset.csv")

In [3]:
# replace 1 and 0
data.sentiment = data.sentiment.replace({'positive': 1, 'negative': 0})

In [4]:
df = data.sample(n=5000, random_state=123)
df['sentiment'].value_counts()

1    2519
0    2481
Name: sentiment, dtype: int64

In [5]:
classifiers = [
    KNeighborsClassifier(3),
    RandomForestClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    naive_bayes.MultinomialNB()
    ]

def CountVec():
    X_train, X_test, y_train, y_test = train_test_split(df['review'].values, 
                                                        df['sentiment'].values, test_size=0.25, 
                                                        random_state=1000)
    vectorizer = CountVectorizer()
    vectorizer.fit(X_train)
    X_train = vectorizer.transform(X_train)
    X_test  = vectorizer.transform(X_test)
    
    for clf in classifiers:
        start_time = time.time()
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        print("model name: {}".format(clf))
        print("Accuracy: {:.4%}".format(acc))

        train_predictions = clf.predict_proba(X_test)
        ll = log_loss(y_test, train_predictions)
        print("Log Loss: {}".format(ll))
        print("--- %s seconds ---" % (time.time() - start_time))
        print("="*40)
        
def TfIdf():
    X_train, X_test, y_train, y_test = train_test_split(df['review'].values, 
                                                        df['sentiment'].values, test_size=0.25, 
                                                        random_state=1000)
    tfidf_vectorizer = TfidfVectorizer() 
    tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
    
    for clf in classifiers:
        start_time = time.time()
        clf.fit(tfidf_train_vectors, y_train)
        train_predictions = clf.predict(tfidf_test_vectors)
        acc = accuracy_score(y_test, train_predictions)
        print("model name: {}".format(clf))
        print("Accuracy: {:.4%}".format(acc))

        train_predictions = clf.predict_proba(tfidf_test_vectors)
        ll = log_loss(y_test, train_predictions)
        print("Log Loss: {}".format(ll))
        print("--- %s seconds ---" % (time.time() - start_time))
        print("="*40)
        
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    # Make it lower case
    text = text.lower()
    
    # Replace everything with space except (a-z, A-Z, ".", "?", "!")
    text = re.sub(r"[^a-zA-Z?.!¿]+", " ", text) 

    # Remove URLs 
    text = re.sub(r"http\S+", "",text) 
    
    # Remove text 'br'(break)
    text = re.sub(r"br", "",text)
    
    # Remove html tags
    html=re.compile(r'<.*?>') 
    text = html.sub(r'',text)
    
    # Remove punctuations
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') 
        
    # Remove stopwords
    text = [word for word in text.split() if word not in sw]
    
    # lemmatize word - it is based on the context analysis. 
    text = [lemmatizer.lemmatize(word) for word in text]
    
    text = " ".join(text) 
        
    return text

In [6]:
start_time = time.time()
df['review'] = df['review'].apply(lambda x: clean_text(x))
print("--- %s seconds ---" % (time.time() - start_time))
df.head()

--- 6.031841993331909 seconds ---


Unnamed: 0,review,sentiment
11872,movie beyond awful pimple movie industry know ...,0
40828,writing john carpenter halloween nearing th an...,1
36400,must admit slight disappointment film read lot...,1
5166,oh dear bbc knocked pedestal absorbing period ...,0
30273,totally average film semi alright action seque...,0


In [15]:
max_fatures = 1196
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['review'].values)
X = tokenizer.texts_to_sequences(df['review'].values)
X = pad_sequences(X)

In [16]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 442, 128)          153088    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 442, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 408,282
Trainable params: 408,282
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
Y = pd.get_dummies(df['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(3350, 442) (3350, 2)
(1650, 442) (1650, 2)


In [18]:
batch_size = 32
history = model.fit(X_train, Y_train, epochs = 7, 
          batch_size=batch_size, verbose = 1,
          validation_data = (X_test, Y_test),
         )

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [12]:
loss, accuracy = model.evaluate(X_train, Y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, Y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9842
Testing Accuracy:  0.7915


In [13]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [14]:
plot_history(history)

Epoch 1/7
  5/105 [>.............................] - ETA: 2:24 - loss: 0.0706 - accuracy: 0.9688

KeyboardInterrupt: 