# Project A: Classifying Sentiment from Text Reviews
In this project, our team uses a bag-of-words approach to perform sentiment classification on product reviews from websites.

In [39]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
import sklearn.linear_model
import sklearn.pipeline
from sklearn.feature_extraction.text import CountVectorizer
import text_process

## Step1: Importing Dataset

In [4]:
# Load the training reviews and their labels from the CSV files in data_dir.
data_dir = 'data_reviews'
x_train_path = os.path.join(data_dir, 'x_train.csv')
y_train_path = os.path.join(data_dir, 'y_train.csv')
x_train_df = pd.read_csv(x_train_path)
y_train_df = pd.read_csv(y_train_path)

# Report the dimensions of both frames; N (number of rows) is reused by later cells.
N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N, n_cols))
print("Shape of y_train_df: %s" % (y_train_df.shape,))

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)


In [6]:
# Preview the data: label and review text for the first five and last five rows,
# with a "..." separator printed between the two groups.
tr_text_list = x_train_df['text'].values.tolist()

head_rows = np.arange(0, 5)
tail_rows = np.arange(N - 5, N)
for group_id, rows in enumerate((head_rows, tail_rows)):
    if group_id > 0:
        print("...")
    for row_id in rows:
        label = y_train_df.values[row_id, 0]
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, label, text))

row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


## Step2: Cleaning Imported Data

In [15]:
# Count how often each token occurs across all training reviews.
tr_text_array = x_train_df['text'].values.tolist()
tok_count_dict = {}

for line in tr_text_array:
    for tok in text_process.tokenize_text(line):
        # Missing tokens start from 0, so first sight yields a count of 1.
        tok_count_dict[tok] = tok_count_dict.get(tok, 0) + 1

# Tokens ordered from most frequent to least frequent.
sorted_tokens = sorted(tok_count_dict, key=tok_count_dict.get, reverse=True)

In [30]:
# Vocabulary = tokens seen at least 4 times; the most-frequent-first order
# is inherited from sorted_tokens.
vocab_list = list(filter(lambda tok: tok_count_dict[tok] >= 4, sorted_tokens))

# Print "<count> <token>" for every vocabulary entry.
for vocab_word in vocab_list:
    count = tok_count_dict[vocab_word]
    print("%5d %s" % (count, vocab_word))

 1560 the
  916 and
  707 a
  700 i
  609 is
  542 to
  534 it
  493 of
  493 this
  447 was
  328 in
  257 for
  244 not
  231 that
  212 with
  202 very
  201 my
  183 good
  176 on
  163 you
  162 great
  158 but
  147 have
  143 are
  141 so
  140 movie
  137 phone
  136 as
  119 film
  115 be
  115 all
  111 one
  109 had
  103 at
  100 place
   98 food
   95 like
   92 were
   90 an
   89 just
   86 there
   84 service
   84 time
   83 if
   82 we
   79 bad
   79 really
   78 out
   78 it's
   76 they
   76 from
   75 would
   69 has
   69 about
   68 well
   66 your
   64 only
   63 even
   63 ever
   63 best
   62 by
   62 back
   62 or
   61 don't
   60 -
   59 here
   57 also
   57 will
   56 no
   54 up
   53 go
   52 than
   51 quality
   51 when
   51 love
   50 me
   50 what
   49 can
   49 he
   48 made
   48 more
   47 product
   47 because
   47 excellent
   45 better
   45 which
   44 recommend
   44 some
   42 work
   42 i'm
   42 could
   42 i've
   42 get
   42 too

## Step3: Feature Extraction

In [36]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, printed in full for inspection.
useless = stopwords.words('english')
print(useless)

# Keep only vocabulary entries that are not stop words. Membership is tested
# against a set built from the same list, so the result matches a list scan.
# NOTE(review): the vocabulary index built later in this notebook is derived
# from vocab_list, not filtered_words - confirm whether stop words were meant
# to be excluded from the features.
stopword_set = set(useless)
filtered_words = [word for word in vocab_list if word not in stopword_set]

# One surviving word per line.
for kept_word in filtered_words:
    print("%s" % (kept_word))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [46]:
# Map each vocabulary token to its position in vocab_list; that position is
# the token's column index in the count feature vectors.
vocab_dict = {tok: vocab_id for vocab_id, tok in enumerate(vocab_list)}

def transform_text_into_feature_vector(text, vocab_dict):
    ''' Build the bag-of-words count vector for one piece of text

    The text is tokenized with `tokenize_text`; tokens that are not keys
    of `vocab_dict` are ignored.

    Args
    ----
    text : string
        Raw text of a single 'review'
    vocab_dict : dict with string keys
        Maps each in-vocabulary token to the integer position that is
        incremented in the output vector for that token

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Entry vocab_dict[tok] holds the number of times `tok` occurs
        in the tokenized text; all other entries are zero
    '''
    n_vocab = len(vocab_dict)
    count_V = np.zeros(n_vocab)
    # Translate in-vocabulary tokens to their positions, then tally them.
    known_ids = [vocab_dict[tok] for tok in tokenize_text(text) if tok in vocab_dict]
    for token_id in known_ids:
        count_V[token_id] += 1
    return count_V


def tokenize_text(raw_text):
    ''' Split a plain-text string into cleaned, lower-cased tokens

    Tokens are the whitespace-separated pieces of the input. From each
    piece the characters ? ! _ . , " / are deleted and the remainder is
    lower-cased. A piece made up only of those characters becomes an
    empty string and is kept in the output.

    Args
    ----
    raw_text : string

    Returns
    -------
    list_of_tokens : list of strings
        One cleaned token per whitespace-separated piece, in order
    '''
    # Deletion table: each listed punctuation character maps to None.
    drop_punct = str.maketrans(dict.fromkeys(['?', '!', '_', '.', ',', '"', '/']))
    # str.split() with no argument divides on any run of whitespace.
    return [piece.translate(drop_punct).lower() for piece in raw_text.split()]


# print(transform_text_into_feature_vector("the was this the of a an of a", vocab_dict))

# for line in tr_text_array:
#     print("\nRaw text:")
#     print(line)
#     print("Clean token list:")
#     print(transform_text_into_feature_vector(line, vocab_dict))

# Fill an (N, V) matrix with one count feature vector per training review.
V = len(vocab_dict)
x_tr_NV = np.zeros((N, V))

for row_index, review_text in enumerate(tr_text_array):
    x_tr_NV[row_index, :] = transform_text_into_feature_vector(review_text, vocab_dict)

# print(x_tr_NV.shape)
#
# print(np.sum(x_tr_NV[:,0]))

# Flatten the label frame into a 1-D vector for fitting and comparison.
y_true = np.ravel(y_train_df)

# Sweep LogisticRegression's C (inverse regularization strength) over a
# log-spaced grid and report the accuracy on the training set for each value.
# NOTE(review): this is accuracy on the data the model was fitted on, so it
# cannot be used by itself to choose C - consider a held-out split.
# NOTE(review): the saved output of this cell shows "iterations reached limit"
# warnings during the sweep; max_iter may need raising or the features scaling.
c_list = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]
for c in c_list:
    clf = sklearn.linear_model.LogisticRegression(C=c, max_iter=1000)
    clf.fit(x_tr_NV, y_true)

    yhat_tr_N = clf.predict(x_tr_NV)
    acc = np.mean(y_true == yhat_tr_N)

    # Label each line with its C so the results stay readable even when
    # solver warnings are interleaved with the printed output.
    print("C = %.0e | Training accuracy: %.3f" % (c, acc))

# After the loop, clf is the model from the final iteration (the last C in
# c_list), so the weights below belong to that model only.
weights_V = clf.coef_[0]
# Vocabulary indices ordered from the smallest weight to the largest.
sorted_tok_ids_V = np.argsort(weights_V)

# for vv in sorted_tok_ids_V:
#     print("% 7.3f %s" % (weights_V[vv], vocab_list[vv]))

Training accuracy: 0.697
Training accuracy: 0.693
Training accuracy: 0.708
Training accuracy: 0.760
Training accuracy: 0.864
Training accuracy: 0.914
Training accuracy: 0.955
Training accuracy: 0.978
Training accuracy: 0.986


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training accuracy: 0.992
Training accuracy: 0.994


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
