In [2]:
import numpy as np
import pandas as pd
import os

## Load Data

In [3]:
# Both CSV files are read from the same folder, relative to the working directory.
data_dir = 'data_reviews'
x_train_df, y_train_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('x_train.csv', 'y_train.csv')
)

# N = number of rows in the feature frame, n_cols = number of columns.
N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % x_train_df.shape)
print("Shape of y_train_df: %s" % (y_train_df.shape,))
x_train_df

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)


Unnamed: 0,website_name,text
0,amazon,Oh and I forgot to also mention the weird colo...
1,amazon,THAT one didn't work either.
2,amazon,Waste of 13 bucks.
3,amazon,"Product is useless, since it does not have eno..."
4,amazon,None of the three sizes they sent with the hea...
...,...,...
2395,yelp,The sweet potato fries were very good and seas...
2396,yelp,I could eat their bruschetta all day it is dev...
2397,yelp,Ambience is perfect.
2398,yelp,We ordered the duck rare and it was pink and t...


In [4]:
# Pull the 'text' column out of the DataFrame as a plain Python list (one entry per row).
tr_text_list = x_train_df['text'].values.tolist()

In [5]:
# Labels from the 'is_positive_sentiment' column as a plain Python list (one entry per row).
y_train_list = y_train_df['is_positive_sentiment'].values.tolist()
# Optional sanity check: len(y_train_list)

## Partition sentences into words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words counter using scikit-learn's default settings.
# NOTE(review): per the scikit-learn docs, the defaults lowercase the text and keep only
# tokens of two or more word characters, so one-letter words such as "i" or "a" never
# enter the vocabulary -- confirm this is intended.
vectorizer = CountVectorizer()

### How big is my vocabulary?

In [8]:
# Learn the vocabulary from the training sentences and build the per-sentence term counts.
x = vectorizer.fit_transform(tr_text_list)
# Array of the learned vocabulary terms; its length is the vocabulary size.
vocab = vectorizer.get_feature_names_out()
vocab.shape[0]

4510

In [9]:
# Convert the vectorizer output into a dense NumPy array of counts.
x_train_NV = x.toarray()
x_train_NV.shape  # (N, V): N = number of sentences, V = vocabulary size

(2400, 4510)

In [10]:
# Rows are sentences (N), columns are vocabulary terms (V).
N, V = x_train_NV.shape
(N, V)

(2400, 4510)

In [16]:
# Column index the fitted vectorizer assigned to the token 'the' (None if it is absent).
vectorizer.vocabulary_.get('the')

3941

### Get the common and rare words

In [12]:
# Count of 'oh' in the first sentence. The column is looked up by token instead of being
# hard-coded, so the cell stays correct if the fitted vocabulary (and its indices) changes.
x_train_NV[0, vectorizer.vocabulary_['oh']]

1

In [13]:
# Total number of occurrences of each vocabulary term across all sentences (one entry per column).
x_col_sum = np.sum(x_train_NV, axis=0)

# Occurrence count of the single most frequent term.
np.max(x_col_sum)

1572

In [14]:
# Shape of the per-term totals, plus the total count of 'oh' (looked up by token rather
# than by a hard-coded column index).
x_col_sum.shape, x_col_sum[vectorizer.vocabulary_['oh']]

((4510,), 9)

In [15]:
# Flag "common" terms: those whose total occurrence count exceeds N/2.
# Note: this compares total occurrences (a term repeated inside one sentence counts each
# time), not the number of sentences that contain the term.
x_common = x_col_sum > N/2
# 0/1 integer mask with one entry per vocabulary term, built with a vectorized cast.
x_common_V = x_common.astype(int)

In [77]:
# Flag "rare" terms: those with fewer than 10 total occurrences in the training set.
x_rare = x_col_sum < 10
# 0/1 integer mask with one entry per vocabulary term, built with a vectorized cast.
x_rare_V = x_rare.astype(int)

In [78]:
# Both masks should have exactly one entry per vocabulary term.
x_common_V.shape, x_rare_V.shape

((4510,), (4510,))

In [79]:
# x_common_V.reshape(V, 1)

In [86]:
# Keep only the columns of common / rare terms by broadcasting each 0/1 mask across the
# rows of the count matrix; every other column becomes all zeros.
# NOTE: this rebinds x_common and x_rare from boolean masks to full count matrices.
x_common = x_train_NV * x_common_V
x_rare = x_train_NV * x_rare_V

# Column index of 'good'. Index the mapping directly so a missing token raises KeyError;
# .get() would return None, which NumPy silently treats as np.newaxis when used as an index.
idx = vectorizer.vocabulary_['good']
idx

1752

In [87]:
# Total number of times the term at column idx occurs across all training sentences.
np.sum(x_train_NV[:, idx])

185

In [88]:
# Both filtered matrices keep the full shape: masked-out columns are zeroed, not dropped.
x_rare.shape, x_common.shape

((2400, 4510), (2400, 4510))

### Get the normal words (neither common nor rare)

In [89]:
# "Normal" terms are neither common nor rare: combine the two complementary 0/1 masks
# first, then zero out every other column of the count matrix.
x_normal = x_train_NV * ((1 - x_common_V) * (1 - x_rare_V))

In [91]:
# Per-sentence counts for the term at column idx after filtering; non-zero entries show
# that this term was kept as a "normal" word.
x_normal[:, idx]

array([0, 0, 0, ..., 0, 0, 1])

In [20]:
# remove punctuation in the text
def rm_punc(text_list, puncs='!@#$%^&*()_+-={}[]:;|<>,.?/"\''):
    """Strip punctuation from each string and lowercase what remains.

    Args:
        text_list: Iterable of strings to clean.
        puncs: Characters to delete. Defaults to the punctuation set this
            notebook has always used (it does not include backslash,
            backtick or tilde).

    Returns:
        A new list with one cleaned string per input string, in the same
        order. Whitespace is left untouched.
    """
    drop = frozenset(puncs)
    # Lowercase character by character and join once per sentence, instead of growing
    # the result through repeated string concatenation.
    return [
        "".join(char.lower() for char in text if char not in drop)
        for text in text_list
    ]

# Clean every training sentence, then preview the first cleaned sentence.
clean_tr_text = rm_punc(tr_text_list)
clean_tr_text[0]

'oh and i forgot to also mention the weird color effect it has on your phone'

In [21]:
# Tokenize: split each cleaned sentence on runs of whitespace into a list of words.
words_list = [sentence.split() for sentence in clean_tr_text]

words_list[0]

['oh',
 'and',
 'i',
 'forgot',
 'to',
 'also',
 'mention',
 'the',
 'weird',
 'color',
 'effect',
 'it',
 'has',
 'on',
 'your',
 'phone']