In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
prject_folder = "proj3_data/split_"

### Step 1: Load and preprocess training data

In [3]:
df_train = pd.read_csv(os.path.join(prject_folder+str(1), "train.tsv"), sep='\t', header=0, dtype=str)
df_test = pd.read_csv(os.path.join(prject_folder+str(1), "test.tsv"), sep='\t', header=0, dtype=str)
df_test['sentiment'] = pd.read_csv(os.path.join(prject_folder+str(1), "test_y.tsv"), sep='\t', header=0, dtype=str)['sentiment']
df_train = pd.concat([df_train, df_test])

In [5]:
df_train['review'] = df_train['review'].str.replace('<.*?>', '', regex=True)
df_train['review'] = df_train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [6]:
positive_indices = df_train[df_train['sentiment']=='1'].index.values
negative_indices = df_train[df_train['sentiment']=='0'].index.values

In [7]:
num_pos = len(positive_indices)
num_neg = len(negative_indices)

### Step 2: Construct DT (DocumentTerm) matrix (maximum 4-grams).

In [8]:
stop_words = ["i", "me", "my", "myself",
               "we", "our", "ours", "ourselves",
               "you", "your", "yours",
               "their", "they", "his", "her",
               "she", "he", "a", "an", "and",
               "is", "was", "are", "were",
               "him", "himself", "has", "have",
               "it", "its", "the", "us"]

vectorizer = TfidfVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum document frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)

dtm_train = vectorizer.fit_transform(df_train['review'])

In [9]:
df_features_names = pd.DataFrame(vectorizer.get_feature_names_out(), columns=['feature_names'])

## Step 3: Two sample t-test calculation
Filter the vocabulary to include only terms I could readily interpret. I employed a straightforward screening method: the two-sample t-test. This test compares one-dimensional observations from two groups, denoted as:$X_1, X_2, ..., X_m$, $Y_1, Y_2, ..., Y_n$

The goal is to determine whether the X population and the Y population share the same mean. The two-sample t-statistic is computed as:
$$
\frac{\bar X - \bar Y}{\sqrt{\frac{\sigma_{X}^2}{m} + \frac{\sigma_{Y}^2}{n}}}
$$
where $\sigma_{X}^2$, $\sigma_{Y}^2$ denote the sample variance of $X$ and $Y$

> Suppose we have $m$ positive reviews and $n$ negative reviews. For a given word, $X_i$'s represent the measurements associated with each of the positive reviews, and $Y_j$'s represent the measurements corresponding to each of the negative reviews. The goal is to assess the significance of differences in measurements between the two sentiment groups for each word via t-test.

In [10]:
dtm_train_squared = dtm_train.copy()
dtm_train_squared.data **= 2

In [11]:
pos_mean = dtm_train[positive_indices].mean(axis=0)
pos_var = dtm_train_squared[positive_indices].mean(axis=0) - np.square(dtm_train[positive_indices].mean(axis=0))

In [12]:
neg_mean = dtm_train[negative_indices].mean(axis=0)
neg_var = dtm_train_squared[negative_indices].mean(axis=0) - np.square(dtm_train[negative_indices].mean(axis=0))

In [13]:
t_statistics_array = (pos_mean - neg_mean) / np.sqrt(pos_var / num_pos + neg_var / num_neg)
t_statistics_array = np.squeeze(np.asarray(t_statistics_array))
t_statistics_descending_array = t_statistics_array.argsort()[::-1]

> Top 50 **positive words**

In [14]:
df_features_names['feature_names'].values[t_statistics_descending_array[:50]]

array(['great', 'excellent', 'best', 'of best', 'love', 'wonderful',
       'one of best', 'brilliant', 'well', 'perfect', 'amazing', 'life',
       'loved', 'favorite', 'beautiful', 'also', 'very', 'must see',
       'one of', 'superb', 'highly', 'performance', 'wonderfully',
       'fantastic', 'both', 'enjoyed', 'very well', 'performances',
       'years', 'this great', 'always', 'gives', 'world', 'definitely',
       'moving', 'especially', 'touching', 'story of', 'young',
       'powerful', 'strong', 'great film', 'job', 'love this',
       'well worth', 'perfectly', 'enjoy', 'gem', 'greatest', 'well as'],
      dtype=object)

> Top 50 **Negative words**

In [15]:
df_features_names['feature_names'].values[t_statistics_descending_array[-50:]]

array(['badly', 'cheap', 'worst movie', 'waste time', 'supposed to be',
       'thing', 'only', 'pointless', 'or', 'instead', 'pathetic', "don't",
       'not even', 'least', 'laughable', '1', 'oh', 'script', 'lame',
       'supposed to', 'poorly', 'crap', 'why', 'money', 'ridiculous',
       'waste of time', 'supposed', 'at all', 'stupid', 'acting', 'avoid',
       'so bad', 'horrible', 'one of worst', 'plot', 'just', 'even',
       'worse', 'minutes', 'of worst', 'boring', 'waste of', 'poor', 'no',
       'nothing', 'terrible', 'awful', 'waste', 'worst', 'bad'],
      dtype=object)

> Select 2000 words with top absolute t_statistics

In [16]:
absolute_t_statistics_array = np.abs(t_statistics_array)
absolute_t_stastics_descending_array_index = absolute_t_statistics_array.argsort()[::-1]
selected_idx = absolute_t_stastics_descending_array_index[:2000]
selected_vocab_2k = df_features_names['feature_names'].values[list(selected_idx)]

### Step 4: use Lasso (with logistic regression) to trim the vocabulary size to 1000

In [17]:
# Construct new DT matrix
vectorizer2k = CountVectorizer(
    ngram_range=(1, 2),               # Use 1- to 4-grams
    vocabulary = selected_vocab_2k    # use 2000 vocabulary from the previous step
)

dtm_train2k = vectorizer2k.fit_transform(df_train['review'])

> Fit a lasso model 50 times, each time on 60% of data and record the features selected by Lasso. Save the top 1000 most frequently selected words.

In [18]:
training_count = df_train.shape[0]
training_index = np.arange(training_count)
selection_freq = np.zeros(dtm_train2k.shape[1])
for i in range(50):
    np.random.seed(i)
    np.random.shuffle(training_index)
    curr_idx = training_index[:int(training_count*0.6)]
    curr_data = dtm_train2k[curr_idx]
    lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C = 0.3)
    lasso_model.fit(X=curr_data, y=df_train['sentiment'].iloc[curr_idx])
    selection_freq += np.squeeze(lasso_model.coef_) != 0

In [19]:
idx_by_freq = np.argsort(selection_freq)[::-1]
top_vocab = selected_vocab_2k[idx_by_freq[:1000]]
print(top_vocab)

['bad' 'quite' 'seemed' 'even better' 'okay' 'not recommend'
 'very disappointed' 'instead of' 'tries' 'may not' 'unwatchable' 'tells'
 'become' 'nowhere' 'supposedly' 'fascinating' 'appreciated' 'funniest'
 'much better' 'any of' 'light' 'might' 'dvd' 'mildly' 'exceptional'
 'makes' 'ages' 'see' 'nonsense' 'cardboard' 'superbly' 'suppose'
 'intense' 'although' 'stay away' 'meets' 'recommend this' 'frankly'
 'really good' 'fast paced' 'gritty' 'endless' 'lacks' 'chilling'
 'clichés' 'sequel' 'filmmakers' 'surprised' 'atmosphere' 'human'
 'pretentious' 'enjoyable' 'refreshing' 'camera' 'uninteresting'
 'make movie' 'appalling' 'be missed' 'embarrassed' 'sucked' 'that bad'
 'painful' 'movie just' 'first time' 'falls' 'forgettable' 'satisfying'
 'saving' 'delightful' 'clichéd' 'hoping' 'works' 'completely' 'rubbish'
 'journey' 'uninspired' 'way too' 'failed' 'entertaining' 'story'
 'with this' 'first saw' 'only good' 'wonder' 'subtle' 'quiet'
 'unconvincing' 'just not' 'what makes' 'dialo

In [20]:
fp = open(r'myvocab.txt', 'w')
for item in top_vocab:
    fp.write("%s\n" % item)