In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LassoCV, Lasso, RidgeClassifier, RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
prject_folder = "proj3_data/split_"

### Step 1: Load the training data and clean the html tags

In [3]:
df_train = pd.concat([pd.read_csv(os.path.join(prject_folder+str(fold), "train.tsv"), sep='\t', header=0, dtype=str) for fold in range(1, 6)])

In [4]:
df_train.shape

(125000, 3)

In [5]:
df_train['review'] = df_train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [6]:
positive_indices = df_train[df_train['sentiment']=='1'].index.values
negative_indices = df_train[df_train['sentiment']=='0'].index.values

In [7]:
num_pos = len(positive_indices)
num_neg = len(negative_indices)

### Step 2: Construct DT (DocumentTerm) matrix (maximum 4-grams).
> The default vocabulary size (i.e., # of columns of dtm_train) is more than 30,000, bigger than the sample size n = 25,000.

In [8]:
stop_words = ["i", "me", "my", "myself",
               "we", "our", "ours", "ourselves",
               "you", "your", "yours",
               "their", "they", "his", "her",
               "she", "he", "a", "an", "and",
               "is", "was", "are", "were",
               "him", "himself", "has", "have",
               "it", "its", "the", "us", "br"]

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)

dtm_train = vectorizer.fit_transform(df_train['review'])

In [9]:
dtm_train.shape

(125000, 31536)

In [10]:
df_features_names = pd.DataFrame(vectorizer.get_feature_names_out(), columns=['feature_names'])

## Step 3: Two sample t-test calculation
Filter the vocabulary to include only terms I could readily interpret. I employed a straightforward screening method: the two-sample t-test. This test compares one-dimensional observations from two groups, denoted as:$X_1, X_2, ..., X_m$, $Y_1, Y_2, ..., Y_n$

The goal is to determine whether the X population and the Y population share the same mean. The two-sample t-statistic is computed as:
$$
\frac{\bar X - \bar Y}{\sqrt{\frac{\sigma_{X}^2}{m} + \frac{\sigma_{Y}^2}{n}}}
$$
where $\sigma_{X}^2$, $\sigma_{Y}^2$ denote the sample variance of $X$ and $Y$

> Suppose we have $m$ positive reviews and $n$ negative reviews. For a given word, $X_i$'s represent the measurements associated with each of the positive reviews, and $Y_j$'s represent the measurements corresponding to each of the negative reviews. The goal is to assess the significance of differences in measurements between the two sentiment groups for each word via t-test.

In [11]:
dtm_train_squared = dtm_train.copy()
dtm_train_squared.data **= 2

In [12]:
pos_mean = dtm_train[positive_indices].mean(axis=0)
pos_var = dtm_train_squared[positive_indices].mean(axis=0) - np.square(dtm_train[positive_indices].mean(axis=0))

In [13]:
neg_mean = dtm_train[negative_indices].mean(axis=0)
neg_var = dtm_train_squared[negative_indices].mean(axis=0) - np.square(dtm_train[negative_indices].mean(axis=0))

In [14]:
t_statistics_array = (pos_mean - neg_mean) / np.sqrt(pos_var / num_pos + neg_var / num_neg)

In [15]:
t_statistics_array = np.squeeze(np.asarray(t_statistics_array))
t_statistics_array.shape

(31536,)

In [16]:
t_statistics_descending_array = t_statistics_array.argsort()[::-1]

> Top 50 **positive words**

In [17]:
df_features_names['feature_names'].values[t_statistics_descending_array[:50]]

array(['great', 'excellent', 'wonderful', 'of best', 'one of best',
       'best', 'amazing', 'superb', 'love', 'well', 'loved', 'must see',
       'brilliant', 'one of', 'today', 'well worth', 'very', 'highly',
       'perfect', 'enjoyed', 'outstanding', 'performance', 'wonderfully',
       'young', 'also', 'life', 'this great', 'beautiful', 'strong',
       'unique', 'performances', 'both', 'fantastic', 'own', 'definitely',
       'beautifully', 'top notch', 'favorite', 'powerful', 'excellent as',
       'sweet', 'touching', 'war', 'classic', 'well as', 'awesome',
       'gives', 'story of', 'moving', 'best of'], dtype=object)

> Top **Negative words**

In [18]:
df_features_names['feature_names'].values[t_statistics_descending_array[-50:]]

array(['cheap', 'fails', 'instead', 'bad acting', 'any', 'waste time',
       'badly', 'bother', 'annoying', 'unless', 'thing', 'wasted',
       'avoid this', 'poorly', 'so bad', 'no sense', 'reason',
       'supposed to be', 'crap', 'just', 'lame', 'dull', 'one of worst',
       'waste of time', 'why', 'avoid', 'minutes', 'laughable', 'plot',
       'ridiculous', 'of worst', 'not even', 'stupid', 'supposed to',
       'horrible', 'supposed', 'even', 'at all', 'waste of', 'nothing',
       'acting', 'worse', 'poor', 'boring', 'no', 'terrible', 'awful',
       'waste', 'worst', 'bad'], dtype=object)

> Select 2000 words with top absolute t_statistics

In [19]:
absolute_t_statistics_array = np.abs(t_statistics_array)

In [20]:
absolute_t_stastics_descending_array_index = absolute_t_statistics_array.argsort()[::-1]

In [21]:
selected_idx = absolute_t_stastics_descending_array_index[:2000]

In [22]:
selected_vocab_2k = df_features_names['feature_names'].values[list(selected_idx)]

### Step 4: use Lasso (with logistic regression) to trim the vocabulary size iteratively

In [23]:
lasso_model = LogisticRegression(penalty='l1', solver='liblinear')

In [24]:
vectorizer2k = CountVectorizer(
    ngram_range=(1, 4),               # Use 1- to 4-grams
    vocabulary = selected_vocab_2k
)

# vectorizer2k.fit(selected_vocab_2k)

In [25]:
dtm_train2k = vectorizer2k.transform(df_train['review'])

In [26]:
dtm_train2k.shape

(125000, 2000)

In [27]:
training_count = df_train.shape[0]
training_index = np.arange(training_count)
selection_freq = np.zeros(dtm_train2k.shape[1])
for i in range(50):
    np.random.seed(i)
    np.random.shuffle(training_index)
    curr_idx = training_index[:int(training_count*0.6)]
    curr_data = dtm_train2k[curr_idx]
    lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C = 0.2)
    lasso_model.fit(X=curr_data, y=df_train['sentiment'].iloc[curr_idx])
    selection_freq += np.squeeze(lasso_model.coef_) != 0

In [41]:
idx_by_freq = np.argsort(selection_freq)[::-1]

In [55]:
top_vocab = selected_vocab_2k[idx_by_freq[:1000]]

In [68]:
print(top_vocab)

['going for' 'caught' 'cash in' 'been' 'movie no' 'power of' 'insult to'
 'may not' 'really good' 'explained' 'rip off' 'wasting' 'dvd' 'because'
 'often' 'most' 'minutes into' 'useless' 'should' 'movie just' 'want'
 'unintentional' 'recommended' 'frank' 'unbelievable' 'unconvincing'
 'hoping' 'embarrassed' 'otherwise' 'shallow' 'nicely' 'series'
 'worth seeing' 'affection' 'disjointed' 'loved this'
 'highly recommend this' 'even get' 'prince' 'fears' 'fascinating'
 'flawless' 'lost interest' 'deliciously' 'favorite movies' 'basically'
 'revolting' 'not that' 'barbara' 'first saw' 'share' 'very good'
 'soundtrack' 'insult' 'original' 'tells' 'impact' 'same time' 'miss this'
 'movie ever' 'no chemistry' 'history' 'seconds' 'dreck' 'edge' 'timeless'
 'country' 'way too' 'emotions' 'with this' 'played' 'past' 'drivel'
 'very bad' 'not good' 'tense' 'this crap' 'brings' 'tries to be' 'gore'
 'point' 'hilarious' 'warm' 'raw' 'plastic' 'many' 'tedious' 'by saying'
 'disappointing' 'does noth

In [74]:
fp = open(r'myvocab.txt', 'w')
for item in top_vocab:
    fp.write("%s\n" % item)

### Step 4: Ridge regression

In [58]:
vectorizer2 = CountVectorizer(
    ngram_range=(1, 4),               # Use 1- to 4-grams
    vocabulary=top_vocab
)

# vectorizer2.fit(top_vocab)
dtm_train2 = vectorizer2.transform(df_train['review'])


In [59]:
alphas = np.linspace(1, 10, 20)
ridge_model = LogisticRegression(penalty='l2', solver='liblinear')
ridge_clf = GridSearchCV(ridge_model, [{'C': alphas}], cv=10, refit=False, scoring='roc_auc')
ridge_clf.fit(X=dtm_train2, y=df_train['sentiment'])
best_alpha = ridge_clf.best_params_['C']

In [60]:
ridge_clf.best_score_, best_alpha

(0.9631540356870454, 3.3684210526315788)

In [61]:
best_ridge_model = LogisticRegression(penalty='l2', solver='liblinear', C = best_alpha)
best_ridge_model.fit(X=dtm_train2, y=df_train['sentiment'])

> The probability estimates correspond to the probability of the class with the greater label, i.e. estimator.classes_[1]
and thus estimator.predict_proba(X, y)[:, 1]

In [62]:
auc_score_list = []
for fold in range(1, 6):
    df_test_x = pd.read_csv(os.path.join(prject_folder+str(fold), "test.tsv"), sep='\t', header=0, dtype=str)
    df_test_y = pd.read_csv(os.path.join(prject_folder+str(fold), "test_y.tsv"), sep='\t', header=0, dtype=str)
    df_test_x['review'] = df_test_x['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    dtm_test2= vectorizer2.transform(df_test_x['review'])
    auc_score_list.append(roc_auc_score(df_test_y['sentiment'].values, best_ridge_model.predict_proba(dtm_test2)[:, 1]))

In [63]:
auc_score_list

[0.9608065563390693,
 0.960927405211607,
 0.9604661843656356,
 0.9610337286615864,
 0.9606003280894073]

### Evaluation

In [3]:
import time

In [21]:
myvocab = []
with open(r'myvocab.txt', 'r') as fp:
    for line in fp:
        x = line[:-1]
        myvocab.append(x)

In [26]:
vectorizer = CountVectorizer(
    ngram_range=(1, 4),
    vocabulary=myvocab
)
ridge_model = LogisticRegression(penalty='l2', solver='liblinear', C=0.3)
    

In [27]:
def read_and_transform_data(data_path, vectorizer):
    df_train = pd.read_csv(os.path.join(data_path, "train.tsv"), sep='\t', header=0, dtype=str)
    df_train['review'] = df_train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    dtm_train = vectorizer.transform(df_train['review'])
    train_y = df_train['sentiment']
    df_test_x = pd.read_csv(os.path.join(data_path, "test.tsv"), sep='\t', header=0, dtype=str)
    df_test_x['review'] = df_test_x['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    dtm_test = vectorizer.transform(df_test_x['review'])
    return dtm_train, train_y, dtm_test

In [28]:
auc_score_list = []
for fold in range(1, 6):
    data_path = prject_folder + str(fold)
    dtm_train, train_y, dtm_test = read_and_transform_data(data_path, vectorizer)
    time0 = time.time()
    ridge_model.fit(X=dtm_train, y=train_y)
    time1 = time.time()
    y_proba = ridge_model.predict_proba(dtm_test)[:, 1]
    time2 = time.time()
    df_test_y = pd.read_csv(os.path.join(data_path, "test_y.tsv"), sep='\t', header=0, dtype=str)
    auc_score_list.append(roc_auc_score(df_test_y['sentiment'].values, y_proba))
    print("fold", fold, "training_time:", time1-time0, "pred_time:", time2-time1, "auc_score:", auc_score_list[-1])

fold 1 training_time: 0.16851472854614258 pred_time: 0.003536224365234375 auc_score: 0.9524941463566892
fold 2 training_time: 0.18445301055908203 pred_time: 0.001817941665649414 auc_score: 0.9527457485222572
fold 3 training_time: 0.1652388572692871 pred_time: 0.0012230873107910156 auc_score: 0.9524304905599402
fold 4 training_time: 0.16896891593933105 pred_time: 0.0016388893127441406 auc_score: 0.9529205970691821
fold 5 training_time: 0.17342901229858398 pred_time: 0.002187013626098633 auc_score: 0.9526752215691583


In [29]:
auc_score_list

[0.9524941463566892,
 0.9527457485222572,
 0.9524304905599402,
 0.9529205970691821,
 0.9526752215691583]

In [25]:
for fold in range(1, 2):
    data_path = prject_folder + str(fold)
    dtm_train, train_y, dtm_test = read_and_transform_data(data_path, vectorizer)
    
    alphas = np.logspace(-2, 0, 10)
    ridge_model = LogisticRegression(penalty='l2', solver='liblinear')
    ridge_clf = GridSearchCV(ridge_model, [{'C': alphas}], cv=10, refit=False, scoring='roc_auc')
    ridge_clf.fit(X=dtm_train, y=train_y)
    best_alpha = ridge_clf.best_params_['C']
    print(best_alpha)
    
    ridge_model = LogisticRegression(penalty='l2', solver='liblinear', C=best_alpha)
    ridge_model.fit(X=dtm_train, y=train_y)
    y_proba = ridge_model.predict_proba(dtm_test)[:, 1]
    df_test_y = pd.read_csv(os.path.join(data_path, "test_y.tsv"), sep='\t', header=0, dtype=str)
    auc_score = roc_auc_score(df_test_y['sentiment'].values, y_proba)
    print(auc_score)

0.3593813663804626
0.9525136984479116


In [None]:
# manually went over vocab list and deleted words like "had", "then", "by", "do", "or" and names