In [206]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [60]:
prject_folder = "~/Dropbox/UIUC_CS598_Statistical_Learning/CS598_Pratical_Statistical_Learning/Project3/proj3_data/split_1"

### Step 1: Load the training data and clean the html tags

In [61]:
df_train = pd.read_csv(os.path.join(prject_folder, "train.tsv"), sep='\t', header=0, dtype=str)
df_test_x = pd.read_csv(os.path.join(prject_folder, "test.tsv"), sep='\t', header=0, dtype=str)


In [62]:

df_test_y = pd.read_csv(os.path.join(prject_folder, "test_y.tsv"),sep='\t', header=0, dtype=int)

In [63]:
df_train['review'] = df_train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
df_test_x['review'] = df_test_x['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [98]:
positive_indices = df_train[df_train['sentiment']=='1'].index.values
negaive_indices = df_train[df_train['sentiment']=='0'].index.values

In [104]:
num_pos = len(positive_indices)
num_neg = len(negaive_indices)

### Step 2: Construct DT (DocumentTerm) matrix (maximum 4-grams). 
> The default vocabulary size (i.e., # of columns of dtm_train) is more than 30,000, bigger than the sample size n = 25,000.

In [64]:
stop_words = ["i", "me", "my", "myself", 
               "we", "our", "ours", "ourselves", 
               "you", "your", "yours", 
               "their", "they", "his", "her", 
               "she", "he", "a", "an", "and",
               "is", "was", "are", "were", 
               "him", "himself", "has", "have", 
               "it", "its", "the", "us"]

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)

dtm_train = vectorizer.fit_transform(df_train['review'])

In [7]:
df_features_names = pd.DataFrame(vectorizer.get_feature_names_out(), columns=['feature_names'])

## Step 3: Two sample t-test calculation
Filter the vocabulary to include only terms I could readily interpret. I employed a straightforward screening method: the two-sample t-test. This test compares one-dimensional observations from two groups, denoted as:$X_1, X_2, ..., X_m$, $Y_1, Y_2, ..., Y_n$

The goal is to determine whether the X population and the Y population share the same mean. The two-sample t-statistic is computed as:
$$
\frac{\bar X - \bar Y}{\sqrt{\frac{\sigma_{X}^2}{m} + \frac{\sigma_{Y}^2}{n}}}
$$
where $\sigma_{X}^2$, $\sigma_{Y}^2$ denote the sample variance of $X$ and $Y$

> Suppose we have $m$ positive reviews and $n$ negative reviews. For a given word, $X_i$'s represent the measurements associated with each of the positive reviews, and $Y_j$'s represent the measurements corresponding to each of the negative reviews. The goal is to assess the significance of differences in measurements between the two sentiment groups for each word via t-test.

In [78]:
dtm_train_array= dtm_train.toarray()

In [107]:
t_statistics_array = np.zeros(len(df_features_names))

In [108]:
for i in range(len(df_features_names)):
    # positive mean/variance
    x_bar = np.mean(dtm_train_array[positive_indices, i])
    x_var = np.var(dtm_train_array[positive_indices, i])

    # negative mean/variance
    y_bar = np.mean(dtm_train_array[negaive_indices, i])
    y_var = np.var(dtm_train_array[negaive_indices, i])

    t_statistics_array[i] = (x_bar - y_bar)/np.sqrt(x_var/num_pos + y_var/num_neg)

In [117]:
t_statistics_descending_array = t_statistics_array.argsort()[::-1]

> Top 50 **positive words**

In [119]:
df_features_names['feature_names'].values[t_statistics_descending_array[:50]]

array(['great', 'excellent', 'best', 'of best', 'wonderful',
       'one of best', 'perfect', 'love', 'amazing', 'superb', 'loved',
       'beautiful', 'well', 'favorite', 'brilliant', 'life', 'must see',
       'highly', 'also', 'very', 'fantastic', 'one of', 'performance',
       'beautifully', 'both', 'always', 'enjoyed', 'wonderfully',
       'very well', 'well worth', 'strong', 'today', '8 10',
       'highly recommend', 'this great', 'performances', 'young',
       'touching', '10 10', 'highly recommended', 'years', '7 10',
       'powerful', 'perfectly', 'definitely', 'terrific', 'moving',
       'love this', 'well as', '8'], dtype=object)

> Top **Negative words**

In [120]:
df_features_names['feature_names'].values[t_statistics_descending_array[-50:]]

array(['mess', 'pathetic', 'unless', 'laughable', 'annoying', 'dull',
       'any', 'supposed to be', 'oh', 'waste time', 'thing', 'pointless',
       'cheap', 'money', 'worst movie', 'script', "don't", 'wasted',
       'lame', 'not even', 'waste of time', 'plot', 'ridiculous', 'avoid',
       'why', 'supposed to', 'just', 'crap', 'poorly', 'at all',
       'supposed', 'one of worst', 'so bad', 'acting', 'even', 'horrible',
       'stupid', 'minutes', 'nothing', 'poor', 'of worst', 'boring', 'no',
       'waste of', 'worse', 'terrible', 'awful', 'waste', 'worst', 'bad'],
      dtype=object)

> Select 2000 words with top absolute t_statistics

In [124]:
absolute_t_statistics_array = np.abs(t_statistics_array)

In [125]:
absolute_t_stastics_descending_array_index = absolute_t_statistics_array.argsort()[::-1]

In [129]:
selected_vocab_2k = df_features_names['feature_names'].values[absolute_t_stastics_descending_array_index[:2000]]

### Step 4: use Lasso (with logistic regression) to trim the vocabulary size iteratively

In [287]:
lasso_model = LogisticRegression(penalty='l1', solver='liblinear')

In [288]:
vectorizer2k = CountVectorizer(
    ngram_range=(1, 2)               # Use 1- to 4-grams
)

vectorizer2k.fit(selected_vocab_2k)

In [None]:
dtm_train2k = vectorizer2k.transform(df_train['review'])

In [190]:
alphas = np.logspace(-2, 1, 10)
lass_models = []
for alpha in alphas:
    lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C = alpha)
    lasso_model.fit(X=dtm_train2k, y=df_train['sentiment'])
    lass_models.append(lasso_model)

tuned_parameters = [{'C': alphas}]
n_folds = 5
lasso_clf = GridSearchCV(lasso_model, [{'C': alphas}], cv=n_folds, refit=False, scoring='roc_auc')
lasso_clf.fit(X=dtm_train2k, y=df_train['sentiment'])
best_alpha = lasso_clf.best_params_['C']

In [191]:
lasso_clf.best_score_

0.9555493607259746

In [192]:
best_alpha

0.46415888336127775

In [357]:
best_lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C = 0.19)
best_lasso_model.fit(X=dtm_train2k, y=df_train['sentiment'])

In [358]:
df_best_model_coef = pd.DataFrame(np.squeeze(abs(best_lasso_model.coef_)), columns=['abs_coef']).sort_values(by=['abs_coef'], ascending=False)

In [359]:
df_best_model_coef[df_best_model_coef['abs_coef']>0]

Unnamed: 0,abs_coef
1027,1.712660
1735,1.671854
1817,1.490252
977,1.467570
596,1.363797
...,...
309,0.000993
1049,0.000912
1509,0.000756
1679,0.000492


In [360]:
top_indices = df_best_model_coef[df_best_model_coef['abs_coef']>0.0].index

In [361]:
top_vocab = selected_vocab_2k[top_indices]

In [362]:
len(top_vocab)

987

In [363]:
top_vocab

array(['surprisingly', 'dreams', 'without any', 'project', 'portrayal',
       'cash in', 'to sit', 'rest of', 'worse than', 'film shows',
       'release', 'unlike', 'to work with', 'doing', 'perfectly cast',
       'musical', 'obvious', '0 10', 'original', 'br 9 10', 'only',
       'br br worst', 'entertaining', 'excellent movie', 'br 1', 'do',
       'worst', 'realistic', 'nothing to do with', 'spectacular',
       'seriously', 'drivel', 'hooked', 'for no', 'even for', 'music',
       'this gem', 'sounded', 'waste of time money', 'job',
       'poorly acted', 'unintentional', '7 10', 'worst part', 'images',
       'wasted time', 'avoid at all costs', 'spent', 'camcorder',
       'that supposed to', 'neither', 'waste time with this',
       'this great film', 'clichéd', 'avoid this movie', 'although',
       'little film', '4 out of 10', 'disgusting', 'costs', 'notch',
       'era', '90 minutes', 'only good thing about', 'br br 8 10',
       'of worst movies ever', 'this piece of', '

### Step 4: Ridge regression

In [364]:
vectorizer2 = CountVectorizer(
    ngram_range=(1, 4)               # Use 1- to 4-grams
)

vectorizer2.fit(top_vocab)
dtm_train2 = vectorizer2.transform(df_train['review'])


In [365]:
dtm_test2= vectorizer2.transform(df_test_x['review'])

In [366]:
dtm_train2

<25000x1336 sparse matrix of type '<class 'numpy.int64'>'
	with 1565106 stored elements in Compressed Sparse Row format>

In [367]:
vectorizer2.get_feature_names_out()

array(['10', '10 10', '10 out', ..., 'young man', 'yourself',
       'yourself favor'], dtype=object)

In [386]:
alphas = np.linspace(0.01, 0.5, 20)
ridge_model = LogisticRegression(penalty='l2', solver='liblinear')
ridge_clf = GridSearchCV(ridge_model, [{'C': alphas}], cv=n_folds, refit=False, scoring='roc_auc')
ridge_clf.fit(X=dtm_train2, y=df_train['sentiment'])
best_alpha = ridge_clf.best_params_['C']

In [387]:
ridge_clf.best_score_

0.946826279956344

In [388]:
best_alpha = ridge_clf.best_params_['C']

In [389]:
best_alpha

0.29368421052631577

In [390]:
best_ridge_model = LogisticRegression(penalty='l2', solver='liblinear', C = best_alpha)
best_ridge_model.fit(X=dtm_train2, y=df_train['sentiment'])

> The probability estimates correspond to the probability of the class with the greater label, i.e. estimator.classes_[1] 
and thus estimator.predict_proba(X, y)[:, 1]

In [391]:
roc_auc_score(df_test_y['sentiment'].values, best_ridge_model.predict_proba(dtm_test2)[:, 1])

0.9484156249279397