In [42]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [3]:
prject_folder = "~/Dropbox/UIUC_CS598_Statistical_Learning/CS598_Pratical_Statistical_Learning/Project3/proj3_data/split_1"

### Step 1: Load the training data and clean the html tags

In [4]:
df_train = pd.read_csv(os.path.join(prject_folder, "train.tsv"), sep='\t', header=0, dtype=str)
df_test_x = pd.read_csv(os.path.join(prject_folder, "test.tsv"), sep='\t', header=0, dtype=str)


In [46]:

df_test_y = pd.read_csv(os.path.join(prject_folder, "test_y.tsv"),sep='\t', header=0, dtype=int)

In [5]:
df_train['review'] = df_train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
df_test_x['review'] = df_test_x['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

### Step 2: Construct DT (DocumentTerm) matrix (maximum 4-grams). 
> The default vocabulary size (i.e., # of columns of dtm_train) is more than 30,000, bigger than the sample size n = 25,000.

In [6]:
stop_words = ["i", "me", "my", "myself", 
               "we", "our", "ours", "ourselves", 
               "you", "your", "yours", 
               "their", "they", "his", "her", 
               "she", "he", "a", "an", "and",
               "is", "was", "are", "were", 
               "him", "himself", "has", "have", 
               "it", "its", "the", "us"]

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)

dtm_train = vectorizer.fit_transform(df_train['review'])

In [7]:
df_features_names = pd.DataFrame(vectorizer.get_feature_names_out(), columns=['feature_names'])

### Step 3: use Lasso (with logistic regression) to trim the vocabulary size to 2K.

In [8]:
lasso_model = LogisticRegression(penalty='l1', solver='liblinear')

In [24]:
alphas = np.logspace(-2, 0, 10)
tuned_parameters = [{'C': alphas}]
n_folds = 5
lasso_clf = GridSearchCV(lasso_model, [{'C': alphas}], cv=n_folds, refit=False, scoring='roc_auc')
lasso_clf.fit(X=dtm_train, y=df_train['sentiment'])
best_alpha = lasso_clf.best_params_['C']

In [25]:
lasso_clf.best_score_

0.9552182239637297

In [26]:
best_alpha

0.21544346900318834

In [27]:
best_lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C = best_alpha)
best_lasso_model.fit(X=dtm_train, y=df_train['sentiment'])

In [28]:
df_best_model_coef = pd.DataFrame(np.squeeze(abs(best_lasso_model.coef_)), columns=['abs_coef']).sort_values(by=['abs_coef'], ascending=False)

In [29]:
top_2k_index = df_best_model_coef.head(2000).index

In [30]:
selected_vocab = df_features_names.loc[top_2k_index,:]['feature_names']

In [31]:
selected_vocab

289               7 10
245               4 10
213               3 10
31405            waste
18840    not recommend
             ...      
18641      nonetheless
9927           fiction
8228           douglas
13760          in fact
13160       hysterical
Name: feature_names, Length: 2000, dtype: object

### Step 4: Ridge regression

In [32]:
vectorizer2 = CountVectorizer(
    ngram_range=(1, 2)               # Use 1- to 4-grams
)

vectorizer2.fit(selected_vocab)
dtm_train2 = vectorizer2.transform(df_train['review'])


In [41]:

dtm_test2= vectorizer2.transform(df_test_x['review'])

In [49]:
dtm_train2

<25000x2113 sparse matrix of type '<class 'numpy.int64'>'
	with 2227879 stored elements in Compressed Sparse Row format>

In [34]:
vectorizer2.get_feature_names_out()

array(['10', '10 10', '10 out', ..., 'zero', 'zombie', 'zombies'],
      dtype=object)

In [50]:
ridge_model = LogisticRegression(penalty='l2', solver='liblinear')
ridge_clf = GridSearchCV(ridge_model, [{'C': alphas}], cv=n_folds, refit=False, scoring='roc_auc')
ridge_clf.fit(X=dtm_train2, y=df_train['sentiment'])
best_alpha = ridge_clf.best_params_['C']

In [51]:
ridge_clf.best_score_

0.9710659456553221

In [52]:
best_alpha = ridge_clf.best_params_['C']

In [53]:
best_alpha

0.3593813663804626

In [54]:
best_ridge_model = LogisticRegression(penalty='l2', solver='liblinear', C = best_alpha)
best_ridge_model.fit(X=dtm_train2, y=df_train['sentiment'])

> The probability estimates correspond to the probability of the class with the greater label, i.e. estimator.classes_[1] 
and thus estimator.predict_proba(X, y)[:, 1]

In [55]:

roc_auc_score(df_test_y['sentiment'].values, best_ridge_model.predict_proba(dtm_test2)[:, 1])

0.9506847747148849