### Plan of Attack
1. Load the downloaded dataset from google drive.
2. extract file and check whether the data have organized into the train and test subfolders. (check the file hierarchy)
3. Load the both training and test dataset.
4. Clean and apply text preprocessing steps.
5. Apply text vectorization method tf-idf.
6. Train the various models and test to evaluate.
7. Export required pickle files
8. Use pre-trained DistilBERT model to evaluate.

In [1]:
# load dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# extract dataset
!tar -xzf /content/drive/MyDrive/aclImdb_v1.tar.gz -C /content

# file hierarchy
train/neg/ and train/pos/id_rating.txt

test/neg/ and test/pos/id_rating.txt

In [3]:
# use os module to handle files
import os

# get postive review path
dir_path = os.path.join('aclImdb', 'train', 'pos')
print(dir_path)

aclImdb/train/pos


In [4]:
# get first file name and path
file_name = os.listdir(dir_path)[1]
file_path = os.path.join(dir_path, file_name)
print(file_path)

aclImdb/train/pos/6775_8.txt


In [5]:
# open the file and extract text
with open(file_path, 'r') as f:
  text = f.read()

In [6]:
text

'This was a very well scripted movie. Great fun if you just want a stupid film. Not great production value (ok, the sound really sucked) but the performance of Danny Masterson more than makes up for it.<br /><br />Watch this movie and laugh out loud!'

In [7]:
# text preprocessing
# lowercase
text = text.lower()
text

'this was a very well scripted movie. great fun if you just want a stupid film. not great production value (ok, the sound really sucked) but the performance of danny masterson more than makes up for it.<br /><br />watch this movie and laugh out loud!'

In [8]:
# remove html tags
import re
text = re.sub(r'<[^>]+>', '', text)
text

'this was a very well scripted movie. great fun if you just want a stupid film. not great production value (ok, the sound really sucked) but the performance of danny masterson more than makes up for it.watch this movie and laugh out loud!'

In [9]:
# remove punctuation
import string
text = text.translate(str.maketrans('', '', string.punctuation))
text

'this was a very well scripted movie great fun if you just want a stupid film not great production value ok the sound really sucked but the performance of danny masterson more than makes up for itwatch this movie and laugh out loud'

In [10]:
# remove extra spaces
text = ' '.join(text.split())
text

'this was a very well scripted movie great fun if you just want a stupid film not great production value ok the sound really sucked but the performance of danny masterson more than makes up for itwatch this movie and laugh out loud'

In [11]:
# tokenize before removing stopwords and lemmatization
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
tokens[:5]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['this', 'was', 'a', 'very', 'well']

In [12]:
# remove stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
tokens[:5]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['well', 'scripted', 'movie', 'great', 'fun']

In [13]:
# apply lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
tokens = [wl.lemmatize(t) for t in tokens]
tokens[:5]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['well', 'scripted', 'movie', 'great', 'fun']

In [14]:
clean_text = ' '.join(tokens)
clean_text[:25]

'well scripted movie great'

In [15]:
# load the full dataset
def load_dataset(base_dir):
  # create lists of texts and corresponding labels
  texts = []
  labels = []
  # extract files of both positive and negative sentiments subfolders
  for sentiment in ['pos', 'neg']:
    dir_path = os.path.join(base_dir, sentiment)
    for file_name in os.listdir(dir_path):
      # check whether it is textfile or not
      if file_name.endswith('.txt'):
        file_path = os.path.join(dir_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
          # extract each file content and add it into list
          txt = f.read()
          texts.append(txt)
          labels.append(1 if sentiment == 'pos' else 0)
  return texts, labels

train_texts, train_labels = load_dataset('aclImdb/train')
test_texts, test_labels = load_dataset('aclImdb/test')

In [16]:
len(train_texts), len(test_texts)

(25000, 25000)

In [17]:
# text preprocessing step
def preprocess_text(text):
  # lowercase
  text = text.lower()

  # remove HTML tags
  text = re.sub(r'<[^>]+>', '', text)

  # remove punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))

  # tokenize before lemmatization
  tokens = word_tokenize(text)

  # lemmatize each word
  tokens = [wl.lemmatize(token) for token in tokens]

  # join back the string
  text = ' '.join(tokens)

  # remove extra spaces
  text = ' '.join(text.split())

  return text

train_clean_texts = [preprocess_text(t) for t in train_texts]
test_clean_texts = [preprocess_text(t) for t in test_texts]

In [18]:
# apply tfidf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=30000, # vocabulary size
    ngram_range=(1, 2), # unigrams + bigrams
    stop_words='english'
)

In [19]:
# transform data
X_train = tfidf.fit_transform(train_clean_texts)
X_test = tfidf.transform(test_clean_texts)

In [20]:
X_train.shape, X_test.shape

((25000, 30000), (25000, 30000))

In [21]:
# use svm model (to get probability value, we need CalibratedClassifierCV)
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svm = CalibratedClassifierCV(LinearSVC(), cv=3)
svm.fit(X_train, train_labels)

In [39]:
# model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [26]:
svm_preds = svm.predict(X_test)
svm_proba = svm.predict_proba(X_test)

In [27]:
svm_proba

array([[0.00475686, 0.99524314],
       [0.01295451, 0.98704549],
       [0.62829338, 0.37170662],
       ...,
       [0.99779185, 0.00220815],
       [0.93107786, 0.06892214],
       [0.92734981, 0.07265019]])

In [28]:
svm_proba = svm_proba[:, 1]
svm_proba

array([0.99524314, 0.98704549, 0.37170662, ..., 0.00220815, 0.06892214,
       0.07265019])

In [34]:
accuracy_score(test_labels, svm_preds)

0.87712

In [31]:
confusion_matrix(test_labels, svm_preds)

array([[11108,  1392],
       [ 1680, 10820]])

In [40]:
print(classification_report(test_labels, svm_preds))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     12500
           1       0.89      0.87      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [35]:
# use logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, train_labels)

In [36]:
lr_preds = lr.predict(X_test)
lr_proba = lr.predict_proba(X_test)[:, 1]

In [37]:
print('Logistic Regression Accuracy:', accuracy_score(test_labels, lr_preds))

Logistic Regression Accuracy: 0.87936


In [38]:
confusion_matrix(test_labels, lr_preds)

array([[10990,  1510],
       [ 1506, 10994]])

In [41]:
print(classification_report(test_labels, lr_preds))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [42]:
# now check the performance of model on the new reviews that is inside unsup subfolder
unsup_path = 'aclImdb/train/unsup'
sample_file = os.listdir(unsup_path)[5]
sample_path = os.path.join(unsup_path, sample_file)
print(sample_path)

aclImdb/train/unsup/5144_0.txt


In [43]:
# open the file and extract text
with open(sample_path, 'r') as f:
  text = f.read()

In [44]:
text

'Here\'s one of those gloriously godawful "you gotta be kiddin\' me!"-type of plodding and maladroit low-rent no-budget psycho sicko gore flicks that played on double bills in numerous drive-ins and grindhouses in the splendidly sleazy 70\'s. Indeed, this delectably dreadful dreck was paired with the equally atrocious, yet somehow oddly endearing dippy hippie terror trip-out "The Curse of the Headless Horseman" on a twice-the-tacky-terror twin feature offering that must have caused anyone who saw them together to either make an immediate beeline for the exit door 15-odd minutes into the first film or slump into their seats in a comatose stupor after the ending credits of the second picture finished rolling.<br /><br />A mother-fixated bargain basement Norman Bates-like oedipal wreck homicidal crazy brutally butchers assorted supremely irritating women at an especially dingy and rundown beachside carnival in upstate New York. That\'s it for the threadbare plot -- and said skimpy story i

In [45]:
clean_text = preprocess_text(text)
clean_text

'here one of those gloriously godawful you got ta be kiddin metype of plodding and maladroit lowrent nobudget psycho sicko gore flick that played on double bill in numerous driveins and grindhouses in the splendidly sleazy 70 indeed this delectably dreadful dreck wa paired with the equally atrocious yet somehow oddly endearing dippy hippie terror tripout the curse of the headless horseman on a twicethetackyterror twin feature offering that must have caused anyone who saw them together to either make an immediate beeline for the exit door 15odd minute into the first film or slump into their seat in a comatose stupor after the ending credit of the second picture finished rollinga motherfixated bargain basement norman bateslike oedipal wreck homicidal crazy brutally butcher assorted supremely irritating woman at an especially dingy and rundown beachside carnival in upstate new york thats it for the threadbare plot and said skimpy story is related by cinematic blunder wonder triple threat 

In [46]:
vector = tfidf.transform([clean_text])

svm_pred = svm.predict(vector)[0]
print('SVM prediction:', 'positive' if svm_pred == 1 else 'negative')

lr_pred = lr.predict(vector)[0]
print('LR prediction:', 'positive' if lr_pred == 1 else 'negative')

SVM prediction: negative
LR prediction: negative


In [47]:
# save tfidf and models
import pickle

with open('tfidf.pkl', 'wb') as f:
  pickle.dump(tfidf, f)

with open('svm_model.pkl', 'wb') as f:
  pickle.dump(svm, f)

with open('lr_model.pkl', 'wb') as f:
  pickle.dump(lr, f)

In case of distilBERT, no need of lowercase, punctuation/stopwords removal, lemmatization/stemming or even tokenization.

In [None]:
# use distilBERT tokenizer
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# tokensization function
def tokenize(batch):
  return tokenizer(
      batch['text'],
      truncation=True,
      padding='max_length',
      max_length=256
  )

In [None]:
# convert into HuggingFace Dataset object
from datasets import Dataset

train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})
test_dataset = Dataset.from_dict({
    'text': test_texts,
    'labels': test_labels
})

In [None]:
# apply tokenization
train_encoded = train_dataset.map(tokenize, batched=True)
test_encoded = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# set the dataset format for pytorch
train_encoded.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)
test_encoded.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)

In [None]:
# load DistilBERT model
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert-imdb",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.01,
    push_to_hub=True
)

In [None]:
# login into huggingface
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=test_encoded
)

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6932261b-6d0155c50f1df14b183306c0;6ad0f83d-f6e6-401b-9596-6b28853ae165)

Invalid username or password.

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
import requests

url = "https://programming-test-movie-sentiment-api.hf.space/predict_bert"
data = {"text": "I loved the movie!"}

response = requests.post(url, json=data)
print(response.json())

{'model': 'distilbert', 'label': 'positive', 'score': 0.9998749494552612}
