In [0]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# 1 - Data Preprocessing

## 1.1. Download Dataset

In [0]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Bound to `gdrive` so that the `google.colab.drive` module imported in the
# first cell is not rebound to a PyDrive client.
gdrive = GoogleDrive(gauth)

# Google Drive file id -> local file name the content is written to.
# (A dict + loop replaces the two copy-pasted stanzas and avoids shadowing
# the builtin `id`.)
DATASET_FILES = {
    '1vF3FqgBC1Y-RPefeVmY8zetdZG1jmHzT': 'imdb_train.csv',
    '1XhaV8YMuQeSwozQww8PeyiWMJfia13G6': 'imdb_test.csv',
}
for file_id, local_name in DATASET_FILES.items():
    gdrive.CreateFile({'id': file_id}).GetContentFile(local_name)

# Read the two downloaded CSV files into DataFrames.
import pandas as pd
df_train = pd.read_csv("imdb_train.csv")
df_test = pd.read_csv("imdb_test.csv")

# Extract the 'review' and 'sentiment' columns as plain Python lists;
# every later cell works on these lists rather than on the DataFrames.
reviews_train = df_train['review'].tolist()
sentiments_train = df_train['sentiment'].tolist()
reviews_test = df_test['review'].tolist()
sentiments_test = df_test['sentiment'].tolist()

# Report how many reviews each split contains.
print("Training set number:",len(reviews_train))
print("Testing set number:",len(reviews_test))

## 1.2. Preprocess data

In [0]:
# Please comment your code
# Pipeline: strip the HTML markup, lowercase the text, delete punctuation, tokenize, remove stop words, then lemmatize; preprocess() chains these steps together.

import re
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import sys
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lower_case(x):
    """Return the string ``x`` converted to lowercase."""
    return x.lower()

def delete_punct(x):
    """Remove every character that is neither a word character nor whitespace.

    Punctuation such as ``. , : ;`` is dropped; letters, digits, underscores
    and whitespace are kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', x)

def get_plain_text(x):
  """Return the text content of ``x`` with any HTML markup removed.

  The parser is named explicitly: without it BeautifulSoup picks whichever
  parser happens to be installed (and emits a warning), so the same notebook
  could tokenize reviews differently on different machines.
  """
  return BeautifulSoup(x, "html.parser").get_text()

def tokenize(x):
  """Split the string ``x`` into a list of tokens using NLTK's ``word_tokenize``."""
  return word_tokenize(x)

def lemmatize(text,lemmatizer):
  """Lemmatize every token in ``text`` as a verb (POS tag ``"v"``).

  ``lemmatizer`` must expose ``lemmatize(word, pos)``; the result is a new
  list with one lemma per input token, in the same order.
  """
  lemmas = []
  for token in text:
    lemmas.append(lemmatizer.lemmatize(token, "v"))
  return lemmas

def remove_stopword(text,stop_words):
  """Return the tokens of ``text`` that are not stop words.

  ``stop_words`` may be any iterable of strings. It is converted to a set
  once per call so each membership test is O(1) instead of a linear scan
  over the stop-word list for every token.
  """
  if isinstance(stop_words, (set, frozenset)):
    stop_set = stop_words
  else:
    stop_set = set(stop_words)
  return [token for token in text if token not in stop_set]

def preprocess(text,stop_words,lemmatizer):
    """Run the full preprocessing pipeline on one raw review.

    Steps, in order: strip HTML, lowercase, delete punctuation, tokenize,
    remove stop words, lemmatize.

    Returns the list of lemmatized tokens (not a joined string). The original
    ``" ".join(lemmatized)`` statement discarded its result and has been
    removed; the return value is unchanged.
    """
    text = get_plain_text(text)
    text = lower_case(text)
    text = delete_punct(text)
    words = tokenize(text)
    words = remove_stopword(words,stop_words)
    return lemmatize(words,lemmatizer)

# Shared resources for preprocess(): one lemmatizer instance and the English stop-word list.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")

print("starting processing ")
# NOTE(review): reviews_train / reviews_test are overwritten with their
# preprocessed form (lists of tokens). Re-running this cell would therefore
# pass token lists, not raw strings, back into preprocess().
reviews_train =[preprocess(review,stop_words,lemmatizer) for review in reviews_train]
reviews_test =[preprocess(review,stop_words,lemmatizer) for review in reviews_test]
print("successful")


In [0]:
# Sanity check: show the first 10 preprocessed training reviews.
print(reviews_train[:10])

# 2 - Model Implementation

## 2.1. Word Embeddings

In [0]:
# Please comment your code
# Unlike BERT, we do not pretrain a model first and then fine-tune it; we train the model directly on our classification task.
# For that reason we do not use word2vec, which (like BERT) extracts information from plain text without a classification objective.
# We use a fastText-style simple linear model and take its nn.Embedding weights as our final word embedding vectors.

### 2.1.1. Data Preprocessing for Word Embeddings

In [0]:
# First collect every word that occurs at least 10 times across all reviews. Index 0 is reserved for the padding token, so regular word indices start at 1.
# Then convert every review to its list of word ids: a word that is not in the vocabulary gets id 0 (pad); reviews longer than the fixed review length are truncated and shorter ones are right-padded with id 0.


from collections import Counter#a counter uesd for get the occure times of a word

def get_average_length(all_reviews):
  """Return the mean number of tokens per review, rounded down.

  Returns 0 for an empty collection instead of raising ZeroDivisionError.
  """
  if not all_reviews:
    return 0
  total_tokens = sum(len(review) for review in all_reviews)
  return total_tokens//len(all_reviews)

def binarySentiment(sentiment):
    """Encode sentiment labels as integers: ``'pos'`` -> 1, anything else -> 0."""
    encoded = []
    for sent in sentiment:
        if sent=='pos':
            encoded.append(1)
        else:
            encoded.append(0)
    return encoded

def reviews_to_ids(word2id,data,max_length=None):
  """Convert tokenized reviews into fixed-length lists of word ids.

  Args:
    word2id: mapping from word to integer id; words missing from it map to 0.
    data: iterable of reviews, each a sequence of tokens.
    max_length: length every output row is truncated / right-padded (with 0)
      to. Defaults to the notebook-level ``average_reviews_length``, which is
      what the original two-argument version always used.

  Returns:
    A list with one list of ``max_length`` ids per review.
  """
  if max_length is None:
    max_length=average_reviews_length
  reviews_ids=[]
  for review in data:
    # unknown words -> 0, then cut to the target length
    review_ids=[word2id.get(word,0) for word in review][:max_length]
    # right-pad short reviews with 0; the multiplier is 0 for full-length rows
    review_ids+=[0]*(max_length-len(review_ids))
    reviews_ids.append(review_ids)
  return reviews_ids

all_reviews=reviews_train+reviews_test#
# Count how often each token occurs across both splits.
counter=Counter(word for review in all_reviews for word in review)
# Keep words seen at least 10 times, in sorted order. '<pad>' is prepended
# AFTER filtering and sorting: it used to be counted as an ordinary word with
# frequency 1, so the >=10 filter removed it and index 0 ended up being a real
# word even though 0 is used as the padding / unknown id everywhere else.
frequent_words=sorted(word for word,frequency in counter.items() if frequency >=10 and word!='<pad>')
id2word=['<pad>']+frequent_words#a list convert wordid to word; index 0 is the padding token
word2id={ w : i for i,w in enumerate(id2word)}#dict convert word to its id

measured_average_length=get_average_length(all_reviews)#119 according to the original author's run
average_reviews_length=128   #average length is 119, we use 128 the nearest power of 2
print('average review length is ',average_reviews_length)

sentiment_train=binarySentiment(sentiments_train)#pos to 1,neg to 0
sentiment_test=binarySentiment(sentiments_test)

reviews_ids = reviews_to_ids(word2id,reviews_train)#convert all train reviews
voc_size = len(id2word)


In [0]:

# Sanity checks. Only a slice of the labels is shown: printing the complete
# label list floods the cell output with one entry per training review.
print(reviews_ids[0])
print(sentiment_train[:20])
print(voc_size)

### 2.1.2. Build Word Embeddings Model

In [0]:
# Please comment your code

# Hyperparameters and shared objects for the word-embedding (fasttext) model.
input_dim=128  # NOTE(review): not referenced anywhere else in the visible notebook
embedding_size=200 #we get 200dim fasttext embedding
output_dim=2#because it's a binary classification
num_of_epochs = 20
batch_size = 50
learning_rate = 0.001
# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Loss used by the training loops below.
criterion = nn.CrossEntropyLoss() 


class fasttext(nn.Module):
    """Bag-of-embeddings classifier used to learn word embeddings.

    Token ids are embedded, passed through a linear projection, averaged over
    the sequence dimension and classified by two linear layers.

    Args:
        embedding_size: dimensionality of the word embeddings.
        output_dim: number of output classes.
        vocab_size: number of rows in the embedding table. Defaults to the
            notebook-level ``voc_size`` (the original behaviour), so existing
            ``fasttext(embedding_size, output_dim)`` calls keep working.
    """

    def __init__(self,embedding_size,output_dim,vocab_size=None):
        super(fasttext, self).__init__()
        if vocab_size is None:
            vocab_size = voc_size  # fall back to the global vocabulary size
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.projection=nn.Linear(embedding_size,embedding_size)  # per-token linear projection
        self.linear1 = nn.Linear(embedding_size,32,bias=True)  # two simple linear layers
        self.linear2 = nn.Linear(32,output_dim,bias=False)

    def forward(self, inputs):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) logits."""
        embeds = self.embedding(inputs)          # (batch, seq_len, embedding_size)
        embeds = self.projection(embeds)         # (batch, seq_len, embedding_size)
        # Average over the sequence dimension to get one vector per review.
        embed_mean = torch.mean(embeds,dim=1)    # (batch, embedding_size)
        out = self.linear1(embed_mean)
        out = self.linear2(out)
        return out

# TensorDataset / DataLoader were only imported in a much later cell, so this
# cell raised NameError on a fresh kernel; import them where they are first used.
from torch.utils.data import DataLoader, TensorDataset

# Convert the training reviews to id tensors (this conversion used to run twice).
train_input=reviews_to_ids(word2id,reviews_train)
train_input=torch.tensor(train_input).long().to(device)#convert the id lists to a tensor

print(train_input.size())

train_label=torch.tensor(sentiment_train).to(device)#training targets as a tensor
print(train_label.size())

#generate batch from Dataloader,and TensorDataset
train_input_dataset=TensorDataset(train_input,train_label)#create dataset
# Use the configured batch_size instead of a second hard-coded value, and
# shuffle so a batch is not tied to the order of rows in the CSV file.
train_input_batch=DataLoader(train_input_dataset,batch_size=batch_size,shuffle=True)#create dataloader from dataset

    

### 2.1.3. Train Word Embeddings Model

In [0]:
# Please comment your code

model = fasttext(embedding_size,output_dim).to(device)#set model
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#set optimizer ,we use Adam,

for epoch in range(num_of_epochs):
    epoch_loss = 0#total loss in one epoch
    correct = 0#number of correct predictions so far in this epoch
    total_num = 0#number of samples seen so far in this epoch
    model.train()#set to the train mode
    for step,batch in enumerate(train_input_batch):
        batch_in_torch,batch_tar_torch=batch
        optimizer.zero_grad()#clear gradients left over from the previous step
        outputs = model(batch_in_torch)#get output
        loss = criterion(outputs, batch_tar_torch)#get loss
        loss.backward()
        optimizer.step()#Adam update
        epoch_loss += loss.item()
        correct += torch.sum(outputs.argmax(dim=1)==batch_tar_torch).item()
        # Count the samples of THIS batch. The original used len(labels),
        # a name that is not defined at this point in the notebook.
        total_num += batch_tar_torch.size(0)
        print("\r Epoch:%d, loss:%s, acc:%s"%(epoch+1,epoch_loss,correct*1.0/total_num),end="")
    print()

### 2.1.4. Save Word Embeddings Model

In [0]:
# Please comment your code

# Pull the trained embedding table out of the model and convert it to a NumPy array on the CPU.
fasttest_embedding = model.embedding.weight.data#get the embedding
trained_embeddings = fasttest_embedding.detach().cpu().numpy()#

# Persist the embedding matrix, the word -> id mapping it is indexed by, and the whole model object.
torch.save(trained_embeddings,"./fasttext_embedding.pkl")#save the embedding to local
torch.save(word2id,"./word2id.pkl")#and the corresponding word 
torch.save(model,'fasttext_model.pt')#save the model

In [0]:
# Shape of the saved embedding matrix: (vocabulary size, embedding dimension).
print(trained_embeddings.shape)#vocab num.,embedding dim

### 2.1.5. Load Word Embeddings Model

In [0]:
# Please comment your code

# Reload the artefacts written by the save cell above.
fasttext_embedding = torch.load('./fasttext_embedding.pkl')#load the embedding
word2id = torch.load('./word2id.pkl')#and the word
# Number of rows and row width of the loaded embedding matrix.
print(len(fasttext_embedding),len(fasttext_embedding[0]))

## 2.2. Character Embeddings

### 2.2.1. Data Preprocessing for Character Embeddings

In [0]:
# Please comment your code
# To get character embeddings we need a character-to-id table. The characters are lower case because the reviews have already been converted to lower case.
# Every word is converted to a one-hot tensor; for example, the word "ab" becomes [[1,0,0,0,0,...,0],[0,1,0,0,...,0]]. The shape of a word should be (word_length,1,37), where 37 = 26 letters + padding + the digits 0-9.

char_set=set()#every character that occurs in a vocabulary word
for word in id2word:
  char_set.update(word.lower())#collect the characters of this word

# '*' is the padding symbol and always gets id 0. The remaining characters are
# sorted: iterating a set of strings can yield a different order on every
# interpreter run, which would give the characters different ids each time and
# make a saved character model inconsistent with a freshly built table.
# '*' is excluded from the set so it cannot be listed twice.
id2char=['*']+sorted(char_set-{'*'})
print(id2char)
char2id={c:i for i,c in enumerate(id2char)}
dic_len=len(char2id)
print('char table have',dic_len)


# Character embeddings are only built for words in the word vocabulary.
seq_data = id2word

# Mean word length in characters, rounded down.
average_length=sum(len(word) for word in id2word)//len(id2word)
print("average word length is ",average_length)#7

# Bring every word to exactly average_length+1 characters (the +1 makes up
# for the floor division above): longer words keep only their first
# average_length+1 characters, shorter ones are left-padded with '*'.
padded_seq_data=[
  word[:average_length+1].rjust(average_length+1,'*')
  for word in seq_data
]
print(padded_seq_data)


def make_batch(seq_data):
  """Build next-character training pairs from padded words.

  For each sequence, every character except the last is one-hot encoded as
  the input and the id of the last character is the target. Rows that
  correspond to leading padding characters are set to all zeros.

  Reads the notebook-level ``char2id`` and ``dic_len``.

  Returns:
    (input_batch, target_batch): a list of (len(seq)-1, dic_len) arrays and a
    list of single-element ``[target_id]`` lists.
  """
  # Look the padding id up instead of hard-coding 26: '*' is the first entry
  # of id2char, so its id is 0 and the old `== 26` check counted an unrelated
  # character.
  pad_id=char2id['*']
  identity=np.eye(dic_len)
  input_batch = []
  target_batch = []
  for seq in seq_data:
    input_data = [char2id[n] for n in seq[:-1]]
    # Padding is prepended, so count the run of pad ids at the start.
    padd_num=0
    for char_id in input_data:
      if char_id!=pad_id:
        break
      padd_num+=1

    target = char2id[seq[-1]]
    single_one_hot=identity[input_data]  # fancy indexing returns a fresh copy
    single_one_hot[:padd_num]=0          # padding positions carry no signal
    input_batch.append(single_one_hot)
    target_batch.append([target])

  return input_batch, target_batch


### 2.2.2. Build Character Embeddings Model

In [0]:
# Please comment your code
# We choose a Bi-LSTM, which is strong on many tasks; such a small task does not warrant a deep network.
# The hidden dimension is 200 (2 directions * 100). This is a classification task, so we use cross-entropy loss,
# which works well for classification. We use Adam as the optimizer.

# Optimizer step size for the character model.
learning_rate=0.001

# Hidden size of the LSTM, per direction.
n_hidden=100
# number of inputs (dimension of the one-hot input vector) = size of the character table
n_input = dic_len
# number of classes = size of the character table (37 according to the original author)
n_class = dic_len
class Net(nn.Module):
    """A simple bidirectional LSTM classifier that predicts a word's last character.

    Reads the notebook-level ``n_input``, ``n_hidden`` and ``n_class``.
    """

    def __init__(self):
        super(Net, self).__init__()
        # No `dropout` argument: nn.LSTM applies dropout only between stacked
        # layers, so with the default single layer `dropout=0.2` had no effect
        # other than triggering a UserWarning.
        self.lstm = nn.LSTM(n_input, n_hidden, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(n_hidden * 2, n_class)

    def forward(self, sentence):
        """Return ``(log_probs, embedding)`` for a batch of one-hot sequences.

        ``log_probs`` is the log-softmax over the ``n_class`` characters;
        ``embedding`` is the softmax of the concatenated final hidden states
        and is what the notebook later stores as the character embedding.
        """
        # h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len.
        lstm_out, (h_n, c_n) = self.lstm(sentence)  # h_n is the hidden state, which carries the context information
        # concat the last hidden state from two direction
        hidden_out = torch.cat((h_n[0, :, :], h_n[1, :, :]), 1)  # merge the hidden states of both directions into one vector
        z = self.linear(hidden_out)  # linear projection onto the classes
        log_output = F.log_softmax(z, dim=1)  # log-probabilities used to compute the loss
        return log_output, F.softmax(hidden_out, dim=1)


# Move the model to GPU
# Instantiate the character model and move it to the selected device.
net = Net().to(device)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()# cross-entropy loss function
optimizer = optim.Adam(net.parameters(), lr=learning_rate)# Adam optimizer

# Preparing input
input_batch, target_batch = make_batch(padded_seq_data)


### 2.2.3. Train Character Embeddings Model

In [0]:
# Please comment your code
from sklearn.metrics import accuracy_score

learning_rate = 0.001
n_hidden = 100
total_epoch = 50
from torch.utils.data import DataLoader, TensorDataset  # DataLoader is used to split the data into batches

# Convert the inputs to tensors on the selected device. The per-word one-hot
# arrays are stacked into one ndarray first, because building a tensor from a
# Python list of ndarrays is very slow.
input_batch_torch = torch.tensor(np.array(input_batch)).float().to(device)
target_batch_torch = torch.tensor(target_batch).view(-1).to(device)

train_data=TensorDataset(input_batch_torch,target_batch_torch)
train_dataloader=DataLoader(train_data,batch_size=50)  # 50 words per batch
train_batches = [batch for batch in train_dataloader]

for epoch in range(total_epoch):
    epoch_loss = 0.0      # sum of per-sample evaluation losses
    epoch_correct = 0     # correctly predicted last characters
    epoch_seen = 0        # samples processed in this epoch
    for step,batch in enumerate(train_batches):
        # The batches are slices of tensors that already live on `device`;
        # the former `.to(device)` calls discarded their result and did nothing.
        input_mini_batch_torch,target_mini_batch_torch=batch

        # forward + backward + optimize
        net.train()
        optimizer.zero_grad()
        outputs,_ = net(input_mini_batch_torch)
        loss = criterion(outputs, target_mini_batch_torch)
        loss.backward()
        optimizer.step()

        # Re-score the same batch in eval mode; no_grad avoids building an
        # autograd graph for a pass that is only used for reporting.
        net.eval()
        with torch.no_grad():
            outputs,_= net(input_mini_batch_torch)
            loss = criterion(outputs, target_mini_batch_torch)
        _, predicted = torch.max(outputs, 1)
        batch_count = target_mini_batch_torch.size(0)
        epoch_loss += loss.item()*batch_count
        epoch_correct += (predicted==target_mini_batch_torch).sum().item()
        epoch_seen += batch_count

    # One line per epoch (mean over all batches) instead of one line per step.
    seen = max(epoch_seen, 1)
    print('Epoch: %d, loss: %.5f, train_acc: %.2f' % (epoch + 1, epoch_loss/seen, epoch_correct/seen))

print('Finished Training')

#get char embedding
net.eval()
with torch.no_grad():  # inference only: no gradients needed for the full vocabulary pass
    _,hidden_state = net(input_batch_torch)#the second output (softmax of the final hidden states) is used as the char embedding
hidden_state=hidden_state.cpu().numpy()



### 2.2.4. Save Character Embeddings Model

In [0]:

def save_embedding(hidden_state,id2word, file_name):
    """Write one embedding row per word to ``file_name`` in word2vec text format.

    The first line is ``"<number of words> <embedding width>"``; every
    following line is the word followed by its space-separated vector values.

    Args:
        hidden_state: indexable of embedding rows, aligned with ``id2word``.
        id2word: list of words; ``hidden_state[i]`` is the vector of ``id2word[i]``.
        file_name: path of the text file to create or overwrite.
    """
    embedding = hidden_state
    # `with` guarantees the file is flushed and closed (the original never
    # closed it); utf8 matches the encoding build_embedding() reads with.
    with open(file_name, 'w', encoding="utf8") as fout:
        fout.write('%d %d\n' % (len(id2word), len(embedding[0])))
        for wid, w in enumerate(id2word):
            values = ' '.join(str(x) for x in embedding[wid])
            fout.write('%s %s\n' % (w, values))
# Write the character embeddings as text and store the trained model object next to them.
save_embedding(hidden_state,id2word,'./char2vec_embedding.txt')
torch.save(net,'./char2vec_model.pt')


### 2.2.5. Load Character Embeddings Model

In [0]:
# Please comment your code
# Reload the character model object that was saved to ./char2vec_model.pt above.
char2vec_model=torch.load('./char2vec_model.pt')

## 2.3. Sequence model

### 2.3.1. Apply/Import Word Embedding and Character Embedding Model

In [0]:
#

#load fasttext embedding 
# Load the saved fasttext embedding matrix, convert it to a float tensor,
# and load the word -> id mapping that indexes its rows.
fasttext_embedding = torch.load('./fasttext_embedding.pkl')
fasttext_embedding=torch.tensor(fasttext_embedding).float()
word2id = torch.load('./word2id.pkl')

#load char2vec emebdding by hand from char2vec_embedding.txt
def build_embedding(id2word, file, dim):
    """Load a word2vec-style text file into a ``(len(id2word), dim)`` array.

    Rows start out uniformly random in [-1, 1), with row 0 set to zero. The
    first line of ``file`` (the header) is skipped; on every other line the
    last ``dim`` fields are the vector and everything before them, joined
    without separators, is the token. Rows of tokens found in ``id2word`` are
    overwritten with the vector from the file.
    """
    emb = np.random.uniform(-1, 1, (len(id2word), dim))
    emb[0] = 0 # <pad> should be all 0
    w2id = dict(zip(id2word, range(len(id2word))))  # word -> row index
    with open(file, encoding="utf8") as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                continue  # header line: "<rows> <dim>"
            elems = line.split()
            token = ''.join(elems[:-dim])
            row = w2id.get(token)
            if row is not None:
                emb[row] = [float(v) for v in elems[-dim:]]
    return emb

# Rebuild the character embedding matrix (width 200) from the text file and convert it to a float tensor.
char2vec_embedding=build_embedding(id2word,'./char2vec_embedding.txt',200)
char2vec_embedding=torch.tensor(char2vec_embedding).float()
print(char2vec_embedding.size())

#concat two embedding ,get a 400 dim embedding
# Column order is [character embedding | fasttext embedding]; both matrices must have the same number of rows.
embedding=torch.cat((char2vec_embedding,fasttext_embedding),dim=1)
print(embedding.size())


### 2.3.2. Build Sequence Model

*You are required to describe how hyperparameters were decided with justification of your decision.*

In [0]:
# The model uses the pretrained fastText embedding + character embedding, so the input dimension is fixed at 400; a dropout rate of 0.2 is used.
# The representation is then reduced to 32 dimensions and finally to 2 (pos or neg).


from sklearn.metrics import accuracy_score,recall_score,f1_score

class imdbClassifier(nn.Module):
    """Sentiment classifier: frozen pretrained embeddings -> 2-layer Bi-LSTM -> two linear layers.

    Args:
        embedding: pretrained embedding matrix of shape (vocab, dim).
        n_hidden: hidden size of the LSTM per direction.
    """

    def __init__(self,embedding,n_hidden = 128):
        super().__init__()
        pretrained = torch.Tensor(embedding)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.drop_out = nn.Dropout(0.2)
        # Width of one embedding row is the LSTM input size.
        n_input = embedding.shape[1]
        self.lstm = nn.LSTM(
            n_input,
            n_hidden,
            num_layers=2,
            batch_first =True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence n_hidden*2 input features.
        self.linear = nn.Linear(n_hidden*2 ,32)
        self.linear2 = nn.Linear(32 ,2)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of word ids to (batch, 2) logits."""
        embedded = self.embedding(x)
        sequence_out, _ = self.lstm(embedded)
        # Use the LSTM output at the last time step as the review representation.
        last_step = sequence_out[:, -1, :]
        hidden = self.linear(self.drop_out(last_step))
        return self.linear2(hidden)

# Build the sequence model on top of the concatenated embedding matrix, move it to the selected device, and show its layers.
imdb = imdbClassifier(embedding).to(device)
print(imdb)

### 2.3.3. Train Sequence Model

In [0]:

from torch.utils.data import DataLoader, TensorDataset  # DataLoader is used to split the data into batches


train_input=reviews_to_ids(word2id,reviews_train)#convert train and test reviews to id lists
test_input=reviews_to_ids(word2id,reviews_test)
train_input=torch.tensor(train_input).long().to(device)#convert the id lists to tensors
test_input=torch.tensor(test_input).long().to(device)
print(train_input.size())


train_label=torch.tensor(sentiment_train).to(device)#train and test targets as tensors
test_label=torch.tensor(sentiment_test).to(device)#
print(train_label.size())

#generate batch from Dataloader,and TensorDataset
train_input_dataset=TensorDataset(train_input,train_label)#create dataset
# Shuffle the TRAINING data every epoch so a batch is not tied to the order
# of rows in the CSV file; the test loader keeps a fixed order.
train_input_batch=DataLoader(train_input_dataset,batch_size=100,shuffle=True)#create dataloader from dataset
test_input_dataset=TensorDataset(test_input,test_label)
test_input_batch=DataLoader(test_input_dataset,batch_size=100,shuffle=False)


criterion = nn.CrossEntropyLoss()
learning_rate = 0.0004#
optimizer = optim.Adam(imdb.parameters(), lr=learning_rate)
batch_size=100
total_epoch=25#if set to 30 ,will over fit

best_state_dict = None
best_f1 = 0 #log the best f1
# (epoch number, test F1) per epoch; read by the plotting cell. It was never
# initialised before, so log.append raised NameError in the first epoch.
log = []


for epoch in range(total_epoch):
    epoch_loss = 0#total loss of a epoch
    correct = 0#correct training predictions so far in this epoch
    total = 0#training samples seen so far in this epoch
    imdb.train()
    for step,batch in enumerate(train_input_batch):#get a batch data
        optimizer.zero_grad()#clear gradients left over from the previous step
        input_batch_torch,target_batch_torch=batch
        outputs = imdb(input_batch_torch)#get outputs
        loss = criterion(outputs, target_batch_torch)
        loss.backward()
        optimizer.step()#Adam update
        predicted = torch.argmax(outputs,dim=1)#predicted class (0/1) per sample
        correct += torch.sum(predicted==target_batch_torch).item()
        # Count the real batch size: the last batch may be smaller than batch_size.
        total += target_batch_torch.size(0)
        epoch_loss+=loss.item()#total loss in this epoch
        acc = correct*1.0/total
        print('\rEpoch: %d, loss: %.2f, acc:%s' %(epoch + 1,epoch_loss, acc),end="")
    print()

    # ---- evaluation on the whole test set ----
    imdb.eval()
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for input_batch_torch,target_batch_torch in test_input_batch:
            outputs = imdb(input_batch_torch)
            # Accumulate over ALL batches; previously each batch overwrote the
            # previous one and the metrics covered only the last batch.
            predicted_labels.extend(torch.argmax(outputs,dim=1).cpu().tolist())
            true_labels.extend(target_batch_torch.cpu().tolist())

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = accuracy_score(true_labels,predicted_labels)
    recall = recall_score(true_labels,predicted_labels)
    current_f1 = f1_score(true_labels,predicted_labels)
    log.append((epoch+1,current_f1))#1-based epoch, matching the printed epoch number
    # NOTE(review): the best checkpoint is selected on the test set itself.
    if current_f1 > best_f1:#we only save the best model
        torch.save(imdb.state_dict(),"temp_imdb.pkl")
        best_state_dict = torch.load("temp_imdb.pkl")
        best_f1 = current_f1

    print("epoch: %s, test f1: %s"%(epoch+1,current_f1))
print('Finished Training')

### 2.3.4. Save Sequence Model

In [0]:
# Please comment your code
# Persist the state dict captured at the epoch with the best test F1 in the training loop.
torch.save(best_state_dict,"./imdb.pkl")

### 2.3.5. Load Sequence Model

In [0]:
# Please comment your code

# Re-create the model architecture, then restore the saved weights into it.
imdb = imdbClassifier(embedding).to(device)
imdb.load_state_dict(torch.load("./imdb.pkl"))

# 3 - Evaluation

(*Please show your empirical evidence*)

## 3.1. Performance Evaluation


You are required to provide the table with precision, recall, f1 of test set.

In [0]:
# Please comment your code
from sklearn.metrics import precision_score

imdb.eval()
predicted_labels = []
true_labels = []
# Evaluate the WHOLE test set in mini-batches. The previous version pushed
# 1000 samples through in one call "to avoid crashing the GPU", overwrote the
# global test_label with that slice, and read labels from a stale
# `target_batch` variable.
with torch.no_grad():
    for input_batch_torch,target_batch_torch in test_input_batch:
        outputs = imdb(input_batch_torch)
        predicted_labels.extend(torch.argmax(outputs,dim=1).cpu().tolist())
        true_labels.extend(target_batch_torch.cpu().tolist())

# scikit-learn metrics take (y_true, y_pred) in that order.
acc = accuracy_score(true_labels,predicted_labels)
# The value printed as "precision" used to be the accuracy; compute real precision.
precision = precision_score(true_labels,predicted_labels)
recall = recall_score(true_labels,predicted_labels)
f1 = f1_score(true_labels,predicted_labels)
print("precision:%s, recall:%s, f1:%s"%(precision,recall,f1))

## 3.2. Hyperparameter Testing
*You are required to draw a graph(y-axis: f1, x-axis: epoch) for test set and explain the optimal number of epochs based on the learning rate you have already chosen.*

In [0]:
# Please comment your code
#as is shown in the graph the best model is when epoch between 10-15, the f1 is 0.9 .very good!!
import matplotlib.pyplot as plt
# `log` holds one (epoch, f1) pair per training epoch; split it into two series.
epochs,f1 = zip(*log)
# Same figure as before, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(epochs,f1)
ax.set_title(" Hyperparameter Testing  lol")
ax.set_xlabel("epoch")
ax.set_ylabel("f1")
plt.show()

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed* 

In [0]:
# If you used OOP style, use this section