In [10]:
import re 
import os
from collections import Counter, OrderedDict
import math
from sklearn import metrics

In [16]:
def clean(raw_review):  
    raw_review = raw_review.lower();
    raw_review = re.sub(r"<br|/>", " ", raw_review)
    review = re.sub(r",|\(|\)|\[|\]|<|>|-|!|\?|\"|â€œ|:|\*|\.|#", " ", raw_review)
    return review

def read_dir(dir_path):
    os.chdir(dir_path)
    review_list = []
    for file in os.listdir():
        if file.endswith(".txt"):
            file_path = f"{dir_path}/{file}"
            with open(file_path, 'r') as f:
                s = f.read()
                #print(s)
                s = clean(s)
                #print(s)
                review_list.append(s)
    return review_list

In [17]:
train_pos_dir = "/Users/zhitaoliang/Documents/study/LP3/TME286/A1/aclImdb/train/pos"
train_neg_dir = "/Users/zhitaoliang/Documents/study/LP3/TME286/A1/aclImdb/train/neg"
train_pos_list = read_dir(train_pos_dir)
train_neg_list = read_dir(train_neg_dir)

test_pos_dir = "/Users/zhitaoliang/Documents/study/LP3/TME286/A1/aclImdb/test/pos"
test_neg_dir = "/Users/zhitaoliang/Documents/study/LP3/TME286/A1/aclImdb/test/neg"
test_pos_list = read_dir(test_pos_dir)
test_neg_list = read_dir(test_neg_dir)

print(len(train_pos_list), len(train_neg_list), len(test_pos_list), len(test_neg_list))

12500 12500 12500 12500


In [29]:
class Classifier:
    token_with_logdis_dic = {}

    def tokenize_and_count(self, review_list):
        token_list = []
        for review in review_list:
            token_list += review.split()
        token_counter = Counter(token_list)
        return token_counter

    def cal_gamma_of_token(self, pos_token_counter, neg_token_counter):
        self.token_with_logdis_dic = {}
        for token in pos_token_counter:
            if(neg_token_counter[token] == 0):
                continue
            pos_count = pos_token_counter[token]
            neg_count = neg_token_counter[token]
            self.token_with_logdis_dic[token] = math.log10(float(pos_count/neg_count))
        output_file = "/Users/zhitaoliang/Documents/study/LP3/TME286/A1/TokenList.txt"
        sort_by_value = sorted(self.token_with_logdis_dic.items(), key=lambda x:x[1])
        print(sort_by_value[0:30])
        print(sort_by_value[len(sort_by_value)-31:len(sort_by_value)-1])
        self.token_with_logdis_dic = OrderedDict(sorted(self.token_with_logdis_dic.items()))
        
        f= open(output_file,"x")
        f= open(output_file,"w+")
        for token in self.token_with_logdis_dic:
            f.write(token + "\t" + str(self.token_with_logdis_dic[token]) + "\n")
        f.close()

    def fit(self, pos_review_list, neg_review_list):
        pos_counter = self.tokenize_and_count(pos_review_list)
        neg_counter = self.tokenize_and_count(neg_review_list)
        self.cal_gamma_of_token(pos_counter, neg_counter)
    
    def classify_review(self, review):
        token_seq = review.split()
        sum_gamma = 0
        predicted_label = -1
        for token in token_seq:
            if token in self.token_with_logdis_dic:
                sum_gamma += self.token_with_logdis_dic[token]
        if sum_gamma > 0:
            predicted_label = 1
        else:
            predicted_label = 0
        return predicted_label
    
    def classify(self, review_list):
        pre_label_list = []
        for review in review_list:
            predicted_label = self.classify_review(review)
            pre_label_list.append(predicted_label)
        return pre_label_list
    
    def score(self, predicted_labels, actual_labels):
        #precision = metrics.precision_score(actual_labels, predicted_labels)
        #recall = metrics.recall_score(actual_labels, predicted_labels)
        accuracy = metrics.accuracy_score(actual_labels, predicted_labels)
        #f1score = metrics.f1_score(actual_labels, predicted_labels)
        classification_report = metrics.classification_report(actual_labels, predicted_labels)
        
        return accuracy, classification_report

In [31]:
movie_review_classifier = Classifier()
# fit classifier
movie_review_classifier.fit(train_pos_list, train_neg_list)
# classify
predicted_train_labels = movie_review_classifier.classify(train_pos_list) + movie_review_classifier.classify(train_neg_list)
predicted_test_labels = movie_review_classifier.classify(test_pos_list) + movie_review_classifier.classify(test_neg_list)

actual_train_labels = [1]*len(train_pos_list) + [0]*len(train_neg_list)
actual_test_labels = [1]*len(test_pos_list) + [0]*len(test_neg_list)

print(len(predicted_train_labels), len(predicted_test_labels), len(actual_train_labels), len(actual_test_labels))
# score
train_score = movie_review_classifier.score(predicted_train_labels, actual_train_labels)
test_score = movie_review_classifier.score(predicted_test_labels, actual_test_labels)


[('boll', -2.1072099696478683), ('2/10', -2.093421685162235), ('uwe', -2.0), ('beowulf', -1.7481880270062005), ('ajay', -1.662757831681574), ('seagal', -1.656417653650555), ('wayans', -1.6532125137753437), ('4/10', -1.6384892569546374), ('scarecrows', -1.6334684555795866), ('dahmer', -1.6232492903979006), ('awfulness', -1.5797835966168101), ('grendel', -1.568201724066995), ('steaming', -1.568201724066995), ('3/10', -1.5314789170422551), ('segal', -1.5314789170422551), ('deathstalker', -1.5185139398778875), ('interminable', -1.4771212547196624), ('forwarding', -1.462397997898956), ('sabretooth', -1.4471580313422192), ('gamera', -1.4313637641589874), ('picker', -1.414973347970818), ('dreck', -1.414973347970818), ('devgan', -1.414973347970818), ('unwatchable', -1.41077723337721), ('stinker', -1.3847117429382825), ('razzie', -1.3802112417116061), ('nada', -1.3802112417116061), ('mst3k', -1.3679767852945943), ('nostril', -1.3617278360175928), ('demi', -1.3424226808222062)]
[("tony's", 1.414

In [32]:
print("Training set result\nAccuracy:", train_score[0])
print(train_score[1])
print("\nTest set result\nAccuracy:", test_score[0])
print(test_score[1])

Training set result
Accuracy: 0.87808
              precision    recall  f1-score   support

           0       0.90      0.85      0.87     12500
           1       0.86      0.91      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


Test set result
Accuracy: 0.8272
              precision    recall  f1-score   support

           0       0.85      0.80      0.82     12500
           1       0.81      0.85      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

