## Preparing the Data 

In [1]:
import nltk
from nltk.corpus import movie_reviews
from collections import Counter

nltk.download('movie_reviews')

pos_reviews = []
neg_reviews = []

for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
        doc = movie_reviews.words(fileid)
        if label == 'pos':
            pos_reviews.append(" ".join(doc))
        else:
            neg_reviews.append(" ".join(doc))
            
train_pos_reviews = pos_reviews[:900]
train_neg_reviews = neg_reviews[:900]
test_pos_reviews = pos_reviews[900:]
test_neg_reviews = neg_reviews[900:]

print("Number of training data %d (POS) %d (NEG)" % (
    len(train_pos_reviews), len(train_neg_reviews)) )
print("Number of test data %d (POS) %d (NEG)" % (
    len(test_pos_reviews), len(test_neg_reviews)) )

[nltk_data] Downloading package movie_reviews to C:\Users\Guan-Ting
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Number of training data 900 (POS) 900 (NEG)
Number of test data 100 (POS) 100 (NEG)


In [2]:
train_pos_reviews[0]

'films adapted from comic books have had plenty of success , whether they \' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there \' s never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \' 80s with a 12 - part series called the watchmen . to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . in other words , don \' t dismiss this film because of its source . if you can get past the whole comic book thing , you might find another stumbling block in from hell \' s directors , albert and allen hughes . getting the hughes brothers to direct this seems

In [3]:
train_X = []
train_Y = []
test_X = []
test_Y = []

for x in train_pos_reviews:
    train_X.append(x)
    train_Y.append(1)
for x in train_neg_reviews:
    train_X.append(x)
    train_Y.append(0)
    
for x in test_pos_reviews:
    test_X.append(x)
    test_Y.append(1)
for x in test_neg_reviews:
    test_X.append(x)
    test_Y.append(0)

# data clean 

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_list = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to C:\Users\Guan-Ting
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def clean(x):
    # remove conjunction and prepositions
    filter_tags = ['CC', 'IN']
    tokens = word_tokenize(x)
    pos_token = nltk.pos_tag(tokens)
    cleaned_tokens = []
    for (tok, pos) in pos_token:
        #remove stopword and keep english letters
        if tok.isalpha() & (tok not in stopword_list) & (pos not in filter_tags):
            tok_lower = tok.lower()
            #steming each word
            cleaned_tokens.append(snowball_stemmer.stem(tok_lower))
    return " ".join(cleaned_tokens)

In [6]:
# clean our training data
cleaned_train_X = []
cleaned_test_X = []
for x in train_X:
    cleaned_train_X.append(clean(x))
for x in test_X:
    cleaned_test_X.append(clean(x))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# use CountVectorizer to calculate the frequency of each word in data 

In [8]:
feature_extractor = CountVectorizer(max_features = 1500, ngram_range=(1, 3))
training_vectors = feature_extractor.fit_transform(cleaned_train_X)
test_vectors = feature_extractor.transform(cleaned_test_X)

```
(i, j) f = the j-th word in word vocabulary occur f times in the i-th sentence
```

In [9]:
print(training_vectors[0])

  (0, 494)	6
  (0, 20)	1
  (0, 244)	5
  (0, 142)	4
  (0, 991)	1
  (0, 1290)	1
  (0, 109)	1
  (0, 1233)	1
  (0, 727)	1
  (0, 302)	1
  (0, 559)	1
  (0, 1478)	2
  (0, 897)	2
  (0, 1061)	2
  (0, 610)	5
  (0, 290)	1
  (0, 36)	1
  (0, 868)	3
  (0, 397)	1
  (0, 178)	3
  (0, 158)	1
  (0, 1453)	2
  (0, 898)	1
  (0, 769)	1
  (0, 958)	1
  :	:
  (0, 595)	1
  (0, 156)	1
  (0, 4)	2
  (0, 709)	1
  (0, 1153)	1
  (0, 580)	1
  (0, 1300)	1
  (0, 1109)	1
  (0, 121)	1
  (0, 937)	1
  (0, 875)	1
  (0, 657)	1
  (0, 86)	1
  (0, 17)	1
  (0, 592)	1
  (0, 100)	1
  (0, 641)	1
  (0, 574)	1
  (0, 1174)	1
  (0, 744)	1
  (0, 385)	1
  (0, 269)	1
  (0, 245)	3
  (0, 809)	1
  (0, 505)	1


## import package

In [10]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

## model 1 : LightGBM

In [11]:
import numpy as np
import lightgbm as lgbm

In [12]:
#define metric f-score to monitor LightGBM model
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat)
    return 'f1', f1_score(y_true, y_hat), True

In [13]:
params = {
    'objective' :'binary',
    'learning_rate' : 0.05,
    'num_leaves' : 100,
    'feature_fraction': 0.7, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'auc'
}

In [14]:
train = lgbm.Dataset(training_vectors.astype(float), train_Y)
valid = lgbm.Dataset(test_vectors.astype(float), test_Y)

# training with early stop
clf = lgbm.train(params, train, num_boost_round=5000, valid_sets=[valid], verbose_eval=50, feval=lgb_f1_score, early_stopping_rounds=500)

Training until validation scores don't improve for 500 rounds.
[50]	valid_0's auc: 0.9018	valid_0's f1: 0.830918
[100]	valid_0's auc: 0.918	valid_0's f1: 0.84058
[150]	valid_0's auc: 0.9265	valid_0's f1: 0.870813
[200]	valid_0's auc: 0.9263	valid_0's f1: 0.869565
[250]	valid_0's auc: 0.9295	valid_0's f1: 0.878049
[300]	valid_0's auc: 0.9305	valid_0's f1: 0.883495
[350]	valid_0's auc: 0.9324	valid_0's f1: 0.872549
[400]	valid_0's auc: 0.9317	valid_0's f1: 0.872549
[450]	valid_0's auc: 0.9316	valid_0's f1: 0.866995
[500]	valid_0's auc: 0.9326	valid_0's f1: 0.871287
[550]	valid_0's auc: 0.9329	valid_0's f1: 0.878049
[600]	valid_0's auc: 0.9331	valid_0's f1: 0.887805
[650]	valid_0's auc: 0.9333	valid_0's f1: 0.887805
[700]	valid_0's auc: 0.9318	valid_0's f1: 0.882353
[750]	valid_0's auc: 0.9306	valid_0's f1: 0.878049
[800]	valid_0's auc: 0.9313	valid_0's f1: 0.878049
Early stopping, best iteration is:
[327]	valid_0's auc: 0.9317	valid_0's f1: 0.888889


## evaluate F-score: 0.884859

In [15]:
#define probability threshold
THRESHOLD = 0.5
#################################################################
test_prob_Y = clf.predict(test_vectors.astype(float))
pred_Y = [1 if i>THRESHOLD else 0 for i in test_prob_Y]
accuracy = accuracy_score(test_Y, pred_Y)
precision = precision_score(test_Y, pred_Y, average='macro')
recall = recall_score(test_Y, pred_Y, average='macro')
fscore = f1_score(test_Y, pred_Y, average='macro')

print("Accuracy: %g\tPrecision: %g\tRecall: %g\tF-score: %g" % (
    accuracy, precision, recall, fscore))

Accuracy: 0.885	Precision: 0.886896	Recall: 0.885	F-score: 0.884859


## model2 Logistic  Regression

In [18]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty='elasticnet', solver='saga', C=0.6, max_iter=500, l1_ratio=0.2)
clf = clf.fit(training_vectors, train_Y)

pred_Y = clf.predict(test_vectors)

## evaluate  F-score: 0.834996

In [19]:
accuracy = accuracy_score(test_Y, pred_Y)
precision = precision_score(test_Y, pred_Y, average='macro')
recall = recall_score(test_Y, pred_Y, average='macro')
fscore = f1_score(test_Y, pred_Y, average='macro')

print("Accuracy: %g\tPrecision: %g\tRecall: %g\tF-score: %g" % (
    accuracy, precision, recall, fscore))

Accuracy: 0.835	Precision: 0.835034	Recall: 0.835	F-score: 0.834996
