In [None]:
import os
import sys
import random
import time
import psutil
import math
import pytz
from contextlib import contextmanager
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re, string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

@contextmanager
def trace(trace_msg):
    """Report elapsed time and process-memory change for a ``with`` body.

    When the block exits, one line is printed to stdout in the form
    ``[<mem>GB(<sign><delta>GB):<elapsed>sec] <trace_msg> 【<timestamp>】``
    where ``<mem>`` is the process memory after the body, ``<delta>`` is the
    change relative to the value sampled before the body, and the timestamp
    is the current time in the ``Asia/Shanghai`` timezone.

    The report is produced in a ``finally`` clause, so it is still printed
    when the wrapped body raises; the exception then propagates unchanged.

    Args:
        trace_msg: Label for the traced section; converted with ``str()``.
    """
    process = psutil.Process(os.getpid())
    t0 = time.time()
    m0 = process.memory_info().rss / 2. ** 30  # bytes -> GiB
    try:
        yield
    finally:
        # Capture the measurements first so formatting the report is not timed.
        elapsed = time.time() - t0
        m1 = process.memory_info().rss / 2. ** 30
        delta = m1 - m0
        sign = '+' if delta >= 0 else '-'
        delta = abs(delta)
        trace_msg = str(trace_msg)

        tz = pytz.timezone('Asia/Shanghai')
        dt_str = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{m1:.3f}GB({sign}{delta:.3f}GB):{elapsed:.3f}sec] {trace_msg} 【{dt_str}】", file=sys.stdout)
    
# ---- Run configuration ----
RANDOM_SEED = 42
DEBUG_RUN = True  # when True, the split cell subsamples each split for a quick run

# Seed both Python's and NumPy's global random generators.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Reference point for the total-runtime message printed by the last cell.
global_start_t = time.time()
print('ok')

In [None]:
# Load the raw reviews and encode the sentiment label as 1 (positive) / 0 (negative).
imdb_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)
imdb_data = imdb_data.drop_duplicates()
print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Draw the working subset with an explicit seed so that re-running this cell
# always yields the same rows in the same order (an unseeded sample() consumes
# the global NumPy RNG and changes on every re-run). sample() already returns
# the rows in random order, so no separate shuffle pass is needed.
imdb_data = imdb_data.sample(30000, random_state=RANDOM_SEED).reset_index(drop=True)
print('after sample, imdb_data.shape: ', imdb_data.shape)

imdb_data.head(5)

In [None]:
# Sizes of the three consecutive, non-overlapping slices taken from the
# (already shuffled) imdb_data frame: test first, then validation, then train.
TEST_NUM = 5000
VALID_NUM = 5000
TRAIN_NUM = 15000

imdb_data_test = imdb_data.iloc[:TEST_NUM]
imdb_data_valid = imdb_data.iloc[TEST_NUM:TEST_NUM + VALID_NUM]
imdb_data_train = imdb_data.iloc[TEST_NUM + VALID_NUM:TEST_NUM + VALID_NUM + TRAIN_NUM]

if DEBUG_RUN:
    # Quick-run mode: shrink every split. The subsamples are seeded so that
    # re-running this cell cannot silently change which rows are in each split
    # while a previously fitted model is still in memory.
    SAMPLE_NUM = 300
    imdb_data_test = imdb_data_test.sample(SAMPLE_NUM, random_state=RANDOM_SEED)
    imdb_data_valid = imdb_data_valid.sample(SAMPLE_NUM, random_state=RANDOM_SEED)
    imdb_data_train = imdb_data_train.sample(2*SAMPLE_NUM, random_state=RANDOM_SEED)

print(f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
      f'imdb_data_test.shape: {imdb_data_test.shape}')

########### If the text is Chinese, enable the code below ########
#============================================================
# import jieba as jb
# for df in (imdb_data_train, imdb_data_valid, imdb_data_test):
#     df['review'] = df['review'].apply(lambda x: " ".join([w for w in list(jb.cut(x))]))
#============================================================

imdb_data_test.head(5)

In [None]:
# Training reviews and their 0/1 sentiment labels as plain lists.
train_texts = list(imdb_data_train['review'].values)
train_labels = list(imdb_data_train['sentiment'].values)

# Baseline: 1- to 3-gram count features fed into a logistic regression.
with trace('baseline model train'):
    baseline_model = make_pipeline(
        CountVectorizer(ngram_range=(1, 3)),
        # TfidfTransformer(),  # For LR, TF-IDF usually performs worse; uncomment this line to try it!
        LogisticRegression(),
    ).fit(train_texts, train_labels)

In [None]:
# Validation reviews and labels as plain lists.
valid_texts = list(imdb_data_valid['review'].values)
valid_labels = list(imdb_data_valid['sentiment'].values)

# Time (and memory-trace) only the prediction step.
with trace('baseline model valid predict'):
    baseline_predicted = baseline_model.predict(valid_texts)

# Classification report for the validation split, printed with 4 decimals.
print(classification_report(y_true=valid_labels, y_pred=baseline_predicted, digits=4))

In [None]:
# Held-out test reviews and labels as plain lists.
test_texts = list(imdb_data_test['review'].values)
test_labels = list(imdb_data_test['sentiment'].values)

# Time (and memory-trace) only the prediction step.
with trace('baseline model test predict'):
    baseline_predicted = baseline_model.predict(test_texts)

# Classification report for the test split, then the total runtime measured
# from global_start_t.
print(classification_report(y_true=test_labels, y_pred=baseline_predicted, digits=4))
print('finished, total cost time: ', time.time() - global_start_t)