# preprocess.py

In [None]:
#coding:utf-8
import re
import sys
from utils import write_status
from nltk.stem.porter import PorterStemmer

In [None]:
def preprocess_word(word):
    """Normalise a single token.

    Leading/trailing punctuation is trimmed (the corpus is English, so only
    ASCII punctuation is handled), every run of a repeated character is
    squeezed down to exactly two, and hyphens and apostrophes are removed.
    """
    trimmed = word.strip('\'"?!,.():;')
    # English tweets stretch letters for emphasis ("funnnnny"); cap each run
    # of the same character at two so that it becomes "funny".
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    # Drop every '-' and "'" that is left inside the token.
    return re.sub(r'(-|\')', '', squeezed)

In [None]:
def is_valid_word(word):
    """Tell whether ``word`` looks like a usable token.

    A valid word begins with an ASCII letter and continues with ASCII
    letters, digits, dots or underscores only.
    """
    matched = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)
    return matched is not None

In [None]:
def handle_emojis(tweet):
    """Replace ASCII emoticons with the marker tokens EMO_POS / EMO_NEG.

    Only simple character emoticons are recognised. Each match is replaced by
    the marker surrounded by single spaces; the rules are applied in the
    order listed below.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, marker in emoticon_rules:
        tweet = re.sub(pattern, marker, tweet)
    return tweet

In [None]:
def preprocess_tweet(tweet):
    """Clean one raw tweet and return it as a space-joined string of tokens.

    The text is lower-cased, URLs / user mentions are replaced by marker
    words, hashtags lose their '#', retweet markers and runs of dots are
    removed, emoticons are mapped to EMO_POS / EMO_NEG, and every remaining
    token is normalised with preprocess_word and kept only if is_valid_word
    accepts it. When the module-level flag ``use_stemmer`` is true, each kept
    token is additionally stemmed with the module-level ``porter_stemmer``.

    NOTE(review): the text is lower-cased before handle_emojis runs, while
    that function's laugh/wink patterns spell an upper-case 'D'; emoticons
    such as ':D' therefore look unmatchable here -- confirm whether that is
    intended.
    """
    text = tweet.lower()

    substitutions = (
        # URLs become the marker word 'URL'.
        (r'((www\.[\S]+)|(https?://[\S]+))', ' URL '),
        # @handles become the marker word 'USER_MENTION'.
        (r'@[\S]+', 'USER_MENTION'),
        # Hashtags keep their text but lose the leading '#'.
        (r'#(\S+)', r' \1 '),
        # Drop the retweet marker.
        (r'\brt\b', ''),
        # Two or more consecutive dots are replaced by one space.
        (r'\.{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Trim surrounding spaces and quote characters.
    text = text.strip(' "\'')
    # Map character emoticons to EMO_POS / EMO_NEG.
    text = handle_emojis(text)
    # Collapse whitespace runs to a single space.
    text = re.sub(r'\s+', ' ', text)

    kept = []
    for token in text.split():
        token = preprocess_word(token)
        if not is_valid_word(token):
            continue
        if use_stemmer:
            # Stemming maps inflected forms to one stem,
            # e.g. excited / exciting -> excit.
            token = str(porter_stemmer.stem(token))
        kept.append(token)

    return ' '.join(kept)

In [None]:
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    """Preprocess every tweet of a CSV file and write the result to a new file.

    Input rows are parsed by hand (no CSV library): ``<id>,<label>,<tweet>``
    for a training file, ``<id>,<tweet>`` when ``test_file`` is true. Only the
    first one or two commas act as separators, so the tweet text itself may
    contain commas. Output rows have the same layout with the tweet replaced
    by ``preprocess_tweet(tweet)``.

    Args:
        csv_file_name: path of the raw input file.
        processed_file_name: path of the file to write.
        test_file: set to True for a test set, whose rows carry no label;
            no label is written in that case.

    Returns:
        ``processed_file_name``.

    Raises:
        ValueError: if a training row's label field is not an integer.
    """
    # Read the input first so a missing input file does not leave behind a
    # freshly truncated output file.
    with open(csv_file_name, 'r') as source:
        lines = source.readlines()
    total = len(lines)

    # The context manager closes the output even when a row fails to parse.
    with open(processed_file_name, 'w') as save_to_file:
        for i, line in enumerate(lines):
            tweet_id, _, remainder = line.partition(',')
            if test_file:
                processed_tweet = preprocess_tweet(remainder)
                save_to_file.write('%s,%s\n' %
                                   (tweet_id, processed_tweet))
            else:
                label, _, remainder = remainder.partition(',')
                positive = int(label)
                processed_tweet = preprocess_tweet(remainder)
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, positive, processed_tweet))
            write_status(i + 1, total)

    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name

In [None]:
# Script entry: preprocess the training CSV found under ./dataset/.
path = './dataset/'
csv_file_name = 'train'

# Switch to True to stem every word; the output then gets its own file name.
use_stemmer = False

if use_stemmer:
    porter_stemmer = PorterStemmer()
    processed_file_name = path + csv_file_name + '_processed_stemmed.csv'
else:
    processed_file_name = path + csv_file_name + '_processed.csv'

preprocess_csv(path + csv_file_name + '.csv', processed_file_name, test_file=False)

# stats.py


In [None]:
#coding:utf-8
from nltk import FreqDist
import pickle
import sys
from utils import write_status
from collections import Counter

In [None]:
# Takes in a preprocessed CSV file and computes statistics over it.
# Writes the frequency distributions of words (unigrams) and bigrams
# to pickle files.


def analyze_tweet(tweet):
    """Collect per-tweet counts for one preprocessed tweet.

    Returns a tuple ``(result, words, bigrams)`` where ``result`` is a dict
    with the keys MENTIONS, URLS, POS_EMOS, NEG_EMOS, WORDS and BIGRAMS,
    ``words`` is the token list and ``bigrams`` the list produced by
    ``get_bigrams(words)``.
    """
    # Words are counted after the USER_MENTION and URL markers are removed;
    # the emoticon markers stay in and therefore count as words.
    without_markers = tweet.replace('USER_MENTION', '').replace('URL', '')
    words = without_markers.split()
    bigrams = get_bigrams(words)

    # Marker occurrences are counted on the tweet as it was passed in.
    result = {
        'MENTIONS': tweet.count('USER_MENTION'),
        'URLS': tweet.count('URL'),
        'POS_EMOS': tweet.count('EMO_POS'),
        'NEG_EMOS': tweet.count('EMO_NEG'),
        'WORDS': len(words),
        'BIGRAMS': len(bigrams),
    }
    return result, words, bigrams

In [None]:
def get_bigrams(tweet_words):
    """Return the adjacent word pairs of ``tweet_words`` as a list of tuples.

    Fewer than two words yield an empty list.
    """
    # Pairing the sequence with itself shifted by one gives every
    # (word, next_word) pair; unlike an xrange index loop this also runs on
    # Python 3.
    return list(zip(tweet_words, tweet_words[1:]))

In [None]:
def get_bigram_freqdist(bigrams):
    """Count how often each bigram occurs.

    Args:
        bigrams: iterable of hashable bigrams (tuples of two words).

    Returns:
        A ``collections.Counter`` mapping each bigram to its frequency.
    """
    # Counter tallies an iterable directly; no manual dict bookkeeping needed.
    return Counter(bigrams)

In [None]:
# Initialise the running totals that the statistics pass fills in.

# Tweet counts.
num_tweets = num_pos_tweets = num_neg_tweets = 0

# User mentions.
num_mentions = max_mentions = 0

# Emoticon markers.
num_emojis = num_pos_emojis = num_neg_emojis = max_emojis = 0

# URLs.
num_urls = max_urls = 0

# Words; min_words starts at a large sentinel so the first tweet lowers it.
num_words = num_unique_words = max_words = 0
min_words = 1e6

# Bigrams.
num_bigrams = num_unique_bigrams = 0

# Every word / bigram seen, in reading order (two independent lists).
all_words = []
all_bigrams = []

In [None]:
# Read the preprocessed tweets row by row and accumulate the statistics.
preprocessed_file_name = './dataset/train_processed.csv'
unique_words_file_name = './dataset/train_processed_unique.txt'
pkl_file_name = './dataset/train_processed_freqdist.pkl'
bi_pkl_file_name = './dataset/train_processed_freqdist_bi.pkl'

with open(preprocessed_file_name, 'r') as csv:
    lines = csv.readlines()
    num_tweets = len(lines)
    for i, line in enumerate(lines):
        # Rows are "<id>,<label>,<tweet>"; maxsplit=2 keeps the tweet in one
        # piece even if it should ever contain a comma.
        t_id, if_pos, tweet = line.strip().split(',', 2)
        if_pos = int(if_pos)
        if if_pos:
            num_pos_tweets += 1
        else:
            num_neg_tweets += 1
        result, words, bigrams = analyze_tweet(tweet)
        num_mentions += result['MENTIONS']
        max_mentions = max(max_mentions, result['MENTIONS'])
        num_pos_emojis += result['POS_EMOS']
        num_neg_emojis += result['NEG_EMOS']
        max_emojis = max(
            max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
        num_urls += result['URLS']
        max_urls = max(max_urls, result['URLS'])
        num_words += result['WORDS']
        min_words = min(min_words, result['WORDS'])
        max_words = max(max_words, result['WORDS'])
        all_words.extend(words)
        num_bigrams += result['BIGRAMS']
        all_bigrams.extend(bigrams)
        write_status(i + 1, num_tweets)
num_emojis = num_pos_emojis + num_neg_emojis
unique_words = list(set(all_words))

# One unique word per line.
with open(unique_words_file_name, 'w') as uwf:
    uwf.write('\n'.join(unique_words))
num_unique_words = len(unique_words)
num_unique_bigrams = len(set(all_bigrams))
print('\nCalculating frequency distribution')

# Unigrams
freq_dist = FreqDist(all_words)

with open(pkl_file_name, 'wb') as pkl_file:
    pickle.dump(freq_dist, pkl_file)
print('Saved uni-frequency distribution to %s' % pkl_file_name)

# Bigrams
bigram_freq_dist = get_bigram_freqdist(all_bigrams)

with open(bi_pkl_file_name, 'wb') as pkl_file:
    pickle.dump(bigram_freq_dist, pkl_file)
print('Saved bi-frequency distribution to %s' % bi_pkl_file_name)

# Averages are per tweet. With an empty input file, report zeros instead of
# raising ZeroDivisionError (and do not print the min_words sentinel).
if num_tweets:
    tweet_count = float(num_tweets)
else:
    tweet_count = 1.0
    min_words = 0

# Each print takes a single pre-formatted string, which behaves the same on
# Python 2 and Python 3.
print('\n[Analysis Statistics]')
print('Tweets => Total: %d, Positive: %d, Negative: %d' % (
    num_tweets, num_pos_tweets, num_neg_tweets))
print('User Mentions => Total: %d, Avg: %.4f, Max: %d' % (
    num_mentions, num_mentions / tweet_count, max_mentions))
print('URLs => Total: %d, Avg: %.4f, Max: %d' % (
    num_urls, num_urls / tweet_count, max_urls))
print('Emojis => Total: %d, Positive: %d, Negative: %d, Avg: %.4f, Max: %d' % (
    num_emojis, num_pos_emojis, num_neg_emojis, num_emojis / tweet_count, max_emojis))
print('Words => Total: %d, Unique: %d, Avg: %.4f, Max: %d, Min: %d' % (
    num_words, num_unique_words, num_words / tweet_count, max_words, min_words))
print('Bigrams => Total: %d, Unique: %d, Avg: %.4f' % (
    num_bigrams, num_unique_bigrams, num_bigrams / tweet_count))

# LSTM.py

In [None]:
#coding:utf-8
import numpy as np
import sys
import os
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM
import utils
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Performs classification using an LSTM network.

# Width of each word-embedding vector.
dim = 200

# Processed tweet files.
TRAIN_PROCESSED_FILE = './dataset/train_processed.csv'
TEST_PROCESSED_FILE = './dataset/test_processed.csv'

# Pickled unigram / bigram frequency distributions.
FREQ_DIST_FILE = './dataset/train_processed_freqdist.pkl'
BI_FREQ_DIST_FILE = './dataset/train_processed_freqdist_bi.pkl'

# Text file with the GloVe vectors read by get_glove_vectors.
GLOVE_FILE = './dataset/glove_seeds.txt'

In [None]:
def get_glove_vectors(vocab):
    """Load the GloVe vectors of the words that have a truthy entry in ``vocab``.

    GLOVE_FILE is read line by line; each line is split on whitespace into a
    word followed by its vector components.

    Returns:
        dict mapping each matched word to a NumPy array of floats.
    """
    print('Looking for GLOVE vectors')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for line_number, line in enumerate(glove_file, 1):
            utils.write_status(line_number, 0)
            tokens = line.split()
            word = tokens[0]
            if not vocab.get(word):
                continue
            components = [float(value) for value in tokens[1:]]
            glove_vectors[word] = np.array(components)
            found += 1
    print('\n')
    print('Found %d words in GLOVE' % found)
    return glove_vectors

In [None]:
def get_feature_vector(tweet):
    """Convert a tweet into the list of vocabulary indices of its words.

    The tweet is split on whitespace; words without an entry in the
    module-level ``vocab`` mapping are skipped, the others contribute their
    ``vocab`` value in order of appearance.
    """
    # One pass over all words with a single lookup each; the last word needs
    # no special treatment.
    lookups = (vocab.get(word) for word in tweet.split())
    return [index for index in lookups if index is not None]

In [None]:
def process_tweets(csv_file, test_file=True):
    """Turn every row of a processed CSV file into a feature vector.

    Rows are split on ',' into ``id,tweet`` when ``test_file`` is true and
    into ``id,sentiment,tweet`` otherwise.

    Returns:
        ``(tweets, labels)``: ``tweets`` is a list of index lists from
        get_feature_vector; ``labels`` is a NumPy array of the integer
        sentiments (empty for a test file).
    """
    print('Generating feature vectors')
    tweets, labels = [], []
    with open(csv_file, 'r') as handle:
        rows = handle.readlines()
        total = len(rows)
        for position, row in enumerate(rows, 1):
            fields = row.split(',')
            if test_file:
                _, tweet = fields
            else:
                _, sentiment, tweet = fields
            tweets.append(get_feature_vector(tweet))
            if not test_file:
                labels.append(int(sentiment))
            utils.write_status(position, total)
    print('\n')
    return tweets, np.array(labels)

In [None]:
# True when the script was started without any command-line arguments.
train = (len(sys.argv) == 1)

# Seed NumPy's global random generator.
np.random.seed(1337)

# Vocabulary size and fixed tweet length (in tokens).
vocab_size, max_length = 90000, 40

# Mini-batch size used for training.
batch_size = 128

# `filters` is the value passed to the LSTM layer; kernel_size is defined
# alongside it.
filters, kernel_size = 128, 3

In [None]:
# Vocabulary: per the original note, the vocab_size most frequent words, each
# mapped to its index.
vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)

# If the pre-trained GloVe dictionary has been downloaded, look the words up:
# glove_vectors = get_glove_vectors(vocab)

# Encode every training tweet as a list of word indices.
tweets, labels = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)

# Random initial embedding matrix (mean 0, std 0.01) with vocab_size + 1 rows.
embedding_shape = (vocab_size + 1, dim)
embedding_matrix = 0.01 * np.random.randn(*embedding_shape)

In [None]:
# When no pre-trained dictionary has been downloaded, leave glove_vectors empty.
glove_vectors = {}

# Overwrite the embedding-matrix row of every word that has a GloVe vector.
for word, row_index in vocab.items():
    pretrained = glove_vectors.get(word)
    if pretrained is None:
        continue
    embedding_matrix[row_index] = pretrained

In [None]:
# Bring every tweet to exactly max_length indices.
# Tweets with fewer tokens are padded with index 0 at the end (padding='post').
# NOTE(review): `truncating` is left at its default; per the Keras docs that
# default is 'pre', i.e. over-long tweets lose their leading tokens rather
# than their trailing ones -- confirm which is intended.
tweets = pad_sequences(tweets, maxlen=max_length, padding='post')

In [None]:
# Shuffle tweets and labels with one shared random permutation so that each
# tweet stays paired with its label.
shuffled_indices = np.random.permutation(len(tweets))
tweets, labels = tweets[shuffled_indices], labels[shuffled_indices]

In [None]:
# Build the model: embedding -> dropout -> LSTM -> dense(64) -> dropout ->
# relu -> dense(1) -> sigmoid, trained with binary cross-entropy and Adam.
model = Sequential([
    Embedding(vocab_size + 1, dim, weights=[embedding_matrix], input_length=max_length),
    Dropout(0.4),
    LSTM(filters),
    Dense(64),
    Dropout(0.5),
    Activation('relu'),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model.
if not os.path.exists('./models'):
    os.makedirs('./models')
filepath = "./models/lstm-{epoch:02d}-{loss:0.3f}-{acc:0.3f}-{val_loss:0.3f}-{val_acc:0.3f}.hdf5"

# Checkpoint after each epoch, keeping only models that lower val_loss.
checkpoint = ModelCheckpoint(filepath, monitor="val_loss", verbose=1, save_best_only=True, mode='min')

# When val_loss has not improved for `patience` epochs, scale the learning
# rate: new_lr = lr * factor (never below min_lr).
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)

# summary() prints the table itself and returns None, so wrapping it in a
# print would emit an extra "None" line.
model.summary()

model.fit(tweets, labels, batch_size=batch_size, epochs=1, validation_split=0.1,
          shuffle=True, callbacks=[checkpoint, reduce_lr])

In [None]:
# Evaluate the model on the test set.

# Choose the model to load: replace XXXXX with a checkpoint saved in ./models.
model = load_model('./models/XXXXX')
# summary() prints the table itself and returns None; no print needed.
model.summary()

test_tweets, _ = process_tweets(TEST_PROCESSED_FILE, test_file=True)
test_tweets = pad_sequences(test_tweets, maxlen=max_length, padding='post')
predictions = model.predict(test_tweets, batch_size=128, verbose=1)

# Pair each row's position (as a string) with its prediction rounded to an
# integer class. The ids are positional, not the ids stored in the CSV.
# list(...) keeps `results` a real list on Python 3 as well, where zip is lazy.
results = list(zip(map(str, range(len(test_tweets))), np.round(predictions[:, 0]).astype(int)))
utils.save_results_to_csv(results, 'lstm.csv')