In [1]:
import os
import re
from collections import Counter

import joblib
import nltk
import numpy as np
from nltk.corpus import stopwords

In [2]:
# Already downloaded; uncomment on a machine that lacks the NLTK "stopwords" corpus.
# nltk.download("stopwords")

In [3]:
# Default stop-word set: NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))

In [10]:
# Alternative stop-word source: one word per line in a local stop_words.txt.
# NOTE(review): the parameter cell further down reassigns `stop_words`, so on a
# top-to-bottom run this value is discarded -- confirm this cell is still needed.
with open("stop_words.txt", "r", encoding="latin1") as f:
    # Build a set so the type matches the other `stop_words` assignments and
    # membership checks do not scan a list.
    stop_words = set(f.read().strip().split("\n"))

In [4]:
# Name of the corpus to process; it is interpolated into the corpus/ input
# paths and the temp/ output paths used by the cells below.
dataset = "mr"

In [5]:
# Vocabulary-filtering parameters.
# "mr" and any dataset whose name contains "SST" keep every word: no stop-word
# removal and no minimum frequency.
keep_all_words = dataset == "mr" or "SST" in dataset

# Defaults: drop NLTK English stop words and words seen fewer than 5 times.
stop_words = set(stopwords.words('english'))
least_freq = 5
if keep_all_words:
    stop_words, least_freq = set(), 0

In [6]:
# Loader for the raw texts and labels of a corpus.
def load_dataset(dataset):
    """Read the texts file and the labels file of a corpus.

    Args:
        dataset: Corpus name; ``corpus/{dataset}.texts.txt`` and
            ``corpus/{dataset}.labels.txt`` are opened relative to the
            current working directory.

    Returns:
        A ``(texts, labels)`` tuple. Each element is the list of lines of the
        corresponding file, after surrounding whitespace has been stripped
        from the file content as a whole.
    """
    texts_path = f"corpus/{dataset}.texts.txt"
    labels_path = f"corpus/{dataset}.labels.txt"

    # Texts are decoded as latin1; labels use the platform default encoding.
    with open(texts_path, "r", encoding="latin1") as texts_file:
        raw_texts = texts_file.read()
    with open(labels_path, "r") as labels_file:
        raw_labels = labels_file.read()

    return raw_texts.strip().split("\n"), raw_labels.strip().split("\n")

In [7]:
def filter_text(text: str) -> str:
    """Normalize one raw text into a lowercase, single-space-separated string.

    Steps, in order: lowercase; blank out e-mail addresses and dotted tokens
    (URLs); expand common English contractions; collapse runs of dots; drop
    one leading and one trailing dot; replace every character outside
    ``[A-Za-z0-9,.!?'`]`` with a space; surround ``,``, ``!`` and ``?`` with
    spaces; collapse whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r"([\w\.-]+)@([\w\.-]+)(\.[\w\.]+)", " ", text)  # remove e-mail addresses
    text = re.sub(r"([\w\.-]+)@([\w\.-]+)", " ", text)  # remove e-mail addresses without a dotted domain
    text = re.sub(r"([\w\.-]+)(\.[\w\.]+)", " ", text)  # remove URLs (matches any dotted token such as "a.b")

    # The contraction rules below are anchored on surrounding spaces. Pad the
    # text so they also fire on the first and last word; otherwise a leading
    # "can't" falls through to the generic rule and becomes "ca not", and a
    # trailing "isn't" is left unexpanded.
    text = f" {text} "
    contractions = (
        ("'ll ", " will "),
        ("'d ", " would "),
        ("'m ", " am "),
        ("'s ", " is "),
        ("'re ", " are "),
        ("'ve ", " have "),
        # The specific negations must run before the generic "n't " rule.
        (" can't ", " can not "),
        (" ain't ", " are not "),
        ("n't ", " not "),
    )
    for contraction, expansion in contractions:
        text = text.replace(contraction, expansion)

    text = text.replace(". . .", " . ")
    text = re.sub(r"\.{2,}", '.', text)  # collapse runs of dots into one
    text = re.sub(r'\.$', '', text.strip())  # drop one trailing dot
    text = re.sub(r'^\.', '', text.strip())  # drop one leading dot
    text = re.sub(r"[^A-Za-z0-9,.!?\'`]", " ", text)
    for mark in ",!?":
        text = text.replace(mark, f" {mark} ")
    # split()/join collapses every whitespace run and trims both ends.
    return " ".join(text.split())

In [21]:
def pos_text(text: str):
    """Split ``text`` on whitespace and tag the tokens with ``nltk.pos_tag``.

    Args:
        text: A text whose tokens are separated by whitespace.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the token list.
    """
    tokens = text.split()
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

In [23]:
# Load the raw texts and their labels for the selected dataset.
texts, labels = load_dataset(dataset)

In [24]:
# Normalize every raw text with filter_text.
texts_clean = list(map(filter_text, texts))

In [25]:
# Spot-check: display a single cleaned text.
texts_clean[16]

"slackers' jokey approach to college education is disappointingly simplistic the film is biggest problem and there are no unforgettably stupid stunts or uproariously rude lines of dialogue to remember it by"

In [26]:
# POS-tag every cleaned text; one list of tagged tokens per text.
text_pos = list(map(pos_text, texts_clean))

In [27]:
# Spot-check: display the tagged tokens of the same text.
text_pos[16]

[("slackers'", 'NN'),
 ('jokey', 'NN'),
 ('approach', 'NN'),
 ('to', 'TO'),
 ('college', 'NN'),
 ('education', 'NN'),
 ('is', 'VBZ'),
 ('disappointingly', 'RB'),
 ('simplistic', 'JJ'),
 ('the', 'DT'),
 ('film', 'NN'),
 ('is', 'VBZ'),
 ('biggest', 'JJS'),
 ('problem', 'NN'),
 ('and', 'CC'),
 ('there', 'EX'),
 ('are', 'VBP'),
 ('no', 'DT'),
 ('unforgettably', 'RB'),
 ('stupid', 'JJ'),
 ('stunts', 'NNS'),
 ('or', 'CC'),
 ('uproariously', 'RB'),
 ('rude', 'JJ'),
 ('lines', 'NNS'),
 ('of', 'IN'),
 ('dialogue', 'NN'),
 ('to', 'TO'),
 ('remember', 'VB'),
 ('it', 'PRP'),
 ('by', 'IN')]

In [28]:
# Corpus-wide frequency of every whitespace-separated token in the cleaned texts.
word2count = Counter(
    token
    for cleaned in texts_clean
    for token in cleaned.split()
)

In [29]:
# Keep [word, count] pairs for words that are frequent enough and not stop words.
word_count = [
    [word, freq]
    for word, freq in word2count.items()
    if not (freq < least_freq or word in stop_words)
]

In [30]:
# Vocabulary: each kept word mapped to its position in word_count.
word2index = {entry[0]: position for position, entry in enumerate(word_count)}

In [31]:
# Per text, the tokens that are part of the vocabulary (original order kept).
words_list = [
    [token for token in cleaned.split() if token in word2index]
    for cleaned in texts_clean
]

In [32]:
# Spot-check: display the vocabulary-filtered tokens of one text.
words_list[16]

["slackers'",
 'jokey',
 'approach',
 'to',
 'college',
 'education',
 'is',
 'disappointingly',
 'simplistic',
 'the',
 'film',
 'is',
 'biggest',
 'problem',
 'and',
 'there',
 'are',
 'no',
 'unforgettably',
 'stupid',
 'stunts',
 'or',
 'uproariously',
 'rude',
 'lines',
 'of',
 'dialogue',
 'to',
 'remember',
 'it',
 'by']

In [33]:
def words_pos_list(texts, text_pos, word2index):
    """Filter each text down to vocabulary words, keeping POS tags aligned.

    Args:
        texts: Iterable of whitespace-tokenizable strings.
        text_pos: Iterable parallel to ``texts``; element ``k`` is indexed by
            token position and each entry's item ``[1]`` is used as the tag.
        word2index: Container of vocabulary words; only membership is tested.

    Returns:
        ``(words_list, pos_list)``: two lists parallel to ``texts`` holding,
        for each text, the kept tokens and the tags at the same positions.
    """
    kept_words = []
    kept_tags = []
    for sentence, tagged in zip(texts, text_pos):
        words_row, tags_row = [], []
        for position, token in enumerate(sentence.split()):
            if token not in word2index:
                continue
            words_row.append(token)
            # Look the tag up by token position so words and tags stay aligned.
            tags_row.append(tagged[position][1])
        kept_words.append(words_row)
        kept_tags.append(tags_row)
    return kept_words, kept_tags

In [34]:
# Filter tokens and their POS tags together so both lists stay aligned.
# NOTE(review): `word_list` repeats the vocabulary filtering that already built
# `words_list`; the later `texts_remove` cell reads `words_list` -- confirm the
# two names are not meant to be one.
word_list, pos_list = words_pos_list(texts_clean, text_pos, word2index)

In [35]:
# Spot-check: display the POS tags kept for one text.
pos_list[16]

['NN',
 'NN',
 'NN',
 'TO',
 'NN',
 'NN',
 'VBZ',
 'RB',
 'JJ',
 'DT',
 'NN',
 'VBZ',
 'JJS',
 'NN',
 'CC',
 'EX',
 'VBP',
 'DT',
 'RB',
 'JJ',
 'NNS',
 'CC',
 'RB',
 'JJ',
 'NNS',
 'IN',
 'NN',
 'TO',
 'VB',
 'PRP',
 'IN']

In [36]:
# Corpus-wide frequency of every POS tag that survived the vocabulary filter.
pos2count = Counter(tag for tags in pos_list for tag in tags)

In [37]:
# [tag, count] pairs in the Counter's iteration order.
pos_count = [list(tag_and_freq) for tag_and_freq in pos2count.items()]

In [38]:
# Each POS tag mapped to its position in pos_count.
pos2index = {entry[0]: position for position, entry in enumerate(pos_count)}

In [39]:
# Re-join the vocabulary-filtered tokens of each text into one string.
texts_remove = list(map(" ".join, words_list))

In [40]:
# Spot-check: display one text after vocabulary filtering.
texts_remove[16]

"slackers' jokey approach to college education is disappointingly simplistic the film is biggest problem and there are no unforgettably stupid stunts or uproariously rude lines of dialogue to remember it by"

In [42]:
# labels 2 targets
# Sort the distinct labels so the label -> index mapping is identical on every
# run; iterating a bare set of strings follows hash order, which changes from
# one interpreter process to the next under hash randomization.
label2index = {l: i for i, l in enumerate(sorted(set(labels)))}
# Integer target per text, in the same order as `labels`.
targets = [label2index[l] for l in labels]

In [43]:
# save
# Create the output directory first so the writes below cannot fail with
# FileNotFoundError when temp/ does not exist yet.
os.makedirs("temp", exist_ok=True)

# Cleaned texts, one per line.
with open(f"temp/{dataset}.texts.clean.txt", "w") as f:
    f.write("\n".join(texts_clean))

# Vocabulary-filtered texts, one per line.
with open(f"temp/{dataset}.texts.remove.txt", "w") as f:
    f.write("\n".join(texts_remove))

# Integer targets and the word -> index vocabulary.
np.save(f"temp/{dataset}.targets.npy", targets)
joblib.dump(word2index, f"temp/{dataset}.word2index.pkl")

['temp/mr.word2index.pkl']