In [4]:
import pandas as pd
import numpy as np

# Load the training frame, the test frame and the test-label frame from ./input.
data_train, data_test, test_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/train.csv',
        './input/test.csv',
        './input/test_labels.csv',
    )
)

In [5]:
# Keep only the test rows whose 'toxic' label is not -1
# (presumably -1 marks rows that were never scored -- confirm against the dataset notes).
# NOTE: this cell overwrites data_test and test_label, so it must run exactly once
# after the CSVs are loaded.
col = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
score = list(test_label['toxic'] == -1)
score_index = [position for position, is_unscored in enumerate(score) if not is_unscored]

# data_test becomes the comment_text Series restricted to the kept rows.
data_test = data_test['comment_text'][score_index]
# test_label becomes a plain array of the six label columns for the same rows.
test_label = test_label[col].values[score_index]

## 数据预处理

### 缺失值

In [4]:
# Report the number of missing values in every column of the training frame.
print("数据集中的缺失值：")
null_check = data_train.isnull().sum()
print(null_check)
# Fill missing comments with a placeholder. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the column selection: the
# in-place form acts on an intermediate object, which pandas 2.x warns about
# and which leaves data_train unchanged once Copy-on-Write is enabled.
data_train["comment_text"] = data_train["comment_text"].fillna("unknown")

数据集中的缺失值：
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


### 清洗数据

In [5]:
# Lookup table mapping lower-case English contractions to their expanded form.
# clean() replaces any token found among these keys with the mapped value.
# Every key appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence, which hides which expansion is in effect.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is expanded to "would" for every pronoun in this table; the former
    # second "i'd" entry ("I had") overrode this one and broke that consistency.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Bare suffix token; the leading space in the value is kept as in the original table.
    "'re": " are",
    "wasn't": "was not",
    # Previously mapped to " will", which dropped the subject "we".
    "we'll": "we will",
    "tryin'": "trying",
}

In [6]:
#nlp
import string
import re    #for regex
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
# English stop-word list from the NLTK corpus, held as a set.
# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
eng_stopwords = set(stopwords.words("english"))
# WordNet lemmatizer instance; clean() calls lem.lemmatize(word, "v") on every token.
lem = WordNetLemmatizer()
# Tweet tokenizer instance; clean() calls token.tokenize(text) to split a comment.
token = TweetTokenizer()

def clean(text):
    """Normalise one comment string before it is fed to the tokenizer.

    Steps, in order: replace newlines with spaces, delete runs of digits,
    split into tokens with the module-level ``token`` tokenizer, expand
    contractions through ``APPO``, lemmatise every token as a verb with the
    module-level ``lem`` lemmatizer, and join the tokens with single spaces.

    Args:
        text: the comment to clean; must be a ``str`` (``re.sub`` rejects
            other types, so missing values have to be filled beforehand).

    Returns:
        str: the cleaned tokens joined by single spaces.
    """
    # Newlines
    text = re.sub(r"\n", " ", text)
    # Digits (raw string: "\d" in a normal string literal is an invalid escape)
    text = re.sub(r"\d+", "", text)

    # Tokenise
    words = token.tokenize(text)
    # Contractions: APPO keys are all lower-case, so look up the lower-cased
    # token; otherwise "I'm", "Don't", ... would never be expanded. Tokens that
    # are not contractions are kept with their original casing.
    words = [APPO.get(word.lower(), word) for word in words]
    # Tense: reduce each token to its verb lemma
    words = [lem.lemmatize(word, "v") for word in words]
    return " ".join(words)

In [7]:
# Run clean() over every training comment and every test comment.
# The function is passed directly; wrapping it in a lambda adds nothing.
data_train['comment_text'] = data_train['comment_text'].apply(clean)
data_test = data_test.apply(clean)

### 构建词表

In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Targets: the six label columns of the training frame as a plain array.
col = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = data_train[col].values

# The tokenizer is fitted on the training comments only and is then used to
# encode both the training and the test comments.
data = data_train["comment_text"]
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(data))

# Encode each split as integer sequences, then pad them with maxlen 200.
text_train, text_test = (
    tokenizer.texts_to_sequences(texts) for texts in (data, data_test)
)
X_train, X_test = (
    pad_sequences(sequences, maxlen = 200) for sequences in (text_train, text_test)
)

In [10]:
# Persist the padded inputs and the label arrays as .npy files under ./input.
for npy_path, array in (
    ('./input/dataset_train.npy', X_train),
    ('./input/dataset_test.npy', X_test),
    ('./input/train_labels.npy', y_train),
    ('./input/test_labels.npy', test_label),
):
    np.save(npy_path, array)

## glove-tweet

In [11]:
path = './input/'
EMBEDDING_FILE = path + 'glove.twitter.27B.200d.txt'


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        tuple: ``(word, numpy.ndarray of dtype float32)``.
    """
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector dictionary. The file is read inside a ``with`` block
# so the handle is closed as soon as the dictionary is built (the previous bare
# open() call never closed it).
with open(EMBEDDING_FILE, encoding='UTF-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.strip().split(' ')) for o in embedding_file
    )

In [13]:
embed_size = 200
word_index = tokenizer.word_index
# word_index values start at 1 (index 0 is never assigned to a word), so the
# matrix needs len(word_index) + 1 rows to hold every word when the vocabulary
# is smaller than max_features. Using len(word_index) made the last word index
# one past the end of the matrix and raised IndexError in that case.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pretrained vector keep this random initialisation.
embedding_matrix = np.random.normal(0.001, 0.25, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
# Persist the matrix built from the glove.twitter vectors as a .npy file.
np.save('./input/embedding_tweet.npy',embedding_matrix)

## glove-840B

In [15]:
path = './input/'
EMBEDDING_FILE = path + 'glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        tuple: ``(word, numpy.ndarray of dtype float32)``.
    """
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector dictionary. The file is read inside a ``with`` block
# so the handle is closed as soon as the dictionary is built (the previous bare
# open() call never closed it). Lines are split on a single space character.
with open(EMBEDDING_FILE, encoding='UTF-8') as embedding_file:
    embeddings_index2 = dict(
        get_coefs(*o.strip().split(' ')) for o in embedding_file
    )

In [17]:
embed_size = 300
# word_index values start at 1 (index 0 is never assigned to a word), so the
# matrix needs len(word_index) + 1 rows to hold every word when the vocabulary
# is smaller than max_features. Using len(word_index) made the last word index
# one past the end of the matrix and raised IndexError in that case.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pretrained vector keep this random initialisation.
embedding_matrix2 = np.random.normal(0.001, 0.25, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index2.get(word)
    if embedding_vector is not None:
        embedding_matrix2[i] = embedding_vector

In [18]:
# Persist the matrix built from the glove.840B vectors as a .npy file.
np.save('./input/embedding_glove.npy',embedding_matrix2)

## Fasttext

In [19]:
path = './input/'
EMBEDDING_FILE = path + 'crawl-300d-2M.vec'


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        tuple: ``(word, numpy.ndarray of dtype float32)``.
    """
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector dictionary. The file is read inside a ``with`` block
# so the handle is closed as soon as the dictionary is built (the previous bare
# open() call never closed it).
# fastText .vec files begin with a "<word count> <dimension>" header line; it
# has only two fields and used to be stored as a word with a 1-element vector,
# which would broadcast across a whole matrix row if that token were ever
# looked up. Lines with fewer than three fields are therefore skipped.
with open(EMBEDDING_FILE, encoding='UTF-8') as embedding_file:
    embeddings_index3 = dict(
        get_coefs(*fields)
        for fields in (o.strip().split(' ') for o in embedding_file)
        if len(fields) > 2
    )

In [21]:
embed_size = 300
# word_index values start at 1 (index 0 is never assigned to a word), so the
# matrix needs len(word_index) + 1 rows to hold every word when the vocabulary
# is smaller than max_features. Using len(word_index) made the last word index
# one past the end of the matrix and raised IndexError in that case.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pretrained vector keep this random initialisation.
embedding_matrix3 = np.random.normal(0.001, 0.25, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index3.get(word)
    if embedding_vector is not None:
        embedding_matrix3[i] = embedding_vector

In [22]:
# Persist the matrix built from the fastText crawl vectors as a .npy file.
np.save('./input/embedding_fasttext.npy',embedding_matrix3)