In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what the input directory contains by capturing the output of `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import re, os, time
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## 1.清理数据

机器学习工作中广为流传的一句话：“数据决定机器学习的上限，算法让我们不断逼近这个上限”。

一个干净的数据集是我们在运用机器学习算法取得成功的关键，因此，对文本进行合适的处理是非常关键的一步。

In [None]:
# Names of the binary target columns; one classifier is trained per entry.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Read the three CSV files from the input directory. The sample submission
# frame is filled with predictions later and written back out.
sample_submission = pd.read_csv('../input/sample_submission.csv')
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

以下是我在清洗文本过程中主要完成的工作：

1. 把你的文章分成一个个单独的单词。 
2. 将所有字符转换为小写。
3. 删除所有不相关的字符，例如任何非字母、数字字符。
4. 恢复所有简写形式的单词（例如将 “what's” 还原为 “what is”）。
5. 考虑将 “@”、“$”、“&” 等字符分别转换为 “at”、“dollar”、“and”。
6. 最后，有很多单词是拼写错误的，这个部分还需要想办法来处理。


In [None]:
def clean_text(comment_text):
    """Normalise raw comments into lower-cased, contraction-expanded text.

    Each comment is lower-cased, stripped of characters outside a small
    whitelist (newlines become spaces), has common English contractions
    expanded, and has the symbols ``&``, ``@`` and ``$`` spelled out as the
    words "and", "at" and "dollar".

    Args:
        comment_text: Iterable of comments (e.g. a list or a pandas Series).
            Missing entries (``None`` or float NaN) are treated as empty
            strings; any other non-string value is converted with ``str()``.

    Returns:
        list of str: One cleaned string per input comment, in input order.
    """
    # Ordered (pattern, replacement) pairs. Order matters: the specific forms
    # ("what's", "can't") must be rewritten before the generic "'s" / "n't".
    contractions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"cannot", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    # Replacement words are padded on BOTH sides so they never fuse with the
    # following token ("rock&roll" -> "rock and roll", not "rock androll").
    symbol_words = (
        ('&', ' and '),
        ('@', ' at '),
        ('$', ' dollar '),
    )

    comment_list = []
    for text in comment_text:
        if not isinstance(text, str):
            # Empty CSV cells arrive as float NaN; NaN is the only float that
            # does not compare equal to itself.
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = "" if is_missing else str(text)

        # Lower-case first so the contraction patterns only need one casing.
        text = text.lower()
        # Replace every character outside the whitelist with a space.
        text = re.sub(r"[^A-Za-z0-9(),!?@&$\'\`\"\_\n]", " ", text)
        text = re.sub(r"\n", " ", text)

        # Expand common contractions.
        for pattern, replacement in contractions:
            text = re.sub(pattern, replacement, text)

        # Spell out special symbols as English words.
        for symbol, word in symbol_words:
            text = text.replace(symbol, word)

        comment_list.append(text)
    return comment_list

# Add a cleaned copy of the raw comment column to both data frames.
for frame in (train, test):
    frame['clean_comment_text'] = clean_text(frame['comment_text'])

In [None]:
# Show the raw and cleaned text side by side for the first five rows.
preview_columns = ['comment_text', 'clean_comment_text']
train[preview_columns].head(5)

## 2.提取TF-IDF特征

自然语言处理的一个难点问题就是如何表示文本，机器学习模型都是以数值为输入，所以我们需要找到一种很好的表达方式让我们的算法能够理解文本数据。

### TF-IDF
为了帮助我们的模型更多地关注有意义的单词，我们可以使用TF-IDF进行特征提取。

In [None]:
# The vectorizer is fitted on the cleaned comments of train and test together,
# then each split is transformed separately with that fitted vectorizer.
all_comment_list = (
    train['clean_comment_text'].tolist() + test['clean_comment_text'].tolist()
)
text_vector = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
text_vector.fit(all_comment_list)
train_vec, test_vec = (
    text_vector.transform(frame['clean_comment_text']) for frame in (train, test)
)

In [None]:
# Display the repr of the TF-IDF matrix produced for the training comments.
train_vec

## 3.训练模型

我们使用一个非常简单的Logistic回归模型来进行分类。

In [None]:
# Hold out 10% of the training rows for validation; the fixed random_state
# makes the split repeatable. The target frame keeps all label columns.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_vec,
    train[labels],
    test_size=0.1,
    random_state=2018,
)
# The test features are used as-is for the final predictions.
x_test = test_vec

In [None]:
# Train one independent binary logistic-regression model per label column,
# report its accuracy on the training and validation splits, and store its
# test-set probabilities in the matching column of `sample_submission`.
accuracy = []
for label in labels:
    clf = LogisticRegression(C=6)
    clf.fit(x_train, y_train[label])

    train_scores = clf.score(x_train, y_train[label])
    y_pre = clf.predict(x_valid)
    # accuracy_score is documented as (y_true, y_pred). Accuracy happens to be
    # symmetric, but the documented order keeps this correct if the metric is
    # ever swapped for an asymmetric one (precision, recall, ...).
    valid_scores = accuracy_score(y_valid[label], y_pre)
    print("{} train score is {}, valid score is {}".format(label, train_scores, valid_scores))
    accuracy.append(valid_scores)

    # Column 1 of predict_proba corresponds to clf.classes_[1]; presumably the
    # positive class, assuming the label columns are 0/1 -- TODO confirm.
    pred_proba = clf.predict_proba(x_test)[:, 1]
    sample_submission[label] = pred_proba

# NOTE: this is the mean accuracy over labels on the single hold-out split
# created above, not a k-fold cross-validation score.
print("Total cv accuracy is {}".format(np.mean(accuracy)))

In [None]:
from datetime import datetime

def submission(submission):
    """Write a data frame to a timestamped CSV file in the current directory.

    The file is named after the current local time formatted as
    ``YYYY-MM-DD_HH-MM`` with a ``.csv`` suffix, and the index is not written.
    The chosen file name is printed.

    Args:
        submission: Object exposing ``to_csv`` (a pandas DataFrame here).
            Inside this function the parameter shadows the function's own name.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    file_name = '{}.csv'.format(timestamp)
    submission.to_csv(file_name, index=False)
    print("write to {}".format(file_name))
# Write the prediction-filled sample_submission frame to a timestamped CSV.
submission(sample_submission)