In [288]:
import pandas as pd, numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import FeatureUnion, make_union
from importlib import reload 

In [56]:
# 重要特征词抽取: Gini系数
from sklearn.feature_extraction.text import CountVectorizer

# label
df = pd.read_csv('data/train.csv')
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['label'] = df[label_cols].max(axis=1)

# P
P = {}
P[0] = len(df[df['label']==0]) / len(df)
P[1] = len(df[df['label']==1]) / len(df)

# p
p = {}
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
word_cnt = count_vect.fit_transform(df['comment_text'])
words = count_vect.get_feature_names()
for i in range(len(df)):
    row = word_cnt.getrow(i)
    label = df['label'][i]
    for word_ind in row.indices:
        if word_ind >= len(words):continue
        if word_ind not in p: 
            p[word_ind] = [0,0,0]
        p[word_ind][label] += 1
        p[word_ind][2] += 1

# gini
ginis = []
for word_ind in p:
    norm_p0 = (p[0]/p[2]) / P[0]
    norm_p1 = (p[1]/p[2]) / P[1]
    gini = (norm_p0 / (norm_p0 + norm_p1))**2 + (norm_p1 / (norm_p0 + norm_p1))**2
    
    word = words[word_ind]
    if pw[0] > 10:
        continue
    if (norm_p1< 0.50) or len(word)<=2:
        continue
    if word.isdigit():
        continue
    ginis.append(word, gini)

ginis = sorted(ginis, key=lambda x:-x[1])

with open('data/badwords.txt', 'w') as g:
    line = 'word\tgini\n'
    g.write(line)
    for word, gini in ginis:
        line = '{}\t{}\n'.format(word, gini)
        g.write(line)

In [114]:
import re, string
class WordExtractorFeaturer(BaseEstimator):
    def __init__(self, words):
        self.words = words
        self.re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        
    def fit(self, xs, ys=None):
        return self
    
    def transform(self, xs):
        new_xs = []
        for x in xs:
            new_x = ' '.join([tok for tok in self._tokenize(x) if tok in self.words])
            new_xs.append(new_x)
        return new_xs
            
    def _tokenize(self, s): 
        return re_tok.sub(r' \1 ', s).split()

In [257]:
class CharGrpExtractorFeaturer(BaseEstimator):
    def __init__(self, char_grps=None):
        self.char_grps = char_grps
        self.re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        
    def fit(self, xs, ys=None):
        return self
    
    def transform(self, xs):
        new_xs = []
        for x in xs:
            grps = []
            bigrps = []
            for tok in self._tokenize(x):
                grps.extend(extract_group(tok, min_length=3))
                bigrps.extend(extract_ngroup(tok))
                
                print(tok, extract_ngroup(tok))
            
            new_x = ' '.join([grp for grp in grps if not self.char_grps or grp in self.char_grps])
            new_x2 = ' '.join([grp for grp in bigrps if not self.char_grps or grp in self.char_grps])
            new_xs.append(new_x + ' ' + new_x2)
        return new_xs
            
    def _tokenize(self, s): 
        return re_tok.sub(r' \1 ', s).split()

In [250]:
sletters = ['a', 'e', 'i', 'o', 'u', 'y']
def get_char_group(word):
    if not word:
        return ''
    
    if len(word)<=3:
        return word
    
    grp = ''
    #添加所有相连元音
    for i, ch in enumerate(word):
        if ch in sletters:
            grp += ch
        else:
            break
    if len(grp) == len(word):
        return grp
    
    # 添加所有相连辅音，如word=blast,  grp=bl
    starti = len(grp)
    for i in range(starti, len(word)):
        ch = word[i]
        if ch not in sletters:
            grp += ch
        else:
            break
    if len(grp) == len(word):
        return grp
                
    #添加所有元音，如word=blast, grp=bla
    starti = len(grp)
    for i in range(starti, len(word)):
        ch = word[i]
        if ch in sletters:
            grp += ch
        else:
            break
    if len(grp) == len(word):
        return grp
    
    #添加所有相连辅音(除了最后一个)
    starti = len(grp)
    for i in range(starti, len(word)):
        ch = word[i]
        if ch not in sletters:
            grp += ch
        else:
            break
    if len(grp) == len(word):
        return grp
    return grp[:-1]

In [268]:
def extract_group(word, min_length=0):
    groups = []
    while word:
        grp = get_char_group(word)
        if not grp:
            break
        word = word[len(grp):]
        if len(grp) >= min_length:
            groups.append(grp)
    return groups

def extract_ngroup(word, n=2, max_length=100000):
    groups = extract_group(word)
    if len(groups) <n:
        return []
    ngroups = []
    
    for i in range(len(groups)-n+1):
        ngrp =''.join(groups[i:i+n])
        if len(ngrp) < max_length:
            ngroups.append(''.join(groups[i:i+n]))
    return ngroups

In [248]:
f.transform(['omygod'])

['o', 'm', 'y', 'g', 'o', 'd']


['omy god o m y g o d']

In [222]:
from collections import defaultdict
grp_dict = defaultdict(int)
bigrp_dict = defaultdict(int)
for w, g in ginis:
    if w.isdigit():continue
    for grp in extract_group(w):
        if (len(grp)<3):
            continue
        grp_dict[grp] += 1
    for grp in extract_ngroup(w, 2):
        if (len(grp)>7):
            continue
        bigrp_dict[grp] += 1

In [286]:
bws = pd.read_csv('data/badwords.txt', sep='\t')
print(bws.columns)
for wd in bws['word']:
    print(wd)
    break

Index(['word', 'gini'], dtype='object')
cocksucker


In [287]:
bws.head()

Unnamed: 0,word,gini
0,cocksucker,1.0
1,shwain,1.0
2,semitian,1.0
3,greetingshhh,1.0
4,antisemmitian,1.0


In [269]:
f = CharGrpExtractorFeaturer()
print(f.transform(['this is test omygod']))

this []
is []
test []
omygod ['omygod']
['this test omy god omygod']


In [263]:
a=[]
a.extend('asdsf')
a

['a', 's', 'd', 's', 'f']

In [209]:
len(bigrp_dict), len(grp_dict)

(12831, 12271)

In [124]:
import re, string
class CharExtractorFeaturer(CountVectorizer):
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64, chars=None):
        super(CharExtractorFeaturer, self).__init__(input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='char',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64)
        self.chars = chars
        self.re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        
    def _char_ngrams(self, text_document):
        """Tokenize text_document into a sequence of character n-grams"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        text_len = len(text_document)
        min_n, max_n = self.ngram_range
        if min_n == 1:
            # no need to do any slicing for unigrams
            # iterate through the string
            ngrams = list(text_document)
            min_n += 1
        else:
            ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append
        
        for n in range(min_n, min(max_n + 1, text_len + 1)):
            for i in range(text_len - n + 1):
                piece = text_document[i: i + n]
                if self.chars and piece in self.chars:
                    ngrams_append(piece)
        return ngrams

In [127]:
words = ['this', 'test']
f = CharExtractorFeaturer(chars = words, ngram_range=(2,2))
f.fit(t)
f.transform(t).todense()

matrix([[3, 1, 1, 0, 1, 2, 0, 0, 0, 0, 0, 3, 3, 0],
        [1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1]])

In [112]:
t = pd.Series(["this is a test", "line from", " nowhere"])

In [104]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [128]:
ginis[100:130]

[('tharkarn', 1.0),
 ('hunjan', 1.0),
 ('millinos', 1.0),
 ('trollreasons', 1.0),
 ('diz', 1.0),
 ('carrierz', 1.0),
 ('otha', 1.0),
 ('kupla', 1.0),
 ('roun', 1.0),
 ('wrapt', 1.0),
 ('bdh', 1.0),
 ('sumptin', 1.0),
 ('fergit', 1.0),
 ('aftah', 1.0),
 ('fohget', 1.0),
 ('leedle', 1.0),
 ('fuhst', 1.0),
 ('remindah', 1.0),
 ('wallz', 1.0),
 ('rangerz', 1.0),
 ('hollerin', 1.0),
 ('sistahs', 1.0),
 ('brotherz', 1.0),
 ('gitz', 1.0),
 ('latah', 1.0),
 ('gooood', 1.0),
 ('mothafuckin', 1.0),
 ('sodomize', 1.0),
 ('oversited', 1.0),
 ('gorf', 1.0)]

In [118]:
list("this is a test<")

['t', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 't', 'e', 's', 't', '<']

In [117]:
CountVectorizer?