## download python code for drawing

In [None]:
# Fetch the python_paint_api repository; its paint_func module is imported further down.
!git clone https://github.com/weizaiff/python_paint_api.git

In [None]:
# List the contents of the cloned directory.
!ls python_paint_api/

## read data

In [None]:
import pandas as pd
from python_paint_api import paint_func
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np

In [None]:
# Load the validation pairs; the cells below use its worker, less_toxic and more_toxic columns.
df_data =pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

## samples

In [None]:
# Display 10 randomly drawn rows.
df_data.sample(10)

## number of validation rows

In [None]:
# Row count of the validation frame.
len(df_data)

## worker number

In [None]:
# Histogram of the values in the worker column.
df_data.hist(column=['worker'])

In [None]:
# Number of distinct values in the worker column.
df_data['worker'].nunique()

## text length of less toxic and more toxic data

In [None]:
def get_len(text):
    """Return the length of ``text``, treating a missing value as length 0.

    Args:
        text: a string (or any sized object). ``None`` and float NaN are
            accepted as "missing" because that is how pandas represents an
            empty CSV cell in an otherwise string-valued column.

    Returns:
        ``len(text)``, or 0 when ``text`` is ``None`` or NaN.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(text)

In [None]:
# Character count of every less_toxic comment, computed element by element.
df_data['less_toxic_len'] = df_data['less_toxic'].map(get_len)

In [None]:
# Character count of every more_toxic comment, computed element by element.
df_data['more_toxic_len'] = df_data['more_toxic'].map(get_len)

In [None]:
# Distribution of less_toxic comment lengths (in characters).
df_data.hist(column=['less_toxic_len'])

In [None]:
# Distribution of more_toxic comment lengths (in characters).
df_data.hist(column=['more_toxic_len'])

### length between 0-1000

In [None]:
# Re-plot with fixed 200-wide bins over 0-1000 and pass the plot to paint_func.bar_char_show_values.
paint_func.bar_char_show_values(df_data.hist(column=['less_toxic_len'],bins=[0,200,400,600,800,1000]))

In [None]:
# Same 0-1000 binning as the less_toxic_len plot, passed through the same
# paint_func.bar_char_show_values helper so both charts are rendered alike.
paint_func.bar_char_show_values(df_data.hist(column=['more_toxic_len'],bins=[0,200,400,600,800,1000]))

## unique text of less toxic one and more toxic one

In [None]:
# Number of distinct less_toxic texts (one Counter key per distinct value).
len(Counter(df_data['less_toxic'].tolist()))

In [None]:
# The 10 less_toxic texts that occur most often, with their counts.
Counter(df_data['less_toxic'].tolist()).most_common(10)

In [None]:
# Number of distinct more_toxic texts (one Counter key per distinct value).
len(Counter(df_data['more_toxic'].tolist()))

In [None]:
# The 10 more_toxic texts that occur most often, with their counts.
Counter(df_data['more_toxic'].tolist()).most_common(10)

## intersection of less and more toxic samples

In [None]:
# Number of distinct texts present in both the more_toxic and the less_toxic column.
len(set(df_data['more_toxic'].tolist()) & set(df_data['less_toxic'].tolist()))

## top words ranked by tf-idf

In [None]:
import nltk
from nltk.corpus import stopwords
# Load the English stop-word list once, then show it; get_tfidf uses it as its default.
stop_words = stopwords.words('english')
print(stop_words)

In [None]:

# Compute the TF-IDF values for a list of documents.
def get_tfidf(doc_list,stop_word_list=stop_words ,max_features=len(df_data)//2,min_df=10,norm ='l2'):
    """Fit a TfidfVectorizer on ``doc_list`` and return its weights in several shapes.

    Args:
        doc_list: iterable of document strings; each one is stripped and
            lower-cased before fitting.
        stop_word_list: forwarded to ``TfidfVectorizer(stop_words=...)``.
        max_features: vocabulary size cap forwarded to the vectorizer. Note the
            default is evaluated once, when this function is defined, from the
            ``df_data`` that exists at that moment.
        min_df: minimum document frequency forwarded to the vectorizer.
        norm: row normalisation forwarded to the vectorizer.

    Returns:
        dict with the keys
            'df': DataFrame with one row per document and one column per
                vocabulary term.
            'word_weight': list holding one ``{term: weight}`` dict per
                document, built from the entries stored in that document's
                sparse row.
            'idf': the fitted vectorizer's ``idf_`` array.
    """
    # Lower-case here so every term key produced below is lower-case.
    doc_list = [idoc.strip().lower() for idoc in doc_list]
    vectorizer = TfidfVectorizer(max_features=max_features,min_df=min_df,max_df=1.0,stop_words=stop_word_list, norm =norm )
    vectors = vectorizer.fit_transform(doc_list)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and keep the old call for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields the dense values directly, without the intermediate
    # nested Python list the previous todense().tolist() round-trip created.
    df = pd.DataFrame(vectors.toarray(), columns=feature_names)

    # Invert vocabulary_ (term -> column index) to look terms up by column index.
    index_value = {column: term for term, column in vectorizer.vocabulary_.items()}
    word_weight = [
        {index_value[column]: value for (column, value) in zip(row.indices, row.data)}
        for row in vectors
    ]

    res = {}
    res['df'] = df
    res['word_weight'] = word_weight
    res['idf'] = vectorizer.idf_
    return res
# Get the key words of each sentence: from the sentence's segmented words,
# keep those that have an entry in that sentence's word_weight dict.
def get_keyword_persen(sentence_seged,sen_weight):
    """Return, per sentence, its weighted words in order of first appearance.

    Args:
        sentence_seged: iterable of word lists, one list per sentence.
        sen_weight: iterable of ``{word: weight}`` dicts, paired with
            ``sentence_seged`` by position (pairing stops at the shorter one).

    Returns:
        A list with one list per sentence, containing each word of the
        sentence that is a key of the matching dict, without repeats.
    """
    per_sen_key_word=[]
    for iseg_word_list,iword_weight_dict in tqdm(zip(sentence_seged,sen_weight)):
        # dict.fromkeys removes repeats while keeping first-seen order, which
        # avoids a linear membership scan of the result list for every word.
        matched = dict.fromkeys(
            i_word for i_word in iseg_word_list if i_word in iword_weight_dict
        )
        per_sen_key_word.append(list(matched))
    return per_sen_key_word
# Rank the vocabulary terms by TF-IDF to get the most important words first.
def get_keyword_seq(df_tfidf):
    """Order the columns of ``df_tfidf`` by their largest value, descending.

    Args:
        df_tfidf: DataFrame with one column per term and one row per document.

    Returns:
        A pair of lists in the same ranked order: the first holds
        ``"<term>_<max value>"`` labels, the second the bare terms.
    """
    # Column-wise maximum: the highest weight each term reaches in any row.
    peak_scores = np.max(df_tfidf.values, axis=0)
    score_by_term = dict(zip(list(df_tfidf.columns), peak_scores))
    ranked = sorted(score_by_term.items(), key=lambda pair: pair[1], reverse=True)

    labelled_terms = [term + "_" + str(score) for term, score in ranked]
    plain_terms = [term for term, _score in ranked]
    return labelled_terms, plain_terms

### less toxic part

In [None]:
# Distinct less_toxic comments, stripped of surrounding whitespace.
all_doc = (list(set(df_data['less_toxic'].values.reshape(-1).tolist())))
all_doc = [idoc.strip()  for idoc in all_doc]
# get_tfidf lower-cases its input, so word_weight is keyed by lower-case terms;
# split the lower-cased text so capitalised words can still match those keys.
all_doc_seg_list = [idoc.lower().split(' ') for idoc in all_doc]
tfidf_res = get_tfidf(all_doc)

df_tfidf,sen_weight=tfidf_res['df'],tfidf_res['word_weight']
per_sen_key_word=get_keyword_persen(all_doc_seg_list,sen_weight)
# Sanity check: one TF-IDF row and one keyword list per document.
print(len(df_tfidf),len(per_sen_key_word))
tfidf_res['tfidf_res']=per_sen_key_word
sorted_word_list,sorted_word_list_for_paint = get_keyword_seq(df_tfidf)

In [None]:
# Recompute the ranked word lists from df_tfidf (same call as the last line of the previous cell).
sorted_word_list,sorted_word_list_for_paint = get_keyword_seq(df_tfidf)

In [None]:
# Keep only the highest-ranked words for display; num_top_word_toshow sets
# the size of both lists so they cannot drift apart.
num_top_word_toshow =100
sorted_word_list_draw =sorted_word_list[:num_top_word_toshow]
sorted_word_list_for_paint_draw = sorted_word_list_for_paint[:num_top_word_toshow]

In [None]:
# Hand the top-ranked less_toxic terms to the paint_func word-cloud helper.
paint_func.draw_wordcolud(sorted_word_list_for_paint_draw)

### more toxic part

In [None]:
# Distinct more_toxic comments, stripped of surrounding whitespace.
all_doc = (list(set(df_data['more_toxic'].values.reshape(-1).tolist())))
all_doc = [idoc.strip()  for idoc in all_doc]
# get_tfidf lower-cases its input, so word_weight is keyed by lower-case terms;
# split the lower-cased text so capitalised words can still match those keys.
all_doc_seg_list = [idoc.lower().split(' ') for idoc in all_doc]
tfidf_res = get_tfidf(all_doc)

df_tfidf,sen_weight=tfidf_res['df'],tfidf_res['word_weight']
per_sen_key_word=get_keyword_persen(all_doc_seg_list,sen_weight)
# Sanity check: one TF-IDF row and one keyword list per document.
print(len(df_tfidf),len(per_sen_key_word))
tfidf_res['tfidf_res']=per_sen_key_word
sorted_word_list,sorted_word_list_for_paint = get_keyword_seq(df_tfidf)

In [None]:
# Keep only the highest-ranked words for display; num_top_word_toshow sets
# the size of both lists so they cannot drift apart.
num_top_word_toshow =100
sorted_word_list_draw =sorted_word_list[:num_top_word_toshow]
sorted_word_list_for_paint_draw = sorted_word_list_for_paint[:num_top_word_toshow]

In [None]:
# Hand the top-ranked more_toxic terms to the paint_func word-cloud helper.
paint_func.draw_wordcolud(sorted_word_list_for_paint_draw)

In [None]:
from transformers import AutoTokenizer
import pandas as pd

### validation data length after tokenization

In [None]:
from transformers import AutoTokenizer
import pandas as pd

In [None]:
# Load the tokenizer from the local bert-base-uncased input directory.
tokenizer = AutoTokenizer.from_pretrained("../input/bert-base-uncased")

In [None]:
# df_val reads the same validation CSV as df_data above; df_test holds the comments to score.
df_val = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
df_test =pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Preview the first rows of the validation frame.
df_val.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Pool both validation columns into one flat collection and drop repeats,
# then do the same for the test comments.
val_text = list(set(df_val[['less_toxic','more_toxic']].to_numpy().ravel().tolist()))
test_text = list(set(df_test['text'].tolist()))

In [None]:
def get_token_len(text_list):
    """Return a DataFrame whose ``token_len`` column holds the encoded length of each text.

    Each text is encoded with the module-level ``tokenizer`` and the number of
    ids returned by ``tokenizer.encode`` is recorded, in input order.
    """
    encoded_lengths = [len(tokenizer.encode(text)) for text in text_list]
    df_token = pd.DataFrame()
    df_token['token_len'] = encoded_lengths
    return df_token

In [None]:
# Token counts for the distinct validation texts.
df_val_len = get_token_len(val_text)

In [None]:
# Token counts for the distinct test texts.
df_test_len = get_token_len(test_text)

In [None]:
# Histogram of validation token counts using uneven bin edges up to 2000.
df_val_len.hist(bins=[0,100,200,500,1000,2000])

In [None]:
# Histogram of test token counts using the same bin edges as the validation plot.
df_test_len.hist(bins=[0,100,200,500,1000,2000])