# Dota Dataset Notebook 4 - Jigsaw Classifier on Dota Data and EDA

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import langdetect as ld
from textblob import TextBlob

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

## Parts of Notebook 1

In [2]:
df = pd.read_csv('dota2_chat_messages.csv', nrows=50000)
df.head()

Unnamed: 0,match,time,slot,text
0,0,1005.12122,9,ладно гг
1,0,1005.85442,9,изи
2,0,1008.65372,9,од
3,0,1010.51992,9,ебаный
4,0,1013.91912,9,мусор на войде


In [3]:
# Labeling languages
langs = np.zeros(len(df)).astype(str)
i = -1
for message in df['text'].values:
    i += 1
    try:
        langs[i] = ld.detect(message)
    except:
        continue
df['language'] = langs

In [4]:
# Fixing some languages due to acronyms
lang_fix = df.copy()
lang_fix = lang_fix.mask(df['text'].str.contains('(ez)|(Ez)|(EZ)'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('(lol)|(Lol)|(LOL)'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('(gg)|(Gg)|(GG)'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('(ty)|(Ty)|(TY)'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('(xD)|(XD)'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('[Rr]eport'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('STUPID|[Ss]tupid'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('[Ff][Uu][Cc][Kk]|[Ss]hit'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('[Nn][Oo][Oo][Bb]'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('retard|RETARD'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('pls|stfu|omg|OMG|wtf|WTF|wp|guys|kill|KILL|god|feed|FEED|btw'),'en')
lang_fix = lang_fix.mask(df['text'].str.contains('idiot|IDIOT|defend|dumb|end'), 'en')
lang_fix = lang_fix.mask(df['text'].str.contains('good|game|nice|thx|THX'), 'en')
df['language'] = lang_fix['language']

In [5]:
eng = df[df['language']=='en'].drop('language', axis=1)
eng.head()

Unnamed: 0,match,time,slot,text
9,1,-131.14018,0,twitch.tv/rage_channel
29,2,1563.1849,0,fast and furious
31,2,1996.3936,8,idiot drow
32,2,2006.2939,2,no idiot
37,2,2263.3697,2,lol


In [6]:
# text length
eng['text length'] = [len(text) for text in eng['text']]

# num messages sent by that player determined by unique match and slot combinations
num_messages = eng.groupby(['match', 'slot']).size()
num_messages = pd.DataFrame(num_messages).rename({0: 'num messages'}, axis=1)
eng['index'] = eng.index
eng = eng.set_index(['match', 'slot'])
eng = eng.join(num_messages).reset_index().sort_values('index').set_index('index')

# polarity and subjectivity
eng['polarity'] = eng['text'].apply(lambda x: TextBlob(x).sentiment[0])
eng['subjectivity'] = eng['text'].apply(lambda x: TextBlob(x).sentiment[1])

# num caps
capitals = []
for text in eng['text']:
    num_upper = sum([letter.isupper() for letter in text])
    capitals.append(num_upper)
eng['num caps'] = capitals

eng.head()

Unnamed: 0_level_0,match,slot,time,text,text length,num messages,polarity,subjectivity,num caps
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9,1,0,-131.14018,twitch.tv/rage_channel,22,1,0.0,0.0,0
29,2,0,1563.1849,fast and furious,16,1,0.2,0.6,0
31,2,8,1996.3936,idiot drow,10,1,-0.8,0.8,0
32,2,2,2006.2939,no idiot,8,2,0.4,0.8,0
37,2,2,2263.3697,lol,3,2,0.8,0.7,0


## Parts of Notebook 2

In [7]:
# Dropping links
eng = eng.drop(eng[eng['text'].str.contains("(\.tv)")].index).drop(eng[eng['text'].str.contains("(\.com)")].index)
eng.shape

(14651, 9)

In [8]:
# Loading stop words
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stopwords.remove("you")
stopwords.remove("you're")
stopwords.remove("yourself")

In [9]:
# Dropping stop words
def drop_stop_words(text):
    text = text.split(" ")
    nonstop_words = [word for word in text if word not in stopwords]
    string = ""
    for word in nonstop_words:
        string += word + " "
    return string[:len(string)-1]

eng['text'] = eng['text'].apply(drop_stop_words)

In [10]:
# Dropping words that are only used once or twice
words = []
for row in eng['text'].str.split(" "):
    for word in row:
        words.append(word)
word_counts = pd.Series(words).str.lower().value_counts()
rare_words = word_counts[word_counts < 3].index

def drop_rare_words(text):
    text = text.split(" ")
    nonrare_words = [word for word in text if word.lower() not in rare_words]
    string = ""
    for word in nonrare_words:
        string += word + " "
    return string[:len(string)-1]

eng['text'] = eng['text'].apply(drop_rare_words)

_____

# Start of Notebook 4 Work

## Final Jigsaw Classifier From Notebook 3

Tfidf char + tfidf word + len + prop caps

In [12]:
comments = pd.read_csv("jigsaw_train.csv")
comments['comment_text'] = comments['comment_text'].str.replace("\n", " ")

test = pd.read_csv('jigsaw_test.csv')

comments.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [13]:
def num_upper(text):
    """Number of capital letters in a string."""
    num = 0
    for i in text:
        if i.isupper():
            num += 1
    return num

In [16]:
import nltk
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stopwords.remove("you")
stopwords.remove("you're")
stopwords.remove("yourself")
stopwords[:10]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 "you've",
 "you'll"]

In [17]:
print("There will be {} different stop words dropped.".format(len(stopwords)))

There will be 176 different stop words dropped.


In [18]:
# Dropping stop words
def drop_stop_words(text):
    text = text.split(" ")
    nonstop_words = [word for word in text if word not in stopwords]
    string = ""
    for word in nonstop_words:
        string += word + " "
    return string[:len(string)-1]

eng['text'] = eng['text'].apply(drop_stop_words)

In [19]:
# Finding words that are only used once or twice
words = []
for row in eng['text'].str.split(" "):
    for word in row:
        words.append(word)
word_counts = pd.Series(words).str.lower().value_counts()
rare_words = word_counts[word_counts < 3].index
rare_words[:30]

Index([], dtype='object')

In [20]:
print("There will be {} different rarely used words dropped.".format(len(rare_words)))

There will be 0 different rarely used words dropped.


In [21]:
def drop_rare_words(text):
    """Removes stop words from a string."""
    text = text.split(" ")
    nonrare_words = [word for word in text if word.lower() not in rare_words]
    string = ""
    for word in nonrare_words:
        string += word + " "
    return string[:len(string)-1]

eng['text'] = eng['text'].apply(drop_rare_words)

In [25]:
testing = test.copy()

# Cleaning text
testing['comment_text'] = testing['comment_text'].str.replace(r"[(\.),(\|)!:='&(\*)(\")]", "")
testing['comment_text'] = testing['comment_text'].str.replace("\n", "")

# Getting length
testing['len'] = testing['comment_text'].apply(len) - testing['comment_text'].str.count(" ")
len_min = testing['len'].min()
len_max = testing['len'].max()
testing['len'] = (testing['len'].values - len_min) / (len_max - len_min)

# Getting prop caps
testing['caps'] = testing['comment_text'].apply(num_upper)
testing['proportion of caps'] = testing['caps'] / testing['len']

testing['proportion of caps'] = testing['proportion of caps'].fillna(0)
testing = testing.drop(['id', 'caps'], axis=1)
testing.head(3)

Unnamed: 0,comment_text,len,proportion of caps
0,Yo bitch Ja Rule is more succesful then youll ...,0.056914,0.014085
1,From RfC The title is fine as it is IMO,0.006012,0.233333
2,Sources Zawe Ashton on Lapland — /,0.005611,0.142857


In [26]:
from sklearn.pipeline import make_union

train_text = comments['comment_text']
test_text = test['comment_text']
text = pd.concat([train_text, test_text])

# Tfidf 'word'
word_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word', 
                                  token_pattern=r'\w{1,}', ngram_range=(1, 1), max_features=30000)
# Tfidf 'char'
char_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', 
                                  ngram_range=(1, 4), max_features=30000)

vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)
vectorizer.fit(text)

train_vector = vectorizer.transform(train_text)
test_vector = vectorizer.transform(test_text)

In [27]:
comments_copy = comments.copy()

# Cleaning text
comments_copy['comment_text'] = comments_copy['comment_text'].str.replace(r"[(\.),(\|)!:='&(\*)(\")]", "")
comments_copy['comment_text'] = comments_copy['comment_text'].str.replace("\n", "")

# Getting length
comments_copy['len'] = comments_copy['comment_text'].apply(len) - comments_copy['comment_text'].str.count(" ")
len_min = comments_copy['len'].min()
len_max = comments_copy['len'].max()
comments_copy['len'] = (comments_copy['len'].values - len_min) / (len_max - len_min)

# Getting prop caps
comments_copy['caps'] = comments_copy['comment_text'].apply(num_upper)
comments_copy['proportion of caps'] = comments_copy['caps'] / comments_copy['len']

comments_copy['proportion of caps'] = comments_copy['proportion of caps'].fillna(0)
comments_copy = comments_copy.drop(['id', 'caps'], axis=1)
comments_copy.head(3)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,len,proportion of caps
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0,0.042129,0.079812
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0,0.015924,0.096386
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0,0.036686,0.021505


In [28]:
from scipy.sparse import hstack

# Combining all features
final_training = hstack([train_vector, comments_copy[['len', 'proportion of caps']]])
final_testing = hstack([test_vector, testing[['len', 'proportion of caps']]])

In [29]:
from sklearn.linear_model import LogisticRegression

labels = comments.iloc[:,2:]

results = {}
for i in range(len(labels.columns)):
    lr = LogisticRegression(random_state=42, solver='sag').fit(final_training, labels.iloc[:,i])
    results[labels.columns[i]] = lr.predict_proba(final_testing)[:,1]

In [30]:
submission = pd.DataFrame({'id': test['id']})
submission['toxic'] = results['toxic']
submission['severe_toxic'] = results['severe_toxic']
submission['obscene'] = results['obscene']
submission['threat'] = results['threat']
submission['insult'] = results['insult']
submission['identity_hate'] = results['identity_hate']
submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999866,0.229119,0.999590,0.048832,0.984013,0.289492
1,0000247867823ef7,0.005770,0.001766,0.002655,0.000289,0.003934,0.002078
2,00013b17ad220c46,0.012792,0.002422,0.007127,0.000634,0.003973,0.001539
3,00017563c3f7919a,0.002720,0.001371,0.002177,0.000794,0.002828,0.000499
4,00017695ad8997eb,0.014492,0.001194,0.004055,0.000786,0.005080,0.000941
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.431732,0.001495,0.056418,0.000612,0.016494,0.002180
153160,fffd7a9a6eb32c16,0.034610,0.003657,0.018900,0.002041,0.011403,0.009344
153161,fffda9e8d6fafa9e,0.003880,0.000597,0.007355,0.000365,0.002151,0.000835
153162,fffe8f1340a79fc2,0.011747,0.000864,0.006218,0.000630,0.006603,0.008448


____

## Jigsaw Classifier on Dota Data

In [31]:
dota_text = eng.copy()

# Cleaning text
dota_text['text'] = dota_text['text'].str.replace(r"[(\.),(\|)!:='&(\*)(\")]", "")
dota_text['text'] = dota_text['text'].str.replace("\n", "")

# Getting length
dota_text['len'] = dota_text['text'].apply(len) - dota_text['text'].str.count(" ")
len_min = dota_text['len'].min()
len_max = dota_text['len'].max()
dota_text['len'] = (dota_text['len'].values - len_min) / (len_max - len_min)

# Getting prop caps
dota_text['caps'] = dota_text['text'].apply(num_upper)
dota_text['proportion of caps'] = dota_text['caps'] / dota_text['len']

dota_text['proportion of caps'] = dota_text['proportion of caps'].fillna(0)
dota_text = dota_text.drop('caps', axis=1)
dota_text.head(3)

Unnamed: 0_level_0,text,len,proportion of caps
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29,fast,0.0625,0.0
31,idiot drow,0.140625,0.0
32,idiot,0.078125,0.0


In [32]:
train_text = comments['comment_text']
test_text = dota_text['text']
text = pd.concat([train_text, test_text])

# Tfidf 'word'
word_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word', 
                                  token_pattern=r'\w{1,}', ngram_range=(1, 1), max_features=30000)
# Tfidf 'char'
char_vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char', 
                                  ngram_range=(1, 4), max_features=30000)

vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)
vectorizer.fit(text)

train_vector = vectorizer.transform(train_text)
test_vector = vectorizer.transform(test_text)

In [33]:
# Combining all features
final_training = hstack([train_vector, comments_copy[['len', 'proportion of caps']]])
final_testing = hstack([test_vector, dota_text[['len', 'proportion of caps']]])

In [34]:
labels = comments.iloc[:,2:]

results = {}
for i in range(len(labels.columns)):
    lr = LogisticRegression(random_state=42, solver='sag').fit(final_training, labels.iloc[:,i])
    results[labels.columns[i]] = lr.predict_proba(final_testing)[:,1]

In [35]:
labeled_dota = pd.DataFrame({'text': dota_text['text']})
labeled_dota['toxic'] = results['toxic']
labeled_dota['severe_toxic'] = results['severe_toxic']
labeled_dota['obscene'] = results['obscene']
labeled_dota['threat'] = results['threat']
labeled_dota['insult'] = results['insult']
labeled_dota['identity_hate'] = results['identity_hate']
labeled_dota

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29,fast,0.065864,0.004299,0.019833,0.001090,0.012755,0.005869
31,idiot drow,0.995506,0.009581,0.443928,0.001985,0.968772,0.005197
32,idiot,0.999989,0.009006,0.935135,0.001004,0.999757,0.007520
37,lol,0.032396,0.001351,0.009841,0.000687,0.011788,0.002649
38,COMMEND ME TY,0.198288,0.008663,0.011948,0.002053,0.015666,0.005051
...,...,...,...,...,...,...,...
49991,go end,0.086283,0.003587,0.021943,0.001428,0.035871,0.003878
49992,end,0.007481,0.001937,0.003965,0.000887,0.005692,0.001408
49994,thx,0.022971,0.002578,0.005348,0.001216,0.007994,0.003062
49997,omg,0.042616,0.003700,0.017039,0.000814,0.005284,0.003588


### Observing results:

In [36]:
labeled_dota.sort_values('toxic', ascending=False).head(15)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37442,fuck shit,1.0,0.936183,1.0,0.004947,0.996298,0.166963
37399,FUCK,1.0,0.987952,1.0,0.007583,0.997696,0.121982
30280,FUCK,1.0,0.987952,1.0,0.007583,0.997696,0.121982
19712,FUCK,1.0,0.987952,1.0,0.007583,0.997696,0.121982
119,FUCK,1.0,0.987952,1.0,0.007583,0.997696,0.121982
32723,idiot fuck,1.0,0.351721,1.0,0.003123,0.999967,0.035741
17665,stupid fuck,1.0,0.67419,1.0,0.012387,0.999671,0.168069
7913,FUCKING IDIOT,1.0,0.84551,1.0,0.015431,0.999989,0.083444
17730,SO FUCKING IDIOT,1.0,0.804833,1.0,0.014088,0.999979,0.13868
34340,Fuck,1.0,0.97243,1.0,0.004848,0.996921,0.089914


A problem is one-word texts. "Fuck" in this context is not necessarily toxic nor an insult.

**For the final product: When receiving such a short input, return "too little information".**
* Exception: extremes such as racial slurs, especially spammed

Because of this problem, entries with texts that have more than one word will be looked at.

In [37]:
# more than 1 word
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('toxic', ascending=False).head(20)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37442,fuck shit,1.0,0.936183,1.0,0.004947,0.996298,0.166963
32723,idiot fuck,1.0,0.351721,1.0,0.003123,0.999967,0.035741
17665,stupid fuck,1.0,0.67419,1.0,0.012387,0.999671,0.168069
7913,FUCKING IDIOT,1.0,0.84551,1.0,0.015431,0.999989,0.083444
17730,SO FUCKING IDIOT,1.0,0.804833,1.0,0.014088,0.999979,0.13868
41240,fucking shit,1.0,0.937701,1.0,0.024636,0.997248,0.188213
42956,fucking shit,1.0,0.937701,1.0,0.024636,0.997248,0.188213
43953,fuck stupid,1.0,0.738962,1.0,0.008442,0.99968,0.152913
40053,FUCK YOU,1.0,0.981353,1.0,0.018799,0.998572,0.079567
45836,fuck,1.0,0.958958,1.0,0.003755,0.992296,0.078246


* Toxic seems to do well.

In [38]:
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('severe_toxic', ascending=False).head(20)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
40053,FUCK YOU,1.0,0.981353,1.0,0.018799,0.998572,0.079567
20134,FUCK OFF,1.0,0.971292,1.0,0.008755,0.976415,0.087178
45267,FUCK U,1.0,0.964495,1.0,0.015596,0.988328,0.101441
18611,FUCK U,1.0,0.964495,1.0,0.015596,0.988328,0.101441
25768,FUCK U,1.0,0.964495,1.0,0.015596,0.988328,0.101441
45836,fuck,1.0,0.958958,1.0,0.003755,0.992296,0.078246
29051,FUCK IT,1.0,0.954918,1.0,0.003302,0.951204,0.056171
5137,fuck you,1.0,0.950933,1.0,0.013201,0.998483,0.057055
9960,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017
40330,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017


* severe_toxic is questionable, but decided to remove this label completely due to limited number of rows trained on
   * Other labels have limited rows, but they still do very well and are kept.

In [39]:
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('obscene', ascending=False).head(10)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37442,fuck shit,1.0,0.936183,1.0,0.004947,0.996298,0.166963
45836,fuck,1.0,0.958958,1.0,0.003755,0.992296,0.078246
41240,fucking shit,1.0,0.937701,1.0,0.024636,0.997248,0.188213
42956,fucking shit,1.0,0.937701,1.0,0.024636,0.997248,0.188213
40053,FUCK YOU,1.0,0.981353,1.0,0.018799,0.998572,0.079567
21472,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017
9960,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017
10835,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017
7155,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017
26038,fuck you,1.0,0.944714,1.0,0.010403,0.997898,0.052017


* obscene does well (indicates bad language)

In [40]:
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('threat', ascending=False).head(20)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
27401,I KILL YOU,0.997216,0.159925,0.279168,0.996062,0.155496,0.032277
20472,I kill you,0.988382,0.066419,0.202735,0.993359,0.115954,0.021918
27983,kill you,0.983451,0.061299,0.1934,0.978671,0.140604,0.023819
29111,kill you,0.983451,0.061299,0.1934,0.978671,0.140604,0.023819
29109,kill you,0.983451,0.061299,0.1934,0.978671,0.140604,0.023819
29106,kill you,0.983451,0.061299,0.1934,0.978671,0.140604,0.023819
16011,kill,0.938727,0.035013,0.11053,0.954028,0.024938,0.017654
23475,kill,0.938727,0.035013,0.11053,0.954028,0.024938,0.017654
19681,you kill ?,0.959866,0.065744,0.160221,0.949001,0.090758,0.021837
41711,you kill,0.976462,0.060887,0.192054,0.943801,0.140968,0.026538


* threat can do well with certain thresholds/text lengths

In [41]:
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('insult', ascending=False).head(20)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7913,FUCKING IDIOT,1.0,0.84551,1.0,0.015431,0.999989,0.083444
11625,fucking idiot,1.0,0.639903,1.0,0.008526,0.999984,0.054631
9372,fucking idiot,1.0,0.639903,1.0,0.008526,0.999984,0.054631
17730,SO FUCKING IDIOT,1.0,0.804833,1.0,0.014088,0.999979,0.13868
32723,idiot fuck,1.0,0.351721,1.0,0.003123,0.999967,0.035741
42160,hey faggot ass show stupid fuck support,1.0,0.896189,1.0,0.028711,0.999947,0.730871
31622,fucking idiots,1.0,0.751551,0.999995,0.006486,0.999929,0.058888
6287,fucking stupid,1.0,0.770701,1.0,0.040739,0.999781,0.199094
29120,shut fuck bitch,1.0,0.879229,1.0,0.00415,0.99976,0.108887
43953,fuck stupid,1.0,0.738962,1.0,0.008442,0.99968,0.152913


* Insult does pretty well

In [42]:
labeled_dota[labeled_dota['text'].str.contains(' ')].sort_values('identity_hate', ascending=False).head(20)

Unnamed: 0_level_0,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
33905,ur nigger,0.986108,0.066859,0.876924,0.003155,0.845558,0.995069
48330,nah nigger,0.94443,0.034314,0.59123,0.002116,0.761974,0.990782
2675,fat nigga,0.982548,0.0616,0.793545,0.004056,0.97832,0.990716
48751,nigga,0.894233,0.024584,0.564195,0.002199,0.840775,0.989783
29304,nigga ever,0.888053,0.02273,0.450727,0.002948,0.830247,0.975339
49643,nigga bitch,0.999486,0.216156,0.999667,0.002098,0.99882,0.968357
41673,I mean IS sand nigger,0.899907,0.057888,0.508628,0.002047,0.592617,0.961329
32112,really nigga,0.816685,0.023987,0.423238,0.001843,0.748751,0.955847
33906,nigger eyes,0.950371,0.024692,0.655697,0.001515,0.528238,0.955697
3537,Big nigga tree,0.792676,0.049658,0.436952,0.002541,0.548145,0.949014


* identity_hate does pretty well

# Labeled EDA

In [43]:
tmp = eng[['match', 'slot', 'time', 'text']].copy()
tmp = tmp.join(labeled_dota, on='index', how='left', lsuffix='_left', rsuffix='_right')
labeled_dota = tmp.drop(['text_right'], axis=1).rename(columns={'text_left': 'text'})
labeled_dota.head(5)

Unnamed: 0_level_0,match,slot,time,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
29,2,0,1563.1849,fast,0.065864,0.004299,0.019833,0.00109,0.012755,0.005869
31,2,8,1996.3936,idiot drow,0.995506,0.009581,0.443928,0.001985,0.968772,0.005197
32,2,2,2006.2939,idiot,0.999989,0.009006,0.935135,0.001004,0.999757,0.00752
37,2,2,2263.3697,lol,0.032396,0.001351,0.009841,0.000687,0.011788,0.002649
38,2,4,2263.9049,COMMEND ME TY,0.198288,0.008663,0.011948,0.002053,0.015666,0.005051


### Toxic

In [44]:
toxic = labeled_dota[labeled_dota['toxic'] > .75]
toxic.shape

(1295, 10)

In [45]:
toxic.drop(['match', 'slot'], axis=1).mean()

time             1400.792374
toxic               0.964194
severe_toxic        0.285543
obscene             0.784291
threat              0.051026
insult              0.607698
identity_hate       0.056842
dtype: float64

* A lot of the highly toxic comments are also highly obscene and insulting.

In [46]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter

text = ""
for i in range(len(toxic['text'].values)):
    text += toxic['text'].values[i] + " "

print("Most frequent words among the texts labeled as highly toxic")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(15))

Most frequent words among the texts labeled as highly toxic


{'fuck': 233,
 'fucking': 214,
 'wtf': 174,
 'shit': 141,
 'kill': 80,
 'idiot': 76,
 'retard': 55,
 'stupid': 39,
 'team': 32,
 'ur': 30,
 'noob': 24,
 'go': 24,
 'game': 20,
 'retarded': 19,
 'bitch': 18}

### Severely toxic

In [47]:
severe_toxic = labeled_dota[labeled_dota['severe_toxic'] > .75]
severe_toxic.shape

(212, 10)

In [48]:
severe_toxic.drop(['match', 'slot'], axis=1).mean()

time             1330.525500
toxic               0.999997
severe_toxic        0.900300
obscene             0.999995
threat              0.018364
insult              0.980062
identity_hate       0.088936
dtype: float64

* A lot of the highly severe_toxic comments are also highly toxic, obscene, and insulting.

In [49]:
text = ""
for i in range(len(severe_toxic['text'].values)):
    text += severe_toxic['text'].values[i] + " "

print("Most frequent words among the texts labeled as highly severe_toxic")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(10))

Most frequent words among the texts labeled as highly severe_toxic


{'fuck': 131,
 'fucking': 68,
 'fucker': 7,
 'ur': 6,
 'shit': 6,
 'SHUT': 4,
 'suck dick': 4,
 'mom': 3,
 'fuckin': 3,
 'stupid': 3}

### Obscene

In [50]:
obscene = labeled_dota[labeled_dota['obscene'] > .75]
obscene.shape

(966, 10)

In [51]:
obscene.drop(['match', 'slot'], axis=1).mean()

time             1388.211339
toxic               0.984781
severe_toxic        0.370228
obscene             0.957323
threat              0.011276
insult              0.676377
identity_hate       0.057528
dtype: float64

* A lot of the highly obscene comments are also highly toxic and often insulting.

In [52]:
text = ""
for i in range(len(obscene['text'].values)):
    text += obscene['text'].values[i] + " "

print("Most frequent words among the texts labeled as highly obscene")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(10))

Most frequent words among the texts labeled as highly obscene


{'fuck': 233,
 'fucking': 214,
 'wtf': 148,
 'shit': 136,
 'idiot': 41,
 'team': 26,
 'stupid': 25,
 'ur': 22,
 'cunt': 18,
 'bitch': 18}

### Threat

In [53]:
threat = labeled_dota[labeled_dota['threat'] > .75]
threat.shape

(45, 10)

In [54]:
threat.drop(['match', 'slot'], axis=1).mean()

time             1368.110500
toxic               0.938534
severe_toxic        0.076134
obscene             0.163446
threat              0.907929
insult              0.089093
identity_hate       0.026480
dtype: float64

* A lot of the highly threatening comments are also highly toxic.

In [55]:
text = ""
for i in range(len(threat['text'].values)):
    text += threat['text'].values[i] + " "

print("Most frequent words among the texts labeled as highly threatening")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(10))

Most frequent words among the texts labeled as highly threatening


{'kill kill': 20,
 'kill': 3,
 'go': 3,
 'gonna': 2,
 'death': 1,
 'kotl': 1,
 'fucking': 1,
 'die': 1,
 'unpause': 1}

### Insult

In [56]:
insult = labeled_dota[labeled_dota['insult'] > .75]
insult.shape

(647, 10)

In [57]:
insult.drop(['match', 'slot'], axis=1).mean()

time             1420.419119
toxic               0.995537
severe_toxic        0.481272
obscene             0.899505
threat              0.014336
insult              0.934621
identity_hate       0.081799
dtype: float64

* A lot of the highly threatening comments are also highly toxic and highly obscene.

In [58]:
text = ""
for i in range(len(insult['text'].values)):
    text += insult['text'].values[i] + " "

print("Most frequent words among the texts labeled as highly insulting")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(10))

Most frequent words among the texts labeled as highly insulting


{'fuck': 196,
 'fucking': 154,
 'idiot': 75,
 'shit': 44,
 'stupid': 37,
 'retard': 34,
 'bitch': 17,
 'fucker': 16,
 'ur': 16,
 'go': 15}

### Identity hate

In [59]:
identity_hate = labeled_dota[labeled_dota['identity_hate'] > .75]
identity_hate.shape

(22, 10)

In [60]:
identity_hate.drop(['match', 'slot'], axis=1).mean()

time             1139.329438
toxic               0.939388
severe_toxic        0.151493
obscene             0.668012
threat              0.005876
insult              0.773348
identity_hate       0.925457
dtype: float64

* A lot of the high identity_hate comments are also highly toxic and often insulting.

In [61]:
text = ""
for i in range(len(identity_hate['text'].values)):
    text += identity_hate['text'].values[i] + " "

print("Most frequent words among the texts labeled as high identity_hate")
freqs = WordCloud().process_text(text)
dict(Counter(freqs).most_common(7))

Most frequent words among the texts labeled as high identity_hate


{'nigga': 9,
 'nigger': 6,
 'gay': 4,
 'faggot': 3,
 'fucked': 2,
 'miss': 1,
 'shit': 1}

### Can find toxic players rather than toxic comments:

In [62]:
labeled_dota.groupby(['match', 'slot']).mean().iloc[:,1:]

Unnamed: 0_level_0,Unnamed: 1_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
match,slot,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,0,0.065864,0.004299,0.019833,0.001090,0.012755,0.005869
2,2,0.516192,0.005179,0.472488,0.000846,0.505772,0.005085
2,4,0.187266,0.009037,0.017685,0.001990,0.021334,0.004979
2,8,0.995506,0.009581,0.443928,0.001985,0.968772,0.005197
3,6,0.027828,0.003575,0.010717,0.001159,0.008650,0.002929
...,...,...,...,...,...,...,...
2424,0,0.651286,0.027215,0.136006,0.002618,0.083348,0.099994
2425,6,0.175826,0.006294,0.055320,0.001433,0.046996,0.034877
2425,9,0.007481,0.001937,0.003965,0.000887,0.005692,0.001408
2426,4,0.042616,0.003700,0.017039,0.000814,0.005284,0.003588
