Read the dataset

In [7]:
import pandas as pd
import numpy as np

comments = pd.read_csv("jigsaw-toxic-severity-rating/comments_to_score.csv")

In [4]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7537 entries, 0 to 7536
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  7537 non-null   int64 
 1   text        7537 non-null   object
dtypes: int64(1), object(1)
memory usage: 117.9+ KB


In [8]:
comment_id = comments['comment_id'].tolist()
text = comments['text'].tolist()

In [10]:
len(text)

7537

Apply VADER to the text

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [12]:
# to tokensze sentences
# sentences = []
# from nltk import tokenize
# for t in text:
#     lines_list = tokenize.sent_tokenize(t)
#     sentences.extend(lines_list)

In [13]:
# len(sentences)

36975

In [25]:
scores = []
sid = SentimentIntensityAnalyzer()
for t in text:
    ss = sid.polarity_scores(t)
    scores.append(ss['compound'])

In [26]:
len(scores)

7537

In [29]:
df = pd.DataFrame({'comment_id':comment_id,'score': scores})

In [31]:
#saving the test file
df.to_csv("submission_test.csv", index=False)

## Validating the model

In [33]:
comments['score'] = scores

In [35]:
#this is the final dataset that we have with the VADER compound score
comments

Unnamed: 0,comment_id,text,score
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",-0.4980
1,732895,"Looks like be have an abuser , can you please ...",0.5537
2,1139051,I confess to having complete (and apparently b...,-0.3404
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",-0.7815
4,2084821,It is not just you. This is a laundry list of ...,-0.5267
...,...,...,...
7532,504235362,"Go away, you annoying vandal.",-0.4019
7533,504235566,This user is a vandal.,0.0000
7534,504308177,""" \n\nSorry to sound like a pain, but one by f...",-0.9354
7535,504570375,Well it's pretty fucking irrelevant now I'm un...,0.6800


In [16]:
#adding the validation dataset
validation_data = pd.read_csv("jigsaw-toxic-severity-rating/validation_data.csv")

In [17]:
score1=[]
score2=[]
results=[]
sid = SentimentIntensityAnalyzer()
for index, row in validation_data.iterrows():
    #comment1
    comment1 = row['less_toxic']
    ss1 = sid.polarity_scores(comment1)
    scr1 = ss1['compound']
    score1.append(scr1)

    #comment1
    comment2 = row['more_toxic']
    ss2 = sid.polarity_scores(comment2)
    scr2 = ss2['compound']
    score2.append(scr2)
    
    #append the result outcome, 1 if our toxicity match with rankers else 0
    if scr1>scr2:
        result = 1
    else:
        result = 0
    results.append(result)

In [18]:
len(validation_data)

30108

In [19]:
validation_data["score1"] = score1
validation_data["score2"] = score2
validation_data["result"] = results

In [20]:
validation_data

Unnamed: 0,worker,less_toxic,more_toxic,score1,score2,result
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,0.5719,0.8053,0
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,0.8020,-0.3134,1
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",-0.7906,-0.4882,0
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,-0.8517,-0.7845,0
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",0.0000,-0.3182,1
...,...,...,...,...,...,...
30103,461,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,-0.8223,0.0000,0
30104,527,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,-0.8223,0.0000,0
30105,352,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,0.0000,-0.4019,1
30106,311,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,0.0000,-0.4019,1


In [21]:
validation_data['result'].value_counts(normalize=True)

1    0.56945
0    0.43055
Name: result, dtype: float64

#### Accuracy is only 56.945% when we directly use VADER

#### Let's try it again by tokenizing the sentences and averaging the score

### VADER tuning by tokenizing sentences in each comment

In [22]:
import pandas as pd
import numpy as np

comments = pd.read_csv("jigsaw-toxic-severity-rating/comments_to_score.csv")

In [6]:
comment_id = comments['comment_id'].tolist()
text = comments['text'].tolist()

In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
sid = SentimentIntensityAnalyzer()

scores = []

for t in text:
    line_score=[]
    lines_list = tokenize.sent_tokenize(t)
    for l in lines_list:
        ss = sid.polarity_scores(t)
        line_score.append(ss['compound'])
    avg_score = sum(line_score)/len(line_score)
    scores.append(avg_score)

In [29]:
len(scores)

7537

In [30]:
df = pd.DataFrame({'comment_id':comment_id,'score': scores})

In [31]:
#saving the submission file
df.to_csv("submission_test2.csv", index=False)

### Validate the model with the 2nd VADER method by tokenizing sentences

In [33]:
#adding the validation dataset
val_data = pd.read_csv("jigsaw-toxic-severity-rating/validation_data.csv")

In [34]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

score1_vader2=[]
score2_vader2=[]
results_vader2=[]
sid = SentimentIntensityAnalyzer()
for index, row in val_data.iterrows():
    
    #comment1
    line1_score=[]
    comment1 = row['less_toxic']
    lines1 = tokenize.sent_tokenize(comment1)
    for l in lines1:
        ss = sid.polarity_scores(l)
        line1_score.append(ss['compound'])
    avg_score1 = sum(line1_score)/len(line1_score)
    score1_vader2.append(avg_score)

    #comment2
    line2_score=[]
    comment2 = row['more_toxic']
    lines2 = tokenize.sent_tokenize(comment2)
    for l in lines2:
        ss = sid.polarity_scores(l)
        line2_score.append(ss['compound'])
    avg_score2 = sum(line2_score)/len(line2_score)
    score2_vader2.append(avg_score2)
    
    #append the result outcome, 1 if our toxicity match with rankers else 0
    if avg_score1>avg_score2:
        result = 1
    else:
        result = 0
    results_vader2.append(result)

In [37]:
len(val_data)

30108

In [38]:
val_data["score1"] = score1_vader2
val_data["score2"] = score2_vader2
val_data["result"] = results_vader2

In [39]:
val_data.head()

Unnamed: 0,worker,less_toxic,more_toxic,score1,score2,result
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,0.34,0.249833,1
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,0.34,-0.01845,1
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",0.34,-0.073043,0
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0.34,-0.39225,1
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",0.34,-0.3182,1


In [40]:
val_data['result'].value_counts(normalize=True)

1    0.583533
0    0.416467
Name: result, dtype: float64

#### Accuracy is 58.35 after tokenizing sentences so it's not improved significantly. Since VADER model is primarily to guage text sentiment, it may not be a good method for analyzing toxicity