# ☣️ Jigsaw 


Using data from [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

I have created a dataset for this data. It's public here :
* [jigsaw-toxic-comment-classification-challenge](https://www.kaggle.com/julian3833/jigsaw-toxic-comment-classification-challenge)


# Please, _DO_ upvote!

Related Datasets:

https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification

https://www.kaggle.com/ashwiniyer176/toxic-tweets-dataset

https://www.kaggle.com/manishguptads/wikipedia-toxicity

https://www.kaggle.com/surekharamireddy/malignant-comment-classification

https://www.kaggle.com/shobhitupadhyaya/jigsaw-training-dataset

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from scipy.spatial import distance
from scipy.sparse.linalg import norm
from scipy.sparse import csr_matrix

In [2]:
def clean(data, col):

    # Clean some punctutations
    data[col] = data[col].str.replace('\n', ' ')
    # Remove ip address
    data[col] = data[col].str.replace(r'(([0-9]+\.){2,}[0-9]+)',' ')
    
    data[col] = data[col].str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3')
    # Replace repeating characters more than 3 times to length of 3
    data[col] = data[col].str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')
    # patterns with repeating characters 
    data[col] = data[col].str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1')
    data[col] = data[col].str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1')
    data[col] = data[col].str.replace(r'[ ]{2,}',' ').str.strip()   
    # Add space around repeating characters
    #data[col] = data[col].str.replace(r'([*!?\']+)',r' \1 ')    
    
    # Lowercase chars
    data[col] = data[col].str.lower()
    
    # Remove punctuation
    #data[col] = data[col].str.replace('[^\w\s]','')
    
    return data

In [3]:
df_orig = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_orig = pd.concat([ df_orig.drop(columns=['more_toxic']).rename(columns={'less_toxic':'text'}),
                     df_orig.drop(columns=['less_toxic']).rename(columns={'more_toxic':'text'}) ]).reset_index(drop=True)
df_orig = clean(df_orig, "text")
#df_orig.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [4]:
df1 = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df1['y'] = (df1[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0 ).astype(int)
df1 = df1[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df1 = clean(df1, "text")
# Drop comun comments from validation data
df1 = df1[ ~df1['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df1.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


# TF-IDF with Cosine Similarity

In [5]:
def TF_ID_cosine_distance(orig,df):
    len_orig=len(orig.values)
    join = np.concatenate((orig.values, df.values), axis=0)
    vec = TfidfVectorizer()
    X_vec = vec.fit_transform(join)
    X_orig = X_vec[:len_orig,:]
    X_df = X_vec[len_orig:,:]
    idx, cos_sim = calc_cosine_argmax(X_orig,X_df)
    return X_orig, X_df, idx, cos_sim

In [6]:
def calc_cosine_argmax(X_orig,X_df):
    idx = np.empty((0,1), int)
    cos_sim = np.empty((0,1), int)
    for i in range(0, int(X_orig.shape[0] / 100)):
        start = i * 100
        stop = i * 100 + 100
        #print(start)
        cosine = X_orig[start:stop].dot(X_df.T) #/(norm(X_orig[start:stop]) * norm(X_df)) * 100
        cos_aux = csr_matrix.max(cosine,axis=1)
        idx_aux = csr_matrix.argmax(cosine,axis=1)
        X_df = delete_rows_csr(X_df,idx_aux)
        idx = np.vstack((idx,idx_aux))
        cos_sim = np.vstack((cos_sim,cos_aux.toarray()))
        
        if X_df.shape[0] == 0:
            break
            
    return idx, cos_sim

In [7]:
def delete_rows_csr(mat, indices):
    """
    Remove the rows denoted by ``indices`` form the CSR sparse matrix ``mat``.
    """
    if not isinstance(mat, scipy.sparse.csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    indices = list(indices)
    mask = np.ones(mat.shape[0], dtype=bool)
    mask[indices] = False
    return mat[mask]

In [8]:
X_orig, X_df1, idx1, cos_sim1 = TF_ID_cosine_distance(df_orig['text'],df1['text'])

In [9]:
print(len(np.argwhere(cos_sim1 ==1)))
print(cos_sim1.mean())
print(np.median(cos_sim1))
print(len(np.unique(idx1, axis=0)))

38
0.32108676466532066
0.2894809359489255
27702


In [10]:
def see_results(id):
    print(df_orig['text'][id])
    print(df1['text'][int(idx1[id])])

see_results(136)

against community and you don't know what devils are in commmunity jimbo wales, mimick wikinfo, and purge devils from here. muzemike damned for being enemy of god: http://ww . wikinfo.org / index.php / category:primordiality i was banned for this: http://ww . wikinfo.org / index.php / category:primordiality and i am now bri for this: http://ww . wikinfo.org / index.php / category:primordiality and this: http://ww . wikinfo.org / index.php / category:primordiality came from god. so be damned, muzemike, daemon in human skin.
that would be a good idea, this way i could check to see if anon is placing more cw material into the article. but i would have to insist that all current info be erased due to all the cw violations. 16:44, jan 30, 2005 (utc)


In [11]:
df2 = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
df2['y'] = (df2["target"] >= 0.5 ).astype(int)

#Keep only toxic comments
df2 = df2[df2['y'] == 1]

df2 = df2[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df2 = clean(df2, "text")
# Drop comun comments from validation data
df2 = df2[ ~df2['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df2.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [12]:
X_orig, X_df2, idx2, cos_sim2 = TF_ID_cosine_distance(df_orig['text'],df2['text'])

In [13]:
print(len(np.argwhere(cos_sim2 ==1)))
print(cos_sim2.mean())
print(np.median(cos_sim2))
print(len(np.unique(idx2, axis=0)))

4
0.2502857771843841
0.23361670353724456
27716


In [14]:
def see_results(id):
    print(df_orig['text'][id])
    print(df2['text'][int(idx2[id])])

see_results(100)

i'm jim,a retarded idiot .
it's good to know the ira was muslim too. or the ltte from sri lanka. or the khalistani terrorists from punjab, who killed the most number of canadians. the flq from quebec was muslim too. that is why they wanted quebec libre with sharia law. at one time, western governments had the south african anc and nelson mandela listed as terrorists. i guess they were muslim too?


In [15]:
df3 = pd.read_csv("../input/toxic-tweets-dataset/FinalBalancedDataset.csv")
df3 = df3[['tweet', 'Toxicity']].rename(columns={'tweet': 'text', 'Toxicity' : 'y'})
df3 = clean(df3, "text")
# Drop comun comments from validation data
df3 = df3[ ~df3['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df3.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [16]:
X_orig, X_df3, idx3, cos_sim3 = TF_ID_cosine_distance(df_orig['text'],df3['text'])

In [17]:
print(len(np.argwhere(cos_sim3 ==1)))
print(cos_sim3.mean())
print(np.median(cos_sim3))
print(len(np.unique(idx3, axis=0)))

1
0.1918983196003912
0.1779149248451155
22605


In [18]:
df4 = pd.read_csv("../input/wikipedia-toxicity/train.csv")
df4 = df4[['comment_text', 'toxic']].rename(columns={'comment_text': 'text', 'toxic' : 'y'})
df4 = clean(df4, "text")
# Drop comun comments from validation data
df4 = df4[ ~df4['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df4.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [19]:
X_orig, X_df4, idx4, cos_sim4 = TF_ID_cosine_distance(df_orig['text'],df4['text'])

In [20]:
df4.shape

(4810, 2)

In [21]:
print(len(np.argwhere(cos_sim4 ==1)))
print(cos_sim4.mean())
print(np.median(cos_sim4))
print(len(np.unique(idx4, axis=0)))

19
0.15529737035574084
0.15012746182855247
2382


In [22]:
df5 = pd.read_csv("../input/malignant-comment-classification/train.csv")
df5 = df5[['comment_text', 'malignant']].rename(columns={'comment_text': 'text', 'malignant' : 'y'})
df5 = clean(df5, "text")
# Drop comun comments from validation data
df5 = df5[ ~df5['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df5.sample(20)

  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [23]:
X_orig, X_df5, idx5, cos_sim5 = TF_ID_cosine_distance(df_orig['text'],df5['text'])

In [24]:
print(len(np.argwhere(cos_sim5 ==1)))
print(cos_sim5.mean())
print(np.median(cos_sim5))
print(len(np.unique(idx5, axis=0)))

42
0.3214220559941172
0.28954738314559375
27737


In [25]:
df6 = pd.read_csv("../input/jigsaw-training-dataset/jigsaw_training_data.csv")
df6['y'] = np.where(df6['target']=='toxic',1,0)
df6 = df6[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

#Subsample
df6_aux1 = df6[df6['y'] ==1]
df6_aux2 = df6[df6['y'] ==0].sample(150000, random_state=62)
df6 = pd.concat([df6_aux1, df6_aux2])

df6 = clean(df6, "text")
# Drop comun comments from validation data
df6 = df6[ ~df6['text'].isin(df_orig['text']) ].reset_index(drop=True)
#df6.sample(20)

  exec(code_obj, self.user_global_ns, self.user_ns)
  
  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  del sys.path[0]
  


In [26]:
X_orig, X_df6, idx6, cos_sim6 = TF_ID_cosine_distance(df_orig['text'],df6['text'])

In [27]:
print(len(np.argwhere(cos_sim6 ==1)))
print(cos_sim6.mean())
print(np.median(cos_sim6))
print(len(np.unique(idx6, axis=0)))

38
0.32609465446912295
0.29670190807167124
28506


In [28]:
def append_to_final_data (final_data, df, idx):
    df = df.loc[np.unique(idx, axis=0).flatten().tolist(),:].reset_index(drop=True)
    final_data = pd.concat([final_data, df])
    return final_data.reset_index(drop=True)

In [29]:
final_data=pd.DataFrame()
df_sets = [df1,df2,df3,df4,df5,df6]
idx_sets = [idx1,idx2,idx3,idx4,idx5,idx6]
for i in range(0,6):
    print(i + 1)
    final_data = append_to_final_data(final_data=final_data, df=df_sets[i], idx=idx_sets[i])

1
2
3
4
5
6


In [30]:
final_data.to_csv("./final_data.csv",index=False)

In [31]:
final_data.shape

(136648, 2)

In [32]:
final_data.sample(20)

Unnamed: 0,text,y
106454,great. 70 supports and only 1 oppose or someth...,0
57449,my #hea goes out to the #innocent &amp; #famil...,0
20742,""" image copyright problem with image:malhereux...",0
101826,"both entries are on the same subject, they sho...",0
114020,your statement is narcissistic. this kids not ...,1
61702,it's almost friday! krissyskornerwithpremier@user,0
45234,"""terry thompson did not intend to cause hernan...",1
40571,you are right . because women aren't strong en...,1
121100,when the biggest turd on the liberal swamp say...,1
44065,i always thought wasilla was inhabited by lose...,1
