In [None]:
import pandas as pd
import time
from googleapiclient import discovery
from perspective import PerspectiveAPI
import json
import numpy as np
import sys
import os
import re

In [None]:
# Read the Perspective API key from the PERSPECTIVE_API_KEY environment
# variable so the secret never has to be saved inside the notebook. An unset
# variable yields the same empty string the placeholder used to hold.
API_KEY = os.environ.get('PERSPECTIVE_API_KEY', '') #Include Perspective API KEY

# Wrapper-library client; not used by the cells visible here but kept in case
# other cells rely on it.
p = PerspectiveAPI(API_KEY)

# Discovery-based client used for every analyze call in this notebook.
client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

# One sample request to check that the key and client work before the
# full dataset is scored.
analyze_request = {
  'comment': { 'text': 'This is a sample text for toxicity' },
  'requestedAttributes': {'TOXICITY': {}, 'PROFANITY':{}}
}

response = client.comments().analyze(body=analyze_request).execute()
print(json.dumps(response, indent=2))

In [None]:
txt_check = 'Check another text'
txt_lang = 'en'

# Request template with all six attributes that the scoring loop asks for.
# `languages` is a list of language codes, so the single code is wrapped in a
# list. Building a dict literal cannot raise, hence no try/except around it.
analyze_request = {
    'comment': { 'text': txt_check },
    'requestedAttributes': {'TOXICITY': {}, 'PROFANITY':{}, 'SEVERE_TOXICITY': {},
                            'IDENTITY_ATTACK':{}, 'INSULT':{}, 'THREAT':{}},
    'languages': [txt_lang]
}


In [None]:
# Input dataset location. Both values are placeholders that must be filled in
# before the cell that loads the CSV is run.
path_location = '' #Directory of IO dataset file
file_name = '' #CSV File name

In [None]:
# Build the path with os.path.join so a directory given without a trailing
# separator still resolves to the right file.
# NOTE(review): if the tweet id columns contain missing values pandas will read
# them as float64, which cannot hold 64-bit ids exactly -- consider passing an
# explicit dtype for those columns; confirm against the dataset.
df_data = pd.read_csv(os.path.join(path_location, file_name))

In [None]:
# Language codes for which tweets are kept and later sent for scoring.
supported_lang = ['ar', 'zh', 'cs', 'nl', 'en', 'fr', 'de', 'hi', 'id', 'it', 'ja', 'ko', 'pl', 'pt', 'ru', 'es']

# Keep only tweets in a supported language. The explicit copy makes the result
# an independent frame, so the column assignments done by the later processing
# steps do not raise SettingWithCopyWarning.
df_data = df_data.loc[df_data.tweet_language.isin(supported_lang)].copy()

In [None]:
def process_data(tweet_df):
    """Add a ``tweet_type`` column classifying every tweet.

    The label is derived from which of the id columns are populated:

    * ``retweet_tweetid`` set (and no reply id)  -> ``'retweet'``
    * ``in_reply_to_tweetid`` set (no retweet id) -> ``'reply'``
    * only ``quoted_tweet_tweetid`` set           -> ``'quoted'``
    * none of them set                            -> ``'original'``

    Rows where both the retweet id and the reply id are set are ambiguous and
    receive a missing label (``None``). Previously such rows were skipped while
    building the label list, which left it shorter than the frame and made the
    column assignment fail.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain ``quoted_tweet_tweetid``, ``retweet_tweetid`` and
        ``in_reply_to_tweetid``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``tweet_df`` object, with the two id columns cast to the
        nullable ``Int64`` dtype and the new ``tweet_type`` column.
    """
    tweet_df['quoted_tweet_tweetid'] = tweet_df['quoted_tweet_tweetid'].astype('Int64')
    tweet_df['retweet_tweetid'] = tweet_df['retweet_tweetid'].astype('Int64')

    # Plain boolean arrays so the assignments below are purely positional and
    # do not depend on the frame's index.
    is_quote = tweet_df['quoted_tweet_tweetid'].notna().to_numpy()
    is_retweet = tweet_df['retweet_tweetid'].notna().to_numpy()
    is_reply = tweet_df['in_reply_to_tweetid'].notna().to_numpy()

    # Later assignments override earlier ones, which encodes the precedence
    # retweet/reply > quoted > original.
    tweet_type = np.full(len(tweet_df), 'original', dtype=object)
    tweet_type[is_quote] = 'quoted'
    tweet_type[is_reply] = 'reply'
    tweet_type[is_retweet] = 'retweet'
    tweet_type[is_retweet & is_reply] = None  # ambiguous: both retweet and reply

    tweet_df['tweet_type'] = tweet_type

    return tweet_df

In [None]:
def preprocess_text(df):
    """Strip the leading retweet marker and newlines from ``tweet_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet_text`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``tweet_text`` cleaned.
    """
    # Remove only a leading standalone "RT" token. str.lstrip('RT') treats its
    # argument as a set of characters and would also eat the first letters of
    # words such as "Trump" or "Today".
    df['tweet_text'] = df['tweet_text'].str.replace(r'^RT\b', '', regex=True)
    # Literal newline removal; regex=False keeps the behaviour the same across
    # pandas versions.
    # NOTE(review): this joins the words on either side of a line break --
    # confirm whether a space would be the better replacement.
    df['tweet_text'] = df['tweet_text'].str.replace("\n", '', regex=False)

    return df


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only dedicated emoji/symbol blocks are matched. The previous pattern used
    the range U+24C2-U+1F251, which also covers CJK ideographs, kana and
    Hangul, so Chinese, Japanese and Korean text was deleted entirely.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols and pictographs
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F900-\U0001FAFF"  # supplemental / extended pictographs
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # emoji variation selector
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', string)



#Message Clean Function
def msg_clean(msg):
    """Replace URLs, @mentions and HTML tags with a space, then drop emoji.

    Parameters
    ----------
    msg : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Replacements insert single spaces, so the result may
        contain runs of whitespace.
    """
    msg = re.sub(r'https?://\S+|www\.\S+', " ", msg)
    msg = re.sub(r'@\w+',' ',msg)
    # The raw-string prefix belongs outside the quotes; 'r<.*?>' only matched
    # a literal "r<...>" and left real HTML tags in place.
    msg = re.sub(r'<.*?>',' ', msg)
    msg = remove_emoji(msg)
    return msg


In [None]:
# Label tweet types, then strip the retweet marker and newlines.
df_data = preprocess_text(process_data(df_data))

# Cleaned copy of the text (URLs, mentions, tags and emoji removed) that is
# sent for scoring; values are cast to str first so every entry is a string.
df_data['updated_tweet_text'] = df_data['tweet_text'].astype(str).map(msg_clean)

# Renumber rows 0..n-1 after the earlier filtering.
df_data = df_data.reset_index(drop=True)

In [None]:
# One accumulator per scored attribute plus one for the tweet ids. The
# generator produces a separate empty list for every name.
(res_Toxic, res_SevereToxic, res_Profanity,
 res_IDAttack, res_Insult, res_Threat, tid2) = ([] for _ in range(7))

In [None]:
# Attributes requested for every tweet, each paired (by position) with the list
# that accumulates its scores.
pers_attributes = ['TOXICITY', 'SEVERE_TOXICITY', 'PROFANITY',
                   'IDENTITY_ATTACK', 'INSULT', 'THREAT']
pers_score_lists = [res_Toxic, res_SevereToxic, res_Profanity,
                    res_IDAttack, res_Insult, res_Threat]
pers_output_file = '' #Store the File
pers_failed = 0  # number of tweets whose request or response parsing failed


def _build_pers_frame():
    """Assemble the ids and scores collected so far into one DataFrame."""
    return pd.DataFrame({'Tweet_ID': tid2, 'Toxic':res_Toxic, 'Severe_Toxic': res_SevereToxic,
                         'Profanity': res_Profanity, 'Identity_Attack': res_IDAttack,
                         'Insult': res_Insult, 'Threat': res_Threat})


# Iterate over the three needed columns directly instead of materialising a
# row object with iloc several times per tweet.
for i, (tweet_id, txt_check, txt_lang) in enumerate(
        zip(df_data['tweetid'], df_data['updated_tweet_text'], df_data['tweet_language'])):

    analyze_request = {
        'comment': { 'text': txt_check },
        'requestedAttributes': {name: {} for name in pers_attributes},
        'languages': [txt_lang]
    }

    try:
        response = client.comments().analyze(body=analyze_request).execute()
        attribute_scores = response['attributeScores']
        # Extract all six values before touching the accumulators, so a
        # failure on any attribute cannot leave the lists with different
        # lengths.
        scores = [attribute_scores[name]['spanScores'][0]['score']['value']
                  for name in pers_attributes]
    except Exception:
        # Best effort: record -1 for every attribute and keep going.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
        scores = [-1] * len(pers_attributes)
        pers_failed += 1

    for score_list, score in zip(pers_score_lists, scores):
        score_list.append(score)
    tid2.append(tweet_id)

    # Short pause between requests.
    time.sleep(0.05)

    # Periodic checkpoint so a long run can be resumed/inspected.
    if i%50000 == 1:
        print(i)
        df_pers2 = _build_pers_frame()
        df_pers2.to_csv(pers_output_file, index = False) #Store the File


df_pers2 = _build_pers_frame()

df_pers2.to_csv(pers_output_file, index = False) #Store the File

# Make failures visible: an invalid key or quota problem would otherwise show
# up only as a column full of -1 values.
if pers_failed:
    print('Requests that failed and were recorded as -1:', pers_failed)