In [None]:
pip install detoxify

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting detoxify
  Downloading detoxify-0.5.1-py3-none-any.whl (12 kB)
Collecting sentencepiece>=0.1.94
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.22.1
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl 

In [None]:
from detoxify import Detoxify
from tqdm import tqdm
from textblob import TextBlob
import torch
import os
import pandas as pd
import json
import math

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
def main():
    """Runs the whole pipeline: load the model, score every imported file, save it.

    Every (data, fname) pair produced by import_data() is passed through
    process_data() and the result is written with save_data().
    """
    # Model to use: https://github.com/unitaryai/detoxify
    #   "unbiased"     -> several toxicity columns
    #   "multilingual" -> the single toxicity score we use for everything
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    model = Detoxify("unbiased", device=device)

    # Also compute a sentiment column for each text column.
    process_sentiment = True

    # Output format, chosen by 1-based position in output_formats (3 -> 'xlsx').
    output_formats = ['json', 'csv', 'xlsx']
    select_output_format = 3
    save_format = output_formats[select_output_format - 1]

    # Names of the columns that hold the text to score.
    text_columns = ["caption"]

    # Processing
    for data, fname in import_data():
        scored = process_data(data, text_columns, model, process_sentiment)
        save_data(scored, fname, save_format)


In [None]:
def process_data(data:list, text_columns:list, model:Detoxify, process_sentiment:bool=False) ->list:
    """Computes the toxicity scores for a batch of data.

    For every column in ``text_columns`` the text of each row is cleaned with
    clean_text(), scored with get_scores(), and the scores are written back
    into the same row dictionary under ``text_column + "_<score name>"``
    (for example ``caption_toxicity``). The rows are modified in place.

    Args:
        data (list): list of dictionaries, one per row
        text_columns (list): the keys of the text columns you want to score
        model (Detoxify): the detoxify model you wish to use
        process_sentiment (bool, optional): also add ``text_column + "_sentiment"``.
            Defaults to False.
    Returns:
        list: the same ``data`` object, with the score columns added to each row
    """
    # (result key, column suffix) pairs, in output-column order. Both the short
    # names ('toxic', 'severe_toxic', 'identity_hate') and the long names
    # ('toxicity', 'severe_toxicity', 'identity_attack') are listed so either
    # naming scheme produces the same columns as before. Only keys actually
    # present in a result are written, so a model that reports
    # 'severe_toxicity' without 'sexual_explicit' no longer raises KeyError.
    score_columns = (
        ('toxic', "_toxicity"),
        ('toxicity', "_toxicity"),
        ('severe_toxic', "_severe_toxic"),
        ('severe_toxicity', "_severe_toxicity"),
        ('obscene', "_obscene"),
        ('threat', "_threat"),
        ('insult', "_insult"),
        ('identity_hate', "_identity_hate"),
        ('identity_attack', "_identity_attack"),
        ('sexual_explicit', "_sexual_explicit"),
    )
    for text_column in text_columns:
        # Cleaning text; rows with no usable text are scored as the empty string
        texts = [clean_text(row[text_column]) or '' for row in data]
        results = get_scores(texts, model, process_sentiment)
        # Updating data with toxicity scores
        for row, scores in zip(data, results):
            for key, suffix in score_columns:
                if key in scores:
                    row[text_column + suffix] = format_result(scores[key])
            if process_sentiment:
                row[text_column + "_sentiment"] = format_result(scores['sentiment'])
    return data

In [None]:
def get_scores(text, model, process_sentiment:bool=False):
    """Scores one text or a batch of texts with ``model.predict``.

    Args:
        text (str | list | tuple): a single text, or a sequence of texts.
        model: object exposing ``predict``; for a batch, ``predict`` is expected
            to return a dict mapping each score name to a per-text sequence.
        process_sentiment (bool, optional): also add a 'sentiment' entry,
            computed by get_sentiment(), to every result. Defaults to False.
    Returns:
        dict | list: for a single string, whatever ``model.predict`` returned
        (plus 'sentiment' if requested); for a sequence, a list with one
        ``{score name: value}`` dict per input text, in input order.
    Raises:
        TypeError: if ``text`` is neither a string nor a list/tuple. (This used
            to surface as an UnboundLocalError from the final return.)
    """
    with torch.no_grad():
        if isinstance(text, str):
            scores = model.predict(text)
            if process_sentiment:
                scores['sentiment'] = get_sentiment(text)
            return scores

        if not isinstance(text, (list, tuple)):
            raise TypeError(
                f"text must be a str or a list of str, got {type(text).__name__}"
            )

        results = []
        if not text:
            # Nothing to score; skip the model call and the progress bar.
            return results

        texts = list(text)
        chunk_size = 100  # texts sent to the model per predict() call
        total_chunks = math.ceil(len(texts) / chunk_size)
        for chunk in tqdm(chunk_lst(texts, chunk_size), desc="Processing Scores", total=total_chunks):
            batch_scores = model.predict(chunk)
            # Turn {name: [v0, v1, ...]} into [{name: v0}, {name: v1}, ...]
            rows = [dict(zip(batch_scores, values)) for values in zip(*batch_scores.values())]
            if process_sentiment:
                for row, chunk_text in zip(rows, chunk):
                    row['sentiment'] = get_sentiment(chunk_text)
            results.extend(rows)
        return results

In [None]:
def get_sentiment(text):
    """Returns the TextBlob polarity of ``text`` rounded to 6 decimals.

    Args:
        text: the text to analyse
    Returns:
        The rounded polarity, or None when ``text`` is empty/falsy.
    """
    if text:
        polarity = TextBlob(text).sentiment.polarity
        return round(polarity, 6)
    return None

def chunk_lst(lst:list, items_per_chunk:int):
    """Splits a list into consecutive slices.

    Args:
        lst ([list]): List to chunk
        items_per_chunk ([int]): Maximum number of items in each chunk
    Yields:
        [list]: successive slices of lst holding 'items_per_chunk' items;
        the last one may be shorter
    """
    starts = range(0, len(lst), items_per_chunk)
    yield from (lst[begin:begin + items_per_chunk] for begin in starts)

def format_result(num):
    """Rounds a score to 6 decimals, snapping negligible values to 0.

    Args:
        num: a number (anything ``float()`` accepts) or None
    Returns:
        None if ``num`` is None; 0 if the magnitude of ``num`` is below 1e-4;
        otherwise ``num`` as a float rounded to 6 decimal places.
    """
    if num is None:
        return num
    value = float(num)
    # Compare the magnitude, not the signed value: this helper is also used for
    # the sentiment column, and a signed comparison turned every negative
    # score into 0.
    return round(value, 6) if abs(value) >= 1e-4 else 0

def clean_text(text:str):
    """Normalises a raw text cell before scoring.

    Byte strings are decoded as UTF-8, whitespace-separated tokens containing
    '@' or '.com' (emails and ***.com urls) are dropped, and runs of whitespace
    are collapsed to single spaces.

    Args:
        text: the raw cell value; usually a str, possibly bytes, a float
            (e.g. NaN for a missing cell) or None
    Returns:
        The cleaned string, or None when the value is not text or nothing is
        left after cleaning.
    """
    # Decode first: calling ''.join() on the pieces of a bytes value raised
    # TypeError before the decode was ever reached.
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='replace')
    # Floats were already treated as "no text"; do the same for any other
    # non-text value instead of failing on .split().
    if not isinstance(text, str):
        return None
    #Removing emails + ***.com urls; split()/join also collapses multiple spaces
    kept = [item for item in text.split() if '@' not in item and '.com' not in item]
    return ' '.join(kept) or None

In [None]:
def import_data(import_folder:str = "Import"):
    """Walks through the import folder and yields a list of dicts for each file.
    Will only process files whose name contains '.json', '.xlsx' or '.csv'
    (checked in that order).
    Args:
        import_folder (str, optional): The folder holding the data. Defaults to "Import".
    Yields:
        [tuple]: (rows, fname) where rows is a list of dictionaries, one per
        row of the file, and fname is the file's name without its directory
    """
    supported = ('.json', '.xlsx', '.csv')
    for root, _, fnames in os.walk(import_folder):
        # Only supported files are counted and iterated, so the progress bar
        # total and the number of update() calls always agree.
        wanted = [fname for fname in fnames if any(ext in fname for ext in supported)]
        with tqdm(total=len(wanted)) as pbar:
            for fname in wanted:
                pbar.set_description(fname.replace('.json',''))
                # Join with the directory being walked, not import_folder, so
                # files inside sub-folders can actually be opened.
                path = os.path.join(root, fname)
                if '.json' in fname:
                    df = pd.read_json(path)
                elif '.xlsx' in fname:
                    df = pd.read_excel(path, engine='openpyxl')
                else:
                    df = pd.read_csv(path)
                # Materialise a real list: a dict_values view cannot be
                # serialised by json.dump.
                data = list(df.T.to_dict().values())
                del df
                yield data, fname
                pbar.update(1)

In [None]:
def save_data(data:list, fname:str, save_format:str, loc="Export"):
    """Will save data as xlsx, json, or csv format
    Args:
        data (list): data object. Usually a list of dictionaries
        fname (str): the file name; a trailing file extension, if any, is dropped
        save_format (str): either xlsx, json, or csv
        loc (str, optional): Save location, resolved against the current
            working directory and created if missing. Defaults to "Export".
    Raises:
        ValueError: If you did not pick a save format between xlsx, json, or csv
    Returns:
        [str]: the file name that was used, without directory or extension.
    """
    # Validate first so a bad format never creates the export folder.
    if save_format not in ('xlsx', 'csv', 'json'):
        raise ValueError("The format you selected so not one of the availabe. \nPlease select an save_format of json | csv | xlsx")

    # Strip only the last extension: splitting on the first '.' mapped both
    # "report.jan.csv" and "report.feb.csv" to "report", so one overwrote the other.
    fname = os.path.splitext(fname)[0]
    out_dir = os.path.join(os.getcwd(), loc)
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, fname + "." + save_format)

    if save_format == 'xlsx':
        df = pd.DataFrame(data)
        # Workbook options go through engine_kwargs; the old `options=` keyword
        # of ExcelWriter and the `encoding=` keyword of to_excel were removed in
        # pandas 2.0.
        with pd.ExcelWriter(path, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
            df.to_excel(writer, header=True, index=False, na_rep='None')
        del df
    elif save_format == 'csv':
        df = pd.DataFrame(data)
        df.to_csv(path, header=True, mode='w', index=False, encoding='utf-8', date_format='%Y-%m-%d %H:%M:%S')
        del df
    else:
        # json.dump cannot serialise views/generators (e.g. dict_values).
        payload = data if isinstance(data, (list, tuple, dict)) else list(data)
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(payload, fp)
    return fname


# Script-style entry point: run the whole pipeline via main() when this code is
# executed directly (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.3-alpha/toxic_debiased-c7548aa0.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_debiased-c7548aa0.ckpt


  0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]