# Moderation API

Moderation API는 콘텐츠가 OpenAI의 사용 정책을 준수하는지 확인하는 데 사용할 수 있는 도구입니다. 이를 통해 개발자는 사용 정책에서 금지하는 콘텐츠를 식별하고 필터링 등의 조치를 취할 수 있습니다.

In [1]:
from openai import OpenAI
# Shared API client used by every moderation call in the cells below.
# NOTE(review): presumably picks up credentials from the environment, since
# none are passed here -- confirm against the openai library docs.
client = OpenAI()

In [2]:
# Send a single sample string to the moderation endpoint.
response = client.moderations.create(input="Sample text goes here.")

# Keep the first entry of `results` for inspection in the following cells.
output = response.results[0]

In [3]:
# Display the raw Moderation result object (categories and category_scores).
output

Moderation(categories=Categories(harassment=False, harassment_threatening=False, hate=False, hate_threatening=False, illicit=None, illicit_violent=None, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=False, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=False), category_applied_input_types=None, category_scores=CategoryScores(harassment=2.1957972421660088e-05, harassment_threatening=7.920744792500045e-06, hate=3.3822299883468077e-05, hate_threatening=4.28711395272785e-08, illicit=None, illicit_violent=None, self_harm=8.482592761538399e-07, self_harm_instructions=1.33808413238512e-07, self_harm_intent=1.859364004985764e-07, sexual=5.259300451143645e-05, sexual_minors=3.420051871216856e-05, violence=0.0007651916821487248, violence_graphic=3.136725354124792e-05, self-harm=8.4825927615

In [4]:
# Convert the result object into a plain nested dict for easier inspection.
output.model_dump()

{'categories': {'harassment': False,
  'harassment_threatening': False,
  'hate': False,
  'hate_threatening': False,
  'illicit': None,
  'illicit_violent': None,
  'self_harm': False,
  'self_harm_instructions': False,
  'self_harm_intent': False,
  'sexual': False,
  'sexual_minors': False,
  'violence': False,
  'violence_graphic': False,
  'self-harm': False,
  'sexual/minors': False,
  'hate/threatening': False,
  'violence/graphic': False,
  'self-harm/intent': False,
  'self-harm/instructions': False,
  'harassment/threatening': False},
 'category_applied_input_types': None,
 'category_scores': {'harassment': 2.1957972421660088e-05,
  'harassment_threatening': 7.920744792500045e-06,
  'hate': 3.3822299883468077e-05,
  'hate_threatening': 4.28711395272785e-08,
  'illicit': None,
  'illicit_violent': None,
  'self_harm': 8.482592761538399e-07,
  'self_harm_instructions': 1.33808413238512e-07,
  'self_harm_intent': 1.859364004985764e-07,
  'sexual': 5.259300451143645e-05,
  'sexua

In [5]:
def analysis(text):
    """Return the moderation categories that were flagged for *text*.

    Sends *text* to the moderation endpoint through the module-level
    ``client``, converts the first result to a dict, and collects every
    category whose flag is truthy together with its score.

    Args:
        text: The input passed as ``input`` to ``client.moderations.create``.

    Returns:
        A list of ``(category_name, score)`` tuples, in the order the
        categories appear in the response. The list is empty when nothing
        is flagged. The response exposes some categories under two spellings
        (e.g. ``harassment_threatening`` and ``harassment/threatening``), so
        a flagged category may appear once per spelling.
    """
    response = client.moderations.create(input=text)
    # Only the first result is inspected.
    output_dict = response.results[0].model_dump()
    scores = output_dict['category_scores']
    # A flag may be True, False or None; None (category not evaluated) is
    # skipped the same way as False.
    return [
        (category, scores[category])
        for category, flagged in output_dict['categories'].items()
        if flagged
    ]

In [6]:
# Example: a threatening sentence; the flagged categories and scores follow.
analysis("I'll kill you")

[('harassment_threatening', 0.397981196641922),
 ('violence', 0.9980654120445251),
 ('harassment/threatening', 0.397981196641922)]

In [7]:
# Example: a hateful sentence; the flagged categories and scores follow.
analysis("I hate asian")

[('harassment', 0.9252092838287354), ('hate', 0.7725232839584351)]