# 1. Load Libraries

In [33]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import hf_hub_download
import json
import onnxruntime as rt

In [34]:
reddit_df = pd.read_csv('../data/combined_cleaned_500k.csv',  lineterminator='\n', encoding='utf8')

In [35]:
reddit_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r
0,i think most singaporeans dont give a damn who taiwan belong to,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interested_to_know_why_so_many/fn3gbrg/,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
1,fair point the secrecy aspect of it slipped my mind,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_table_bill_to_ensure_safe_general/fmau5k3/,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defense_imprest/fhp05xc/,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
4,boo boo poor u lmao,2020-10-31 13:52:12,pirorok,/r/singapore/comments/jl6abo/rsingapore_random_discussion_and_small_questions/gap4vkl/,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"


# 2. Test on the first 5 rows of data

In [36]:
small_reddit_df = reddit_df[0:5]

In [37]:
small_reddit_df

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r
0,i think most singaporeans dont give a damn who taiwan belong to,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interested_to_know_why_so_many/fn3gbrg/,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
1,fair point the secrecy aspect of it slipped my mind,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_table_bill_to_ensure_safe_general/fmau5k3/,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defense_imprest/fhp05xc/,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
4,boo boo poor u lmao,2020-10-31 13:52:12,pirorok,/r/singapore/comments/jl6abo/rsingapore_random_discussion_and_small_questions/gap4vkl/,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"


# 3. Model Text Classification

In [38]:
# Download model config
repo_path = "govtech/lionguard-v1"
config_path = hf_hub_download(repo_id=repo_path, filename="config.json")
with open(config_path, 'r') as f:
    config = json.load(f)

In [39]:
#print(config)

## Embedding Function (Tokenise, Model Configuration and Embeds text data)

In [40]:
def get_embeddings(device, data):
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config['embedding']['tokenizer'])
    model = AutoModel.from_pretrained(config['embedding']['model'])
    model.eval()
    model.to(device)

    # Generate the embeddings
    batch_size = config['embedding']['batch_size']
    num_batches = int(np.ceil(len(data)/batch_size))
    output = []
    for i in range(num_batches):
        sentences = data[i*batch_size:(i+1)*batch_size]
        encoded_input = tokenizer(sentences, max_length=config['embedding']['max_length'], padding=True, truncation=True, return_tensors='pt')
        encoded_input.to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
            sentence_embeddings = model_output[0][:, 0]
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        output.extend(sentence_embeddings.cpu().numpy())
    
    return np.array(output)

## Predict Function (score prediction with and without thresholds)

In [41]:
def predict(batch_text):
    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
    embeddings = get_embeddings(device, batch_text)
    embeddings_df = pd.DataFrame(embeddings)

    # Prepare input data
    X_input = np.array(embeddings_df, dtype=np.float32)

    # Load the classifiers
    results = {}
    for category, details in config['classifier'].items():
        # Download the classifier from HuggingFace hub
        local_model_fp = hf_hub_download(repo_id=repo_path, filename=config['classifier'][category]['model_fp'])

        # Run the inference
        session = rt.InferenceSession(local_model_fp)
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: X_input})

        # If calibrated, return only the prediction for the unsafe class
        if config['classifier'][category]['calibrated']: 
            scores = [output[1] for output in outputs[1]]
        else:
            scores = outputs[1].flatten()
        
        # Generate the predictions depending on the recommended threshold score
        results[category] = {
            'scores': scores,
            'predictions': {
                'high_recall': [1 if score >= config['classifier'][category]['threshold']['high_recall'] else 0 for score in scores],
                'balanced': [1 if score >= config['classifier'][category]['threshold']['balanced'] else 0 for score in scores],
                'high_precision': [1 if score >= config['classifier'][category]['threshold']['high_precision'] else 0 for score in scores]
            }
        }

    return results

## Generate results in another dataframe

In [42]:
# Extract the text data and id from the DataFrame
batch_id = small_reddit_df['id'].tolist()
batch_text = small_reddit_df['text'].tolist()

# Generate the scores and predictions
results = predict(batch_text)

# Prepare results for DataFrame
output_data = []
for i in range(len(batch_text)):
    output_row = {
        'id': batch_id[i],
        'Text': batch_text[i],
    }
    # IMPT! THIS LOOP WILL PRODUCE 32 COLUMNS! COMMENT OUT IF NOT NEEDED!
    for category in results.keys():
        # scores
        output_row[f'{category} Score'] = results[category]['scores'][i]
        # predictions with highest recall
        output_row[f'{category} HR'] = results[category]['predictions']['high_recall'][i]
        # balanced predictions
        output_row[f'{category} B'] = results[category]['predictions']['balanced'][i]
        # predictions with highest precision
        output_row[f'{category} HP'] = results[category]['predictions']['high_precision'][i]
    output_data.append(output_row)

# Create a DataFrame from the results
small_results_df = pd.DataFrame(output_data)

In [43]:
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# get results table
print(small_results_df)

        id  \
0  fn3gbrg   
1  fmau5k3   
2  fhp05xc   
3  fsu4fyd   
4  gap4vkl   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         Text  \
0                                                                 

In [44]:
# See column names
print(small_results_df.columns)

Index(['id', 'Text', 'binary Score', 'binary HR', 'binary B', 'binary HP',
       'hateful Score', 'hateful HR', 'hateful B', 'hateful HP',
       'harassment Score', 'harassment HR', 'harassment B', 'harassment HP',
       'public_harm Score', 'public_harm HR', 'public_harm B',
       'public_harm HP', 'self_harm Score', 'self_harm HR', 'self_harm B',
       'self_harm HP', 'sexual Score', 'sexual HR', 'sexual B', 'sexual HP',
       'toxic Score', 'toxic HR', 'toxic B', 'toxic HP', 'violent Score',
       'violent HR', 'violent B', 'violent HP'],
      dtype='object')


## Hateful and Toxic Scores

In [45]:
# get id, hateful and toxic scores only
condensed_small_results_df = small_results_df[['id', 'hateful Score', 'toxic Score']]

In [46]:
condensed_small_results_df

Unnamed: 0,id,hateful Score,toxic Score
0,fn3gbrg,-0.582897,-0.419336
1,fmau5k3,-1.116736,-1.869361
2,fhp05xc,-1.027191,-0.798019
3,fsu4fyd,-0.419287,1.119166
4,gap4vkl,-0.952112,1.197501


## New dataframe with hateful and toxic scores

In [47]:
# merge the 2 dataframes on 'id'
small_hateful_and_toxic_results_df = pd.merge(small_reddit_df, condensed_small_results_df, on='id', how='inner')

In [48]:
print(small_hateful_and_toxic_results_df)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         text  \
0                                                                                                                                                      

# 4. Quick Analysis

In [49]:
# expand resolution to see full text
pd.set_option('display.max_colwidth', None)

small_hateful_and_toxic_results_df[['text', 'hateful Score', 'toxic Score']]

Unnamed: 0,text,hateful Score,toxic Score
0,i think most singaporeans dont give a damn who taiwan belong to,-0.582897,-0.419336
1,fair point the secrecy aspect of it slipped my mind,-1.116736,-1.869361
2,range,-1.027191,-0.798019
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,-0.419287,1.119166
4,boo boo poor u lmao,-0.952112,1.197501


Scores can range is [-1,1], where 1 is positive, -1 is negative in that metric. Overall, hatefulness and toxicity are correlated. However, the last 2 text data say otherwise, Lionguard predicts them as not hateful, but toxic. The last 2 texts also show that the longer text is more hateful, but less toxic than the shorter text.

# 5. Hateful Score + Hateful HR & Toxic Score + Toxic HR Analysis Only (drop other scores)

In [50]:
reddit_df.head()

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r
0,i think most singaporeans dont give a damn who taiwan belong to,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interested_to_know_why_so_many/fn3gbrg/,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
1,fair point the secrecy aspect of it slipped my mind,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_table_bill_to_ensure_safe_general/fmau5k3/,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defense_imprest/fhp05xc/,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"
4,boo boo poor u lmao,2020-10-31 13:52:12,pirorok,/r/singapore/comments/jl6abo/rsingapore_random_discussion_and_small_questions/gap4vkl/,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r"


In [69]:
def predict2(batch_text):
    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
    embeddings = get_embeddings(device, batch_text)
    embeddings_df = pd.DataFrame(embeddings)

    # Prepare input data
    X_input = np.array(embeddings_df, dtype=np.float32)

    # Define the classifiers we want to focus on
    selected_categories = ['hateful', 'toxic']  # Only focus on 'hateful' and 'toxic'

    # Load the classifiers
    results = {}
    for category in selected_categories:  # Only loop over selected_categories
        # Download the classifier from HuggingFace hub
        local_model_fp = hf_hub_download(repo_id=repo_path, filename=config['classifier'][category]['model_fp'])

        # Run the inference
        session = rt.InferenceSession(local_model_fp)
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: X_input})

        # If calibrated, return only the prediction for the unsafe class
        if config['classifier'][category]['calibrated']:
            scores = [output[1] for output in outputs[1]]
        else:
            scores = outputs[1].flatten()

        # Generate the predictions depending on the recommended threshold score
        results[f'{category} Score'] = {  # Directly access 'hateful' and 'toxic' scores
            'scores': scores,
            'predictions': {
                'high_recall': [1 if score >= config['classifier'][category]['threshold']['high_recall'] else 0 for score in scores]
            }
        }
        results[f'{category} HR'] = results[f'{category} Score']['predictions']['high_recall']  # CHANGE 4: Only high_recall predictions

    return results


# 6. Testing on Medium Size Dataset

In [66]:
mid_reddit_df = reddit_df[0:50]

In [67]:
# Ensure text is clean and all entries are strings
mid_reddit_df['text'] = mid_reddit_df['text'].fillna('').astype(str)
batch_text = mid_reddit_df['text'].tolist()
batch_id = mid_reddit_df['id'].tolist()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mid_reddit_df['text'] = mid_reddit_df['text'].fillna('').astype(str)


In [70]:
# Generate the scores and predictions
results = predict2(batch_text)

# Prepare results for DataFrame
output_data = []
for i in range(len(batch_text)):
    output_row = {
        'id': batch_id[i],
        'Text': batch_text[i],
    }

    # Directly add 'hateful Score', 'hateful HR', 'toxic Score', and 'toxic HR' to the output
    output_row['hateful Score'] = results['hateful Score']['scores'][i]  
    output_row['hateful HR'] = results['hateful HR'][i]  
    output_row['toxic Score'] = results['toxic Score']['scores'][i]  
    output_row['toxic HR'] = results['toxic HR'][i]  

    output_data.append(output_row)

# Create a DataFrame from the results
mid_results_df = pd.DataFrame(output_data)

In [64]:
# Set display option to show all columns
pd.set_option('display.max_columns', None)

# get results table
print(mid_results_df)

         id  \
0   fn3gbrg   
1   fmau5k3   
2   fhp05xc   
3   fsu4fyd   
4   gap4vkl   
..      ...   
95  go89wrq   
96  gd28ioy   
97  hed3h6f   
98  fxbnxin   
99  gycbr4l   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [71]:
# See column names
print(mid_results_df.columns)

Index(['id', 'Text', 'hateful Score', 'hateful HR', 'toxic Score', 'toxic HR'], dtype='object')


In [76]:
# get id, hateful and toxic scores only
condensed_mid_results_df = mid_results_df[['id', 'hateful Score', 'hateful HR',  'toxic Score', 'toxic HR']]

In [77]:
condensed_mid_results_df

Unnamed: 0,id,hateful Score,hateful HR,toxic Score,toxic HR
0,fn3gbrg,-0.582897,0,-0.419336,0
1,fmau5k3,-1.116736,0,-1.869361,0
2,fhp05xc,-1.027191,0,-0.798019,0
3,fsu4fyd,-0.419287,0,1.119166,1
4,gap4vkl,-0.952112,0,1.197501,1
...,...,...,...,...,...
95,go89wrq,-0.977192,0,-1.005671,0
96,gd28ioy,-1.035369,0,-0.629649,0
97,hed3h6f,-0.979156,0,0.796944,1
98,fxbnxin,-1.187206,0,-1.169706,0


In [78]:
# merge the 2 dataframes on 'id'
mid_hateful_and_toxic_results_df = pd.merge(mid_reddit_df, condensed_mid_results_df, on='id', how='inner')

In [79]:
mid_hateful_and_toxic_results_df

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,hateful Score,hateful HR,toxic Score,toxic HR
0,i think most singaporeans dont give a damn who taiwan belong to,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interested_to_know_why_so_many/fn3gbrg/,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.582897,0,-0.419336,0
1,fair point the secrecy aspect of it slipped my mind,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_table_bill_to_ensure_safe_general/fmau5k3/,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.116736,0,-1.869361,0
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defense_imprest/fhp05xc/,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.027191,0,-0.798019,0
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.419287,0,1.119166,1
4,boo boo poor u lmao,2020-10-31 13:52:12,pirorok,/r/singapore/comments/jl6abo/rsingapore_random_discussion_and_small_questions/gap4vkl/,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.952112,0,1.197501,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,hey hang in there,2021-02-21 14:41:28,captmomo,/r/singapore/comments/lohswy/rsingapore_random_discussion_and_small_questions/go89wrq/,t3_lohswy,t1_go89qc0,go89wrq,t5_2qh8c,"{'collapsed_reason_code': None, 'collapsed_reason': None, 'collapsed': False, 'controversiality': 0}\r",-0.977192,0,-1.005671,0
96,i know that feeling on both counts,2020-11-21 14:27:48,[deleted],/r/singapore/comments/jxykph/rsingapore_random_discussion_and_small_questions/gd28ioy/,t3_jxykph,t1_gd241jo,gd28ioy,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.035369,0,-0.629649,0
97,married or unmarried or both\n\nliving is expensive especially in 1st worlds and raising a family of 2 wife meaning u gotta earn more to feed more mouths no fault divorce happy waif happy laif make less than ur waif thats like imminent reason for waif to initiate divorce then and u lose almost everything \n\ndont make enough money becomes invisible to the opposite s3x and get rejected just dont participate in the rigged game and let it collapse under its own weight you can never win,2021-09-26 16:34:21,8uwotm8,/r/SingaporeRaw/comments/pvv82l/are_men_more_hungry_for_money_than_women/hed3h6f/,t3_pvv82l,t3_pvv82l,hed3h6f,t5_xnx04,"{'collapsed_reason_code': None, 'collapsed_reason': None, 'collapsed': False, 'controversiality': 0}\r",-0.979156,0,0.796944,1
98,the law is worded such that it can apply to anyone not just those contesting,2020-07-08 15:54:41,Swiftdancer,/r/singapore/comments/hnhrnh/lee_bee_wah_psp_lies_exploits_elderly/fxbnxin/,t3_hnhrnh,t1_fxbhlkp,fxbnxin,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.187206,0,-1.169706,0


# 7a. Scoring for full dataset (part1)

In [87]:
reddit_df.shape
# (396298, 9)
# part 1 is from row 0:198,148
# part 2 is from row 198,149 : 396,297

(396298, 9)

In [88]:
part1_reddit_df = reddit_df[0:198148]

In [89]:
# Ensure text is clean and all entries are strings
part1_reddit_df['text'] = part1_reddit_df['text'].fillna('').astype(str)
batch_text_part1 = part1_reddit_df['text'].tolist()
batch_id_part1 =part1_reddit_df['id'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part1_reddit_df['text'] = part1_reddit_df['text'].fillna('').astype(str)


In [90]:
# Generate the scores and predictions
results = predict2(batch_text_part1)

# Prepare results for DataFrame
output_data = []
for i in range(len(batch_text_part1)):
    output_row = {
        'id': batch_id_part1[i],
        'Text': batch_text_part1[i],
    }

    # Directly add 'hateful Score', 'hateful HR', 'toxic Score', and 'toxic HR' to the output
    output_row['hateful Score'] = results['hateful Score']['scores'][i]  
    output_row['hateful HR'] = results['hateful HR'][i]  
    output_row['toxic Score'] = results['toxic Score']['scores'][i]  
    output_row['toxic HR'] = results['toxic HR'][i] 

    output_data.append(output_row)

# Create a DataFrame from the results
part1_results_df = pd.DataFrame(output_data)

In [92]:
# See column names
print(part1_results_df.columns)

Index(['id', 'Text', 'hateful Score', 'hateful HR', 'toxic Score', 'toxic HR'], dtype='object')


In [93]:
# get id, hateful and toxic scores only
condensed_part1_results_df = part1_results_df[['id', 'hateful Score', 'hateful HR',  'toxic Score', 'toxic HR']]

In [94]:
condensed_part1_results_df

Unnamed: 0,id,hateful Score,hateful HR,toxic Score,toxic HR
0,fn3gbrg,-0.582897,0,-0.419336,0
1,fmau5k3,-1.116736,0,-1.869361,0
2,fhp05xc,-1.027191,0,-0.798019,0
3,fsu4fyd,-0.419287,0,1.119166,1
4,gap4vkl,-0.952112,0,1.197501,1
5,g7guz2q,-1.064189,0,-0.542219,0
6,hhwhn2u,-0.922029,0,0.31561,1
7,fmb0nae,-1.213592,0,-1.78531,0
8,fy05ljb,-1.126568,0,-0.934198,0
9,h4d6tcs,-1.04922,0,-0.892172,0


In [95]:
# merge the 2 dataframes on 'id'
part1_hateful_and_toxic_results_df = pd.merge(part1_reddit_df, condensed_part1_results_df, on='id', how='inner')

In [96]:
part1_hateful_and_toxic_results_df

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,id,subreddit_id,moderation\r,hateful Score,hateful HR,toxic Score,toxic HR
0,i think most singaporeans dont give a damn who taiwan belong to,2020-04-11 15:49:23,invigo79,/r/singapore/comments/fz7vtl/im_quite_interested_to_know_why_so_many/fn3gbrg/,t3_fz7vtl,t3_fz7vtl,fn3gbrg,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.582897,0,-0.419336,0
1,fair point the secrecy aspect of it slipped my mind,2020-04-03 09:59:08,potatetoe_tractor,/r/singapore/comments/fu3axm/government_to_table_bill_to_ensure_safe_general/fmau5k3/,t3_fu3axm,t1_fmasya5,fmau5k3,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.116736,0,-1.869361,0
2,range,2020-02-15 15:07:03,CrossfittJesus,/r/singapore/comments/f4ac70/what_is_ps_defense_imprest/fhp05xc/,t3_f4ac70,t3_f4ac70,fhp05xc,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.027191,0,-0.798019,0
3,gt this is binary thinking because you think that im inherently blind and because of the majority privilege\n\ngt this demonstrates your inability to accept opposing views and have no choice but to resort to using ad hominem by casting me into an opposing the others group making it a black and white binary argument in order to have an attempt in giving supposed strength to your argument\n\nwell this is clearly very complicated for you but to bring this back to where i came in the idea of normalizing edmws tone amp tenor just because you think it is anywhere commendable that they bash anyone outside their approved groups\n\nto which all i said was \n\ngt racist against everyone sounds the worst cesspool of assholes then\n\nany other place this would be a open shut point but and here we are with your spirited defense of this mindset,2020-06-04 07:07:39,nomad80,/r/singapore/comments/gw55cx/notoracism/fsu4fyd/,t3_gw55cx,t1_fsu3dsf,fsu4fyd,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.419287,0,1.119166,1
4,boo boo poor u lmao,2020-10-31 13:52:12,pirorok,/r/singapore/comments/jl6abo/rsingapore_random_discussion_and_small_questions/gap4vkl/,t3_jl6abo,t1_gap4e9y,gap4vkl,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-0.952112,0,1.197501,1
5,its a very simple trick popular with insta and wedding photogs you just need a surface depth dors not really matter,2020-10-03 00:50:29,FitCranberry,/r/singapore/comments/j3zked/the_magic_a_puddle_of_water_on_the_floor_can_do/g7guz2q/,t3_j3zked,t1_g7go6qv,g7guz2q,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.064189,0,-0.542219,0
6,because she is angry that despite all the hard times she had stood through with him he seemed to have moved on pretty damn fast from their divorce getting a new gf and dog she feels like he didnt even care about all that she went through and isnt mourning the end of their relationship while she is stuck there\n\ngood news for her is that i read somewhere once that while guys rebound faster long term they take it harder while women take much longer to get through the breakdown of a relationship yet end up happier in the long run\n\nbad news for her is that she does not look good shitting all over female talents just because she cant trust her husband,2021-10-24 20:03:39,StareintotheSun2020,/r/singapore/comments/qeu001/deleted_by_user/hhwhn2u/,t3_qeu001,t1_hhw1usq,hhwhn2u,t5_2qh8c,"{'collapsed_reason_code': None, 'collapsed_reason': None, 'collapsed': False, 'controversiality': 1}\r",-0.922029,0,0.31561,1
7,sometimes there are restrictions on how often people can go out to buy essentials though or how many people per household that isnt the case here,2020-04-03 11:54:04,pm_me_pm_speeches,/r/singapore/comments/fu4ch0/pm_lee_announces_new_stricter_restrictions_to/fmb0nae/,t3_fu4ch0,t1_fmaw7ot,fmb0nae,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.213592,0,-1.78531,0
8,oo yup thats probably the case im on mobile,2020-07-14 04:04:51,charroxgrin,/r/singapore/comments/hqowfd/rsingapore_random_discussion_and_small_questions/fy05ljb/,t3_hqowfd,t1_fy05ccz,fy05ljb,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, 'collapsed_reason': None, 'controversiality': 0}\r",-1.126568,0,-0.934198,0
9,delta counterpart,2021-07-07 15:55:34,SamBellFromSarang,/r/SingaporeRaw/comments/ofac8p/indian_government_there_are_650000_indians_in/h4d6tcs/,t3_ofac8p,t1_h4bapip,h4d6tcs,t5_xnx04,"{'collapsed_reason_code': None, 'collapsed_reason': None, 'collapsed': False, 'controversiality': 0}\r",-1.04922,0,-0.892172,0


# 7b Scoring for full dataset (part2)
Duplicate section 7a Scoring for full dataset (part1)



Change all variable names with "part1" to "part2".

Change slicing of dataset: 
part 1 is from row 0:198,148
part 2 is from row 198,149 : 396,297