In [None]:
import pandas as pd
import json

# Extracting a subset of OpenHermes-2.5

In [None]:
# Load the OpenHermes-2.5 JSON Lines file (lines=True: one JSON record per line).
df = pd.read_json('../data/OpenHermes-2.5.jsonl', lines=True)

# Plain-text preview of the first rows, followed by the column index
# (the final expression is what the cell displays).
print(df.head())
df.keys()

In [None]:
# Evaluated but not shown: only the last expression of a cell is rendered.
df["source"].unique()

# Keep the rows whose "language" is either "English" or None.
df_en = df.loc[df["language"].isin(["English", None])]

# Rows per source; dropna=False keeps a bucket for rows without a source.
values = df_en["source"].value_counts(dropna=False)

In [None]:
# Target size of the reduced dataset.
num_total = 100_000

# Scale every per-source count by the same factor so the sample keeps the
# source distribution of df_en. astype(int) truncates toward zero exactly like
# int() did, so the counts may sum to slightly less than num_total.
# Computed in one vectorised step: this avoids growing a dtype-less empty
# Series item by item and yields an integer Series.
new_values = (values / len(df_en) * num_total).astype(int)

new_values

In [None]:
# Number of rows in df_en whose "source" value is missing.
df_en["source"].isna().sum()

In [None]:
# Draw the per-source quota computed in new_values from df_en.
# The samples are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration and starts
# from an empty frame.
sampled_frames = []

for category, count in new_values.items():
    if pd.isna(category):
        # A missing source never compares equal to anything, so select it via isna().
        category_rows = df_en[df_en["source"].isna()]
    else:
        category_rows = df_en[df_en["source"] == category]
    # Fixed seed so the same rows are drawn on every run.
    sampled_frames.append(category_rows.sample(n=count, random_state=42))

# pd.concat raises on an empty list, so fall back to an empty frame.
reduced_df = (
    pd.concat(sampled_frames, ignore_index=True) if sampled_frames else pd.DataFrame()
)


In [None]:
# Shuffle the entire dataset.
# Seeded like the per-source sampling so the saved file is reproducible.
reduced_df = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Inspect the first element of the first row's 'conversations' list.
convo = reduced_df.iloc[0]['conversations'][0]
convo

In [None]:
# Save reduced_df to a jsonl file (orient='records', lines=True: one JSON
# object per row, one row per line).
reduced_df.to_json('../data/OpenHermes-2.5-100k.jsonl', orient='records', lines=True)

In [None]:
# Read the 100k JSON Lines file back into reduced_df.
reduced_df = pd.read_json('../data/OpenHermes-2.5-100k.jsonl', lines=True)

In [None]:
# Maps the 'from' field of a conversation turn to an output tag.
# NOTE(review): 'system' maps to the plain string 'content' rather than a
# <...> tag like the other two roles -- confirm this is intended.
ROLEMAP = {'human' : '<human>', 'gpt' : '<bot>', 'system' : 'content'}

# NOTE(review): `test` is assigned here but not used anywhere in this cell.
test = reduced_df.iloc[0]['conversations']
new_json = "test.jsonl"

# WARNING: this cell is unfinished. Opening with 'w' truncates test.jsonl,
# and nothing is ever written to `f`; `turns`, `role` and `text` are computed
# and then discarded.
with open(new_json, 'w') as f:
    for i, row in reduced_df.iterrows():
        turns = []
        conversations = row['conversations']
        for conv in conversations: 
            # Raises KeyError for any 'from' value missing from ROLEMAP.
            role = ROLEMAP[conv['from']]
            text = conv['value']


#for i in range(10):

In [None]:
# Paths are named src_path/dst_path instead of input/output so the builtin
# input() is not shadowed for the rest of the kernel session.
src_path = "OpenHermes-2.5-300k.jsonl"
dst_path = "test.jsonl"

# WARNING: unfinished cell. Opening dst_path with 'w' truncates it, and
# nothing is written to `g`; each record is parsed and then discarded.
# The source is opened first, so a missing input does not truncate the output.
with open(src_path, 'r', encoding='utf-8') as f, open(dst_path, 'w', encoding='utf-8') as g:
    for line in f:
        data = json.loads(line)
        conv_dict = data['conversations']

# Extracting .apkg anki-file and creating a dataset from it.

In [None]:
import sqlite3
import json
import pandas as pd

In [None]:
# SQLite database to read the notes from.
db_path = '../collection.anki21'

query = """
SELECT id, flds, tags FROM notes;
"""

# Read the notes table into a DataFrame and always release the connection,
# even when the query fails (it was previously left open).
con = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, con)
finally:
    con.close()

In [None]:
# Print the second and fourth '\x1f'-separated field of every note.
for _, row in df.iterrows():
    fields = row['flds'].split('\x1f')
    print(fields[1])
    print(fields[3])

In [None]:
# Turn every note into a question/answer pair and write it as one JSON line.
# The f-strings no longer reuse the enclosing quote character inside the
# replacement fields, which is a SyntaxError before Python 3.12.
with open('../hogskoleprovet-ord-8.5k.jsonl', 'w', encoding='utf-8') as file:
    for _, row in df.iterrows():
        fields = row['flds'].split('\x1f')
        # Capture both values before overwriting: the answer is built from
        # the original first field, not from the rewritten question.
        word, meaning = fields[0], fields[1]
        fields[1] = f"{word.capitalize()} betyder {meaning}"
        fields[0] = f"Vad betyder {word}?"
        data = {
            'metadata': 'hogskoleprovet-ord',
            # All fields are kept; only the first two are rewritten.
            'text': fields,
        }
        json.dump(data, file)
        file.write('\n')


In [None]:
# Append {"en": ..., "sv": ...} pairs built from the notes to ../en-swe.jsonl.
# The file is opened in append mode, so re-running the cell adds the same
# records again.
with open('../en-swe.jsonl', mode='a', encoding='utf-8') as file:
    for _, row in df.iterrows():
        extract_rows = row['flds'].split('\x1f')
        swedish = extract_rows[1]
        # NOTE(review): this is a substring test, so any entry that merely
        # contains "en", "ett" or "att" is skipped -- confirm that is intended
        # rather than a whole-word match.
        if any(marker in swedish for marker in ("en", "ett", "att")):
            continue
        data = {"en": extract_rows[3], "sv": swedish}
        file.write(json.dumps(data) + '\n')

## Convert data into the format I use for training.


In [None]:
# Parse every line of test.jsonl into a list of records.
with open('test.jsonl', mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

len(data)

In [None]:
# Show the 'value' of the first turn of the first loaded record.
data[0]['conversations'][0]['value']

In [None]:
# Convert each record to {"text": [{"<human>": ...}, {"<bot>": ...}]}:
# every turn except the last is merged into the prompt, the last turn is the answer.
with open('OpenHermes-28k-hsp-3k.jsonl', mode='w', encoding='utf-8') as file:
    for line in data:
        turns = line['conversations']

        question_string = ""
        for turn in turns[:-1]:
            # A newline separator is only added once the prompt is non-empty.
            if question_string:
                question_string += f"\n{turn['value']}"
            else:
                question_string += turn['value']

        new_dict = {
            'text': [
                {"<human>": question_string},
                {"<bot>": turns[-1]['value']},
            ]
        }
        file.write(json.dumps(new_dict) + "\n")

In [None]:
import json
# Parse every line of the word-list file into a list of records.
with open('./data/hogskoleprovet-ord-8.5k.jsonl', mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

len(data)

In [None]:
# Append the loaded records in the same <human>/<bot> layout: the first
# 'text' entry becomes the prompt and the second the answer.
# Append mode: running this cell twice adds the records twice.
with open('OpenHermes-28k-hsp-3k.jsonl', mode='a', encoding='utf-8') as file:
    for line in data:
        prompt, reply = line['text'][0], line['text'][1]
        new_dict = {'text': [{"<human>": prompt}, {"<bot>": reply}]}
        file.write(json.dumps(new_dict) + "\n")
        

In [None]:
# Shuffle the data in 'OpenHermes-28k-hsp-8k.jsonl'
import random
import json

with open('OpenHermes-28k-hsp-8k.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

# A dedicated, seeded generator makes the shuffled file reproducible and
# leaves the global random state untouched.
random.Random(42).shuffle(data)

with open('OpenHermes-28k-hsp-8k-SHUFFLED.jsonl', 'w', encoding='utf-8') as file:
    for line in data:
        json.dump(line, file)
        file.write("\n")

In [None]:
# Take the data in 'OpenHermes-28k-hsp-8k-SHUFFLED.jsonl' and delete every
# line that contains '<|endoftext|>' in either the prompt or the answer.
import json

with open('OpenHermes-28k-hsp-8k-SHUFFLED.jsonl', mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

new_data = [
    record
    for record in data
    if '<|endoftext|>' not in record['text'][0]['<human>']
    and '<|endoftext|>' not in record['text'][1]['<bot>']
]

# The same file is overwritten with the surviving records.
with open('OpenHermes-28k-hsp-8k-SHUFFLED.jsonl', mode='w', encoding='utf-8') as file:
    for record in new_data:
        file.write(json.dumps(record) + "\n")



### Deleting English rows from SlimOrca

In [None]:
import json
path = "../data/SlimOrca-sv-CONTINUE-3.jsonl"

with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

# The file is rewritten in place, so this cell must be safe to run twice.
with open(path, 'w', encoding='utf-8') as file:
    for line in data:
        for conv in line['conversations']:
            # Replace the list with its element at index 1 (dropping index 0).
            # Only lists are touched: on a re-run the value is already a string,
            # and indexing it would keep just its second character.
            if isinstance(conv['value'], list):
                conv['value'] = conv['value'][1]
        json.dump(line, file)
        file.write("\n")

### Deleting English rows from DPO-format

In [None]:
import json
path = "../data/Orca-DPO-pairs-geq2k.jsonl"

# Parse the DPO pairs file, one record per line.
with open(path, mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

data[0]

In [None]:
output_path = "../data/Orca-DPO-pairs-geq2k-extracted.jsonl"

# Keep only the element at index 1 of every field and write the result.
# The records in `data` are modified in place, so this cell cannot be
# re-run without reloading `data` first.
with open(output_path, mode='w', encoding='utf-8') as file:
    for line in data:
        # 'system' is a JSON-encoded string; the literal '""' marks an empty prompt.
        line['system'] = "" if line['system'] == "\"\"" else json.loads(line['system'])[1]
        for field in ('question', 'chosen', 'rejected'):
            line[field] = line[field][1]
        file.write(json.dumps(line) + "\n")


### Filtering by only 'conversations' entry.

In [None]:
# Delete all entries in the json except conversations
import json

path = "../data/CamelAI-7k-sv-2.jsonl"

with open(path, mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]

# The source file is overwritten with the stripped-down records.
with open(path, mode='w', encoding='utf-8') as file:
    for line in data:
        new_dict = {'conversations': line['conversations']}
        file.write(json.dumps(new_dict) + "\n")


### Converting swe-instruct to conversation format

In [None]:
import json

path = "../data/questions-5.jsonl"

# Parse the file into a list of records and display the whole list.
with open(path, mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]
data

In [None]:
import json

path = "../data/bibblansvarar-llama-3-aisweden.jsonl"

# Parse the file into a list of records and display the whole list.
with open(path, mode='r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]
data

In [None]:
# Displays the entire `data` list (can be very large output).
data

In [None]:
# Convert data to format {"conversations":[{"from":"human","value":"text"},{"from":"gpt","value":"text"}]}
mapper = {"instruction":"human", "generation":"gpt"}

# The output file is opened once instead of once per record.
# Append mode: re-running the cell adds the same records again.
with open("../data/BibblanSvarar-Llama3-generated.jsonl", 'a', encoding='utf-8') as file:
    for line in data:
        new_line = {
            "conversations": [
                # Any key other than "model_name" must exist in `mapper`,
                # otherwise a KeyError is raised.
                {"from": mapper[key], "value": value}
                for key, value in line.items()
                if key != "model_name"
            ]
        }
        json.dump(new_line, file)
        file.write("\n")


In [None]:
# Merge swe-instruct-slimorcaformat with SlimOrca-sv-11k-v2.jsonl and shuffle

import json
import random

path = "../data/SlimOrca-sv-11k-v2.jsonl"

with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(raw_line) for raw_line in file]


path = "../data/swe-instruct-slimorcaformat.jsonl"

with open(path, 'r', encoding='utf-8') as file:
    data2 = [json.loads(raw_line) for raw_line in file]


# Merge the two datasets (a new list; data and data2 are left untouched)
data3 = data + data2


# Shuffle the merged data with a dedicated, seeded generator so the output
# file is reproducible and the global random state is not modified.
random.Random(42).shuffle(data3)

# Write the merged, shuffled records to a local file.
# Nothing is uploaded by this cell.
with open("../data/sv-instruct-v1.jsonl", 'w', encoding='utf-8') as file:
    for line in data3:
        json.dump(line, file)
        file.write("\n")

In [None]:
from datasets import load_dataset

# Load the local JSON Lines file with the 'json' dataset builder.
dataset = load_dataset('json', data_files='../data/hopkok-v1.jsonl')


In [None]:
# Publish the loaded dataset to the 'skvarre/hopkok-v1' repository.
# NOTE(review): presumably requires being logged in to the Hub -- confirm.
dataset.push_to_hub('skvarre/hopkok-v1')

## Load and Save Dataset to file

In [None]:
from datasets import load_dataset
# Load the "AI-Sweden-Models/BiaSWE" dataset by name.
# NOTE(review): no `split` is given, so this presumably returns a
# DatasetDict keyed by split rather than a single Dataset -- confirm.
dataset = load_dataset("AI-Sweden-Models/BiaSWE")

In [None]:
# Destination of the exported dataset.
save_path = "../data/BiaSWE.jsonl"

# NOTE(review): `dataset` was loaded without a split; verify that the object
# returned actually provides to_json (a single split may have to be selected first).
dataset.to_json(save_path, orient='records', lines=True)

In [None]:
# Save the dataset to a jsonl file
import json

# Overwrites save_path with one JSON document per item yielded by `dataset`.
# NOTE(review): if `dataset` is a dict of splits, iterating it presumably
# yields split names rather than records -- confirm which object is iterated.
with open(save_path, 'w', encoding='utf-8') as file:
    for line in dataset:
        json.dump(line, file)
        file.write("\n")

### Split around .user

In [None]:
import json

save_path = "../data/bibblansvarar-llama-3-aisweden-2.jsonl"

# Load every record of the source file.
with open(save_path, mode='r', encoding='utf-8') as file:
    data = list(map(json.loads, file))

# Keep only the part of "generation" that precedes the first ".user".
for item in data:
    head, _, _ = item['generation'].partition('.user')
    item['generation'] = head

# Write the trimmed records next to the source file, with a "-split" suffix.
output_path = save_path.replace('.jsonl', '-split.jsonl')
with open(output_path, mode='w', encoding='utf-8') as file:
    for item in data:
        file.write(json.dumps(item) + '\n')

print(f"Split data saved to {output_path}")

In [None]:

import json

bad_path = "../data/bibblansvarar-llama-3-aisweden.jsonl"
good_path = "../data/bibblansvarar-llama-3-aisweden-2-split.jsonl"

# Load both files completely.
with open(bad_path, mode='r', encoding='utf-8') as file:
    bad_data = list(map(json.loads, file))

with open(good_path, mode='r', encoding='utf-8') as file:
    good_data = list(map(json.loads, file))

# A record whose generation ends with "Fok" is swapped for the record at the
# same position in good_data; all other records are kept as they are.
# Both files must therefore be aligned line by line.
bad_data = [
    good_data[position] if record['generation'].endswith("Fok") else record
    for position, record in enumerate(bad_data)
]

# Overwrite the original file with the patched records.
with open(bad_path, mode='w', encoding='utf-8') as file:
    for record in bad_data:
        file.write(json.dumps(record) + '\n')

## Verify JSON

In [None]:
import json

def verify_json_lines(file_path):
    """
    Verifies if a file is a valid JSON Lines file.

    Every line is parsed on its own; lines that fail to parse are printed
    together with their 1-based line number. Blank lines count as invalid
    because an empty string is not a JSON document.

    Parameters:
    - file_path: The path to the JSON Lines file.

    Returns:
    - valid_lines: Number of valid JSON lines.
    - invalid_lines: Number of invalid JSON lines.
    """
    valid_lines = 0
    invalid_lines = 0

    # Read as UTF-8 explicitly, like every other cell in this notebook, so
    # the result does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            try:
                json.loads(line)
                valid_lines += 1
            except json.JSONDecodeError:
                print(f"Invalid JSON on line {line_number}: {line.strip()}")
                invalid_lines += 1

    return valid_lines, invalid_lines

# Example usage: count the valid and invalid lines of one file and report them.
file_path = "../data/bibblansvarar-llama-3-aisweden.jsonl"
valid_lines, invalid_lines = verify_json_lines(file_path)
print(f"Valid lines: {valid_lines}, Invalid lines: {invalid_lines}")

### Adding more bad-examples.

In [None]:
import json 
path = "../corrected-examples-en-sv.jsonl"

# One translation pair; both texts are empty as written here
# (presumably filled in by hand before running -- confirm).
data = {
    "en": """""",
    "sv": """""",
}

# Append the pair as a single JSON line; the with-block closes the file.
with open(path, mode='a', encoding='utf-8') as file:
    file.write(json.dumps(data) + "\n")
        # flush




In [None]:
import json 
path = "../data/bibblansvarar.jsonl"

# One human/gpt exchange; both values are empty as written here
# (presumably filled in by hand before running -- confirm).
human_turn = {"from": "human", "value": """"""}
gpt_turn = {"from": "gpt", "value": """"""}
data = {"conversations": [human_turn, gpt_turn]}

# Append the exchange as a single JSON line; the with-block closes the file.
with open(path, mode='a', encoding='utf-8') as file:
    file.write(json.dumps(data) + "\n")


In [None]:
import json
path = "../bezzerwizzer.jsonl"

# One human/gpt exchange; both values are empty as written here
# (presumably filled in by hand before running -- confirm).
human_turn = {"from": "human", "value": """"""}
gpt_turn = {"from": "gpt", "value": """"""}
data = {"conversations": [human_turn, gpt_turn]}

# Append the exchange as a single JSON line; the with-block closes the file.
with open(path, mode='a', encoding='utf-8') as file:
    file.write(json.dumps(data) + "\n")


In [None]:
import json

path = "../data/bibblansvarar-llama-3-aisweden.jsonl"

# Read the data from the file
with open(path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Define the substring to search for
substring = ""

# Replace the generation of every record whose instruction contains the substring.
for line in data:
    # The empty string is contained in every string, so an unset substring
    # would overwrite the generation of every record. Require a non-empty one.
    if substring and substring in line['instruction']:
        print("Found it!")
        line['generation'] = """"""

# Save the (possibly modified) data back to the same file
with open(path, 'w', encoding='utf-8') as file:
    for line in data:
        json.dump(line, file)
        file.write("\n")

# Clean annotation in BiaSWE

In [1]:
import json

path = "../data/BiaSWE.jsonl"

# Parse every line of the exported file into a list of records.
with open(path, mode='r', encoding='utf-8') as file:
    data = list(map(json.loads, file))


In [2]:
# Inspect the raw per-annotator labels of record 250.
data[250]['annotations']

{'annotator 1': {'category': 'Anti-feminism and denial of discrimination (Opposing feminism, believing that feminism is not necessary or that women should not have the same rights as men do. Includes: The belief that men are discriminated against/that discrimination doesn’t exist)',
  'comment': None,
  'hate_speech': 'Yes',
  'misogyny': 'Yes',
  'rating': '7'},
 'annotator 2': {'category': 'Stereotype (A widely held but fixed and oversimplified image or idea of a particular type of person or thing. Includes: Gender essentialism/Benevolent sexism/Intersectional, identity-based misogyny/Transmisogyny and homophobia )',
  'comment': "The author thinks men are better survivors than women, doesn't express it very explicitly though but it is obvious.",
  'hate_speech': 'Yes',
  'misogyny': 'Yes',
  'rating': '5'},
 'annotator 3': {'category': 'Anti-feminism and denial of discrimination (Opposing feminism, believing that feminism is not necessary or that women should not have the same right

In [13]:
def _aggregate_annotations(annotations):
    """Collapse per-annotator labels into one boolean per label.

    `annotations` maps an annotator name to a dict holding 'hate_speech' and
    'misogyny' answers, or to None when that annotator gave no labels.
    A label becomes True when its "Yes" answers are at least as many as all
    other answers. Ties, and the case where nobody answered, therefore
    resolve to True.
    """
    answered = [entry for entry in annotations.values() if entry is not None]
    aggregated = {}
    for label in ("hate_speech", "misogyny"):
        yes_votes = sum(1 for entry in answered if entry[label] == "Yes")
        other_votes = len(answered) - yes_votes
        aggregated[label] = yes_votes >= other_votes
    return aggregated


for line in data:
    # `data` is rewritten in place. A record that was already aggregated holds
    # plain booleans, which would raise a TypeError if processed again, so it
    # is skipped to keep the cell safe to re-run.
    if set(line['annotations']) == {"hate_speech", "misogyny"}:
        continue
    line['annotations'] = _aggregate_annotations(line['annotations'])



In [15]:
# Show the first record after its annotations were aggregated.
data[0]

{'text': 'Ni som bor i hyreslägenhet! Varför i helvete gör ni det? Inte råd?: Hej!  Tycker de är allt för mycket folk som söker bostad och gnäller att det inte finns något.. Köp en för helvete! Vad gör ni av era pengar egentligen? Så min fråga är varför köper inte fler personer lägenhet? Varför super ni upp hela lönen istället för att spara till kontantinsats? Eller trivs ni så bra i hyresghetton?',
 'annotations': {'hate_speech': True, 'misogyny': False}}

In [17]:
# Write the aggregated records to a new file, leaving the source file untouched.
new_path = "../data/BiaSWE-annotated.jsonl"
with open(new_path, mode='w', encoding='utf-8') as file:
    for record in data:
        file.write(json.dumps(record) + "\n")
