In [None]:
# from transformers import AutoModelForMaskedLM, AutoTokenizer
# import torch

In [None]:
# from transformers import GPT2Config
# model_name = "sberbank-ai/mGPT"
# model_config = GPT2Config.from_pretrained(model_name)
# model = AutoModelForMaskedLM.from_pretrained(model_name, config=model_config)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# dataset = ["Your", "unlabeled", "text", "data", "..."]
# inputs = tokenizer(dataset, padding=True, truncation=True, return_tensors="pt")
# labels = inputs.input_ids.detach().clone()

# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# num_epochs = 3
# batch_size = 8
# num_batches = len(dataset) // batch_size
# dataloader = torch.utils.data.DataLoader(
#     list(zip(inputs.input_ids, labels)), batch_size=batch_size, shuffle=True
# )
# model.train()
# for epoch in range(num_epochs):
#     for batch in dataloader:
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

# # Save your model
# # model.save_pretrained("./your_model_directory")


In [None]:
# import numpy as np
# import pandas as pd

# df = pd.read_csv("/home/ubuntu/Project_Files/Finetune/Data/sentences.csv")
# # select fist 500 rows
# df = df.iloc[:5000]
# # save to csv
# df.to_csv("/home/ubuntu/Project_Files/Finetune/Data/sentences_5000.csv", index=False)

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.nn import DataParallel
from tqdm.auto import tqdm


class CSVDataset(Dataset):
    def __init__(self, filename, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(filename)
        self.texts = self.data['Sentence'].tolist()  # Ensure the column name matches your CSV
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        return {'input_ids': inputs['input_ids'][0], 'attention_mask': inputs['attention_mask'][0]}  
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask} 

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
print(model)

# dataset = CSVDataset("/home/ubuntu/Project_Files/Finetune/Data/sentences.csv", tokenizer) # full
dataset = CSVDataset("/home/ubuntu/Project_Files/Finetune/Data/sentences_5000.csv", tokenizer) # only 500
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
print("Data loaded")

# Utilize multiple GPUs
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = DataParallel(model)
    model.to(device)
else:
    device = torch.device("cpu")

dataloader = list(dataloader)  
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()
num_epochs = 3
print_every_n_batches = 100  
print("Starting training")


for epoch in range(num_epochs):
    loop = tqdm(dataloader, leave=True)
    total_loss = 0.0
    num_batches = len(dataloader)
    
    for i, batch in enumerate(loop):
        batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
        labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
        loss = outputs.loss
        
        # Compute the mean loss for the entire batch and accumulate
        total_loss += loss.mean().item()
        
        loss.mean().backward()  # Compute the mean loss gradient
        optimizer.step()
        optimizer.zero_grad()
        loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
        loop.set_postfix(loss=total_loss / (i + 1))  # Compute and display the average loss
        
        if (i + 1) % print_every_n_batches == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{num_batches}], Loss: {total_loss / (i + 1):.4f}")


model.module.save_pretrained("/home/ubuntu/Project_Files/Finetune/Data/trained_model.pth")  


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

Epoch 1/3:  64%|██████▎   | 100/157 [01:05<00:35,  1.59it/s, loss=0.988]

Epoch [1/3], Step [100/157], Loss: 0.9881


Epoch 1/3: 100%|██████████| 157/157 [01:41<00:00,  1.55it/s, loss=0.635]
Epoch 2/3:  64%|██████▎   | 100/157 [01:03<00:35,  1.60it/s, loss=0.00818]

Epoch [2/3], Step [100/157], Loss: 0.0082


Epoch 2/3: 100%|██████████| 157/157 [01:38<00:00,  1.59it/s, loss=0.00706]
Epoch 3/3:  64%|██████▎   | 100/157 [01:03<00:36,  1.58it/s, loss=0.00356]

Epoch [3/3], Step [100/157], Loss: 0.0036


Epoch 3/3: 100%|██████████| 157/157 [01:38<00:00,  1.59it/s, loss=0.00322]


In [None]:

# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     for i, batch in enumerate(loop):
#         inputs = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**inputs)
#         # print(outputs)
#         loss = outputs.loss
#         print(loss)
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=loss.item())

#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")

# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     for i, batch in enumerate(loop):
#         batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
#         labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
#         outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=loss.item())

#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}")


# for epoch in range(num_epochs):
#     loop = tqdm(dataloader, leave=True)
#     total_loss = 0.0
#     num_batches = len(dataloader)
    
#     for i, batch in enumerate(loop):
#         batch = {k: v.to(device) for k, v in batch.items()}  # Move the entire batch to the device
#         labels = batch['input_ids'].clone()  # Clone input_ids to use as labels
#         outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
#         loss = outputs.loss
        
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loop.set_description(f'Epoch {epoch+1}/{num_epochs}')
#         loop.set_postfix(loss=total_loss / (i + 1))  # Compute and display the average loss
        
#         if (i + 1) % print_every_n_batches == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{num_batches}], Loss: {total_loss / (i + 1):.4f}")


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tqdm.auto import tqdm
import pandas as pd

# Define a Dataset class for testing on the same sentences data
class TestCSVDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        inputs = self.tokenizer(sentence, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        return {'input_ids': inputs['input_ids'][0], 'attention_mask': inputs['attention_mask'][0]}

# Load the trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the device for testing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained model and move it to the device
model = AutoModelForMaskedLM.from_pretrained("/home/ubuntu/Project_Files/Finetune/Data/trained_model.pth").to(device)

# # Prepare the test sentences
# test_sentences = [
#     "This is a test sentence.",
#     "Another example sentence.",
#     "BERT is a powerful model.",
# ]

csv_file = "/home/ubuntu/Project_Files/Finetune/Data/sentences.csv"
df = pd.read_csv(csv_file)
test_sentences = df["Sentence"][:10].tolist()


# Create a DataLoader for testing
test_dataset = TestCSVDataset(test_sentences, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=1)  # Batch size 1 for one sentence at a time

# Set the model to evaluation mode
model.eval()

# Test the model on the test sentences
for i, batch in enumerate(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Generate predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Get predicted tokens
    predicted_token_ids = torch.argmax(outputs.logits, dim=2)
    
    # Convert token IDs to tokens
    predicted_tokens = tokenizer.decode(predicted_token_ids[0].tolist(), skip_special_tokens=True)
    
    print(f"Test Sentence {i+1}:")
    print("Input Sentence:", test_sentences[i])
    print("Predicted Sentence:", predicted_tokens)
    print()


Test Sentence 1:
Input Sentence: In the PHYHIP, which is a type of gene/protein, there is a noted ppi of the gene/protein KIF15.
Predicted Sentence: in the phyhip, which is a type of gene / protein, there is a noted ppi of the gene / protein kif15.

Test Sentence 2:
Input Sentence: In the GPANK1, which is a type of gene/protein, there is a noted ppi of the gene/protein PNMA1.
Predicted Sentence: in the gpank1, which is a type of gene / protein, there is a noted ppi of the gene / protein pnma1.

Test Sentence 3:
Input Sentence: In the ZRSR2, which is a type of gene/protein, there is a noted ppi of the gene/protein TTC33.
Predicted Sentence: in the zrsr2, which is a type of gene / protein, there is a noted ppi of the gene / protein ttc33.

Test Sentence 4:
Input Sentence: In the NRF1, which is a type of gene/protein, there is a noted ppi of the gene/protein MAN1B1.
Predicted Sentence: in the nrf1, which is a type of gene / protein, there is a noted ppi of the gene / protein man1b1.

Test

In [1]:
import json

# Path to your input file
input_file_path = '/home/ubuntu/Project_Files/Finetune/Data/sentences.csv'

# Path to your output JSON file
output_file_path = '/home/ubuntu/Project_Files/Finetune/Data/sentences_new.json'

# List to hold all sentences as dictionaries
sentences = []

# Open and read the input file
with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Remove leading and trailing whitespace and the surrounding quotes
        clean_line = line.strip().strip('"')
        sentence_dict = {"text": clean_line}
        sentences.append(sentence_dict)

# Write the list of dictionaries to a JSON file
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(sentences, json_file, indent=4, ensure_ascii=False)


In [14]:
# Check if JSON is written correctly
import json

def check_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            json.load(file)
        print("JSON file is valid.")
    except json.JSONDecodeError as e:
        print(f"Error in JSON file: {e}")
        print("JSON file is not valid.")

# Example usage:
json_file_path = "/home/ubuntu/Project_Files/Finetune/Data/json_files/combined_sentences.json"  # Replace with the path to your JSON file
check_json_file(json_file_path)

JSON file is valid.


In [8]:
def fix_json_file(file_path):
    with open(file_path, 'r') as file:
        json_content = file.read()

    # Find the position of the unterminated string
    error_pos = json_content.find("Unterminated string")
    if error_pos == -1:
        print("No 'Unterminated string' error found in the JSON file.")
        return

    line_start = json_content.rfind("\n", 0, error_pos) + 1
    column = error_pos - line_start

    # Extract the problematic line
    error_line = json_content.splitlines()[line_start:line_start + 1][0]

    # Add a closing quotation mark to the string
    corrected_line = error_line[:column] + '"' + error_line[column:]

    # Replace the problematic line in the JSON content
    corrected_json_content = json_content[:line_start] + corrected_line + json_content[line_start + len(error_line):]

    # Write the corrected content back to the file
    with open(file_path, 'w') as file:
        file.write(corrected_json_content)

    print("JSON file has been corrected.")


In [12]:
import json
import os
from tqdm import tqdm

input_file_path = '/home/ubuntu/Project_Files/Finetune/Data/sentences.csv'
output_folder = '/home/ubuntu/Project_Files/Finetune/Data/json_files'
os.makedirs(output_folder, exist_ok=True)

def save_json(sentences, file_path):
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(sentences, json_file, indent=4, ensure_ascii=False)

def process_csv_to_json(input_file_path, output_folder):
    sentences = []
    total_rows_processed = 0
    json_file_index = 1
    output_file_path = os.path.join(output_folder, f'sentences_{json_file_index}.json')

    with open(input_file_path, 'r', encoding='utf-8') as file:
        file_length = sum(1 for line in file)
        file.seek(0)  # Reset file pointer to beginning
        progress_bar = tqdm(total=file_length, desc='Processing CSV', unit='row')

        for line in file:
            clean_line = line.strip().strip('"')
            sentence_dict = {"text": clean_line}
            sentences.append(sentence_dict)
            total_rows_processed += 1
            progress_bar.update(1)  # Update progress bar

            # Save JSON file after every 100,000 rows
            if total_rows_processed % 100000 == 0:
                save_json(sentences, output_file_path)
                # Verify if the saved JSON file is valid
                with open(output_file_path, 'r', encoding='utf-8') as json_file:
                    try:
                        json.load(json_file)
                        print(f"JSON file '{output_file_path}' is valid.")
                    except json.JSONDecodeError as e:
                        print(f"Error in JSON file '{output_file_path}': {e}")

                # Prepare for the next JSON file
                json_file_index += 1
                output_file_path = os.path.join(output_folder, f'sentences_{json_file_index}.json')
                sentences = []

        progress_bar.close()

    if sentences:
        save_json(sentences, output_file_path)
        with open(output_file_path, 'r', encoding='utf-8') as json_file:
            try:
                json.load(json_file)
                print(f"JSON file '{output_file_path}' is valid.")
            except json.JSONDecodeError as e:
                print(f"Error in JSON file '{output_file_path}': {e}")

    print("CSV to JSON conversion completed.")

process_csv_to_json(input_file_path, output_folder)

Processing CSV:   2%|▏         | 195284/8196863 [00:00<00:26, 301041.29row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_1.json' is valid.


Processing CSV:   3%|▎         | 247421/8196863 [00:01<00:39, 203517.12row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_2.json' is valid.


Processing CSV:   4%|▎         | 300001/8196863 [00:01<00:48, 164039.42row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_3.json' is valid.


Processing CSV:   5%|▍         | 400001/8196863 [00:02<00:44, 175440.65row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_4.json' is valid.


Processing CSV:   6%|▌         | 500001/8196863 [00:02<00:40, 188870.92row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_5.json' is valid.


Processing CSV:   7%|▋         | 600001/8196863 [00:02<00:38, 197024.70row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_6.json' is valid.


Processing CSV:   9%|▊         | 700001/8196863 [00:03<00:37, 201722.78row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_7.json' is valid.


Processing CSV:  10%|▉         | 800001/8196863 [00:03<00:36, 205208.41row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_8.json' is valid.


Processing CSV:  11%|█         | 900001/8196863 [00:04<00:35, 206952.06row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_9.json' is valid.


Processing CSV:  12%|█▏        | 1000001/8196863 [00:04<00:34, 208278.92row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_10.json' is valid.


Processing CSV:  13%|█▎        | 1100001/8196863 [00:05<00:33, 209775.50row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_11.json' is valid.


Processing CSV:  15%|█▍        | 1200001/8196863 [00:05<00:33, 210481.63row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_12.json' is valid.


Processing CSV:  16%|█▌        | 1300001/8196863 [00:06<00:32, 210881.47row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_13.json' is valid.


Processing CSV:  17%|█▋        | 1400001/8196863 [00:06<00:32, 212205.76row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_14.json' is valid.


Processing CSV:  18%|█▊        | 1500001/8196863 [00:07<00:31, 213167.12row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_15.json' is valid.


Processing CSV:  20%|█▉        | 1600001/8196863 [00:07<00:30, 214067.28row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_16.json' is valid.


Processing CSV:  21%|██        | 1700001/8196863 [00:08<00:30, 214620.33row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_17.json' is valid.


Processing CSV:  22%|██▏       | 1800001/8196863 [00:08<00:29, 215202.05row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_18.json' is valid.


Processing CSV:  23%|██▎       | 1900001/8196863 [00:09<00:29, 215519.29row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_19.json' is valid.


Processing CSV:  24%|██▍       | 2000001/8196863 [00:09<00:28, 215593.17row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_20.json' is valid.


Processing CSV:  26%|██▌       | 2100001/8196863 [00:09<00:28, 215973.19row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_21.json' is valid.


Processing CSV:  27%|██▋       | 2200001/8196863 [00:10<00:27, 216765.27row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_22.json' is valid.


Processing CSV:  28%|██▊       | 2300001/8196863 [00:10<00:27, 216298.94row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_23.json' is valid.


Processing CSV:  29%|██▉       | 2400001/8196863 [00:11<00:26, 216405.63row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_24.json' is valid.


Processing CSV:  30%|███       | 2500001/8196863 [00:11<00:26, 216111.30row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_25.json' is valid.


Processing CSV:  32%|███▏      | 2600001/8196863 [00:12<00:25, 216109.27row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_26.json' is valid.


Processing CSV:  33%|███▎      | 2700001/8196863 [00:12<00:25, 216853.30row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_27.json' is valid.


Processing CSV:  34%|███▍      | 2800001/8196863 [00:13<00:24, 216865.66row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_28.json' is valid.


Processing CSV:  35%|███▌      | 2900001/8196863 [00:13<00:24, 217864.87row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_29.json' is valid.


Processing CSV:  37%|███▋      | 3000001/8196863 [00:14<00:23, 218474.02row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_30.json' is valid.


Processing CSV:  38%|███▊      | 3100001/8196863 [00:14<00:23, 212841.25row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_31.json' is valid.


Processing CSV:  39%|███▉      | 3200001/8196863 [00:15<00:24, 204987.33row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_32.json' is valid.


Processing CSV:  40%|████      | 3300001/8196863 [00:15<00:24, 200528.32row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_33.json' is valid.


Processing CSV:  41%|████▏     | 3400001/8196863 [00:16<00:24, 194304.36row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_34.json' is valid.


Processing CSV:  43%|████▎     | 3500001/8196863 [00:16<00:24, 194983.25row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_35.json' is valid.


Processing CSV:  44%|████▍     | 3600001/8196863 [00:17<00:23, 192951.17row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_36.json' is valid.


Processing CSV:  45%|████▌     | 3700001/8196863 [00:17<00:23, 193140.17row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_37.json' is valid.


Processing CSV:  46%|████▋     | 3800001/8196863 [00:18<00:22, 191366.96row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_38.json' is valid.


Processing CSV:  48%|████▊     | 3900001/8196863 [00:18<00:23, 186788.22row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_39.json' is valid.


Processing CSV:  49%|████▉     | 4000001/8196863 [00:19<00:22, 189586.53row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_40.json' is valid.


Processing CSV:  50%|█████     | 4100001/8196863 [00:19<00:21, 191648.82row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_41.json' is valid.


Processing CSV:  51%|█████     | 4200001/8196863 [00:20<00:20, 194392.77row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_42.json' is valid.


Processing CSV:  52%|█████▏    | 4300001/8196863 [00:20<00:19, 195138.88row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_43.json' is valid.


Processing CSV:  54%|█████▎    | 4400001/8196863 [00:21<00:19, 196593.04row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_44.json' is valid.


Processing CSV:  55%|█████▍    | 4500001/8196863 [00:21<00:18, 197157.56row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_45.json' is valid.


Processing CSV:  56%|█████▌    | 4600001/8196863 [00:22<00:18, 198391.69row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_46.json' is valid.


Processing CSV:  57%|█████▋    | 4700001/8196863 [00:22<00:17, 197825.80row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_47.json' is valid.


Processing CSV:  59%|█████▊    | 4800001/8196863 [00:23<00:17, 196897.27row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_48.json' is valid.


Processing CSV:  60%|█████▉    | 4900001/8196863 [00:23<00:16, 198251.74row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_49.json' is valid.


Processing CSV:  61%|██████    | 5000001/8196863 [00:24<00:16, 197906.74row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_50.json' is valid.


Processing CSV:  62%|██████▏   | 5100001/8196863 [00:24<00:15, 198093.90row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_51.json' is valid.


Processing CSV:  63%|██████▎   | 5200001/8196863 [00:25<00:15, 198365.10row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_52.json' is valid.


Processing CSV:  65%|██████▍   | 5300001/8196863 [00:25<00:14, 198158.61row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_53.json' is valid.


Processing CSV:  66%|██████▌   | 5400001/8196863 [00:26<00:14, 199125.83row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_54.json' is valid.


Processing CSV:  67%|██████▋   | 5500001/8196863 [00:26<00:13, 200213.35row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_55.json' is valid.


Processing CSV:  68%|██████▊   | 5600001/8196863 [00:27<00:12, 202256.29row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_56.json' is valid.


Processing CSV:  70%|██████▉   | 5700001/8196863 [00:27<00:12, 203875.05row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_57.json' is valid.


Processing CSV:  71%|███████   | 5800001/8196863 [00:28<00:11, 203082.68row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_58.json' is valid.


Processing CSV:  72%|███████▏  | 5900001/8196863 [00:28<00:11, 201753.90row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_59.json' is valid.


Processing CSV:  73%|███████▎  | 6000001/8196863 [00:29<00:11, 198629.02row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_60.json' is valid.


Processing CSV:  74%|███████▍  | 6100001/8196863 [00:29<00:10, 196971.15row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_61.json' is valid.


Processing CSV:  76%|███████▌  | 6200001/8196863 [00:30<00:10, 196097.83row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_62.json' is valid.


Processing CSV:  77%|███████▋  | 6300001/8196863 [00:30<00:09, 192845.45row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_63.json' is valid.


Processing CSV:  78%|███████▊  | 6400001/8196863 [00:31<00:09, 193838.00row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_64.json' is valid.


Processing CSV:  79%|███████▉  | 6500001/8196863 [00:31<00:08, 193934.59row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_65.json' is valid.


Processing CSV:  81%|████████  | 6600001/8196863 [00:32<00:08, 193234.48row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_66.json' is valid.


Processing CSV:  82%|████████▏ | 6700001/8196863 [00:33<00:07, 190617.74row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_67.json' is valid.


Processing CSV:  83%|████████▎ | 6800001/8196863 [00:33<00:07, 193593.24row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_68.json' is valid.


Processing CSV:  84%|████████▍ | 6900001/8196863 [00:34<00:06, 196809.04row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_69.json' is valid.


Processing CSV:  85%|████████▌ | 7000001/8196863 [00:34<00:06, 197919.75row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_70.json' is valid.


Processing CSV:  87%|████████▋ | 7100001/8196863 [00:35<00:05, 198791.24row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_71.json' is valid.


Processing CSV:  88%|████████▊ | 7200001/8196863 [00:35<00:04, 199385.94row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_72.json' is valid.


Processing CSV:  89%|████████▉ | 7300001/8196863 [00:36<00:04, 199625.84row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_73.json' is valid.


Processing CSV:  90%|█████████ | 7400001/8196863 [00:36<00:03, 201195.47row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_74.json' is valid.


Processing CSV:  91%|█████████▏| 7500001/8196863 [00:37<00:03, 201173.64row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_75.json' is valid.


Processing CSV:  93%|█████████▎| 7600001/8196863 [00:37<00:02, 201422.99row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_76.json' is valid.


Processing CSV:  94%|█████████▍| 7700001/8196863 [00:37<00:02, 201394.77row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_77.json' is valid.


Processing CSV:  95%|█████████▌| 7800001/8196863 [00:38<00:01, 201350.16row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_78.json' is valid.


Processing CSV:  96%|█████████▋| 7900001/8196863 [00:38<00:01, 201385.91row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_79.json' is valid.


Processing CSV:  98%|█████████▊| 8000001/8196863 [00:39<00:00, 202496.53row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_80.json' is valid.


Processing CSV: 100%|██████████| 8196863/8196863 [00:40<00:00, 204600.34row/s]

JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_81.json' is valid.





JSON file '/home/ubuntu/Project_Files/Finetune/Data/json_files/sentences_82.json' is valid.
CSV to JSON conversion completed.


In [13]:
# COmbine all json files
import os
import json

input_folder = '/home/ubuntu/Project_Files/Finetune/Data/json_files'
output_combined_file = '/home/ubuntu/Project_Files/Finetune/Data/json_files/combined_sentences.json'

def combine_json_files(input_folder, output_combined_file):
    combined_sentences = []

    # Iterate through all JSON files in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith('.json'):
            file_path = os.path.join(input_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as json_file:
                sentences = json.load(json_file)
                combined_sentences.extend(sentences)

    # Write the combined sentences to a single JSON file
    with open(output_combined_file, 'w', encoding='utf-8') as output_file:
        json.dump(combined_sentences, output_file, indent=4, ensure_ascii=False)

    print(f"All JSON files in {input_folder} have been combined into {output_combined_file}.")

# Call the function to combine JSON files
combine_json_files(input_folder, output_combined_file)


All JSON files in /home/ubuntu/Project_Files/Finetune/Data/json_files have been combined into /home/ubuntu/Project_Files/Finetune/Data/json_files/combined_sentences.json.


In [15]:
import json

def json_summary(file_path):
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    
    num_entries = len(data)
    print(f"Number of entries: {num_entries}")

json_file_path = '/home/ubuntu/Project_Files/Finetune/Data/json_files/combined_sentences.json'  # Replace with the path to your JSON file
json_summary(json_file_path)


Number of entries: 8196863


In [16]:
# read csv file and cont no of rows
import pandas as pd
df = pd.read_csv('/home/ubuntu/Project_Files/Finetune/Data/sentences.csv')
print(df.shape[0])


8196862
