In [None]:
import json

# Path to the JSON file
file_path = '/content/modelag_trunc.json'

# Read the raw file text. JSON `\uXXXX` escapes are deliberately left untouched:
# json.loads decodes them into the real characters. Rewriting them textually
# (e.g. `\u` -> `U+`) replaces every non-ASCII character with literal
# "U+0412"-style text, which distorts every length and token statistic computed
# from `data`, and can turn an escaped backslash followed by "u" into an
# invalid JSON escape.
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Kept under its original name for any cell that still refers to it; the text
# is no longer modified before parsing.
processed_content = content

# Parse the text. `data` is only defined when parsing succeeds, so later cells
# that use `data` require this step to have completed without an error.
try:
    data = json.loads(processed_content)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    # Handle invalid JSON here (e.g., further cleaning or logging)
else:
    # Write a re-indented copy of the parsed data. ensure_ascii is left at its
    # default, so non-ASCII characters are written back as \uXXXX escapes and
    # the output file stays plain ASCII.
    with open('processed_test.json', 'w', encoding='utf-8') as new_file:
        json.dump(data, new_file, indent=4)

    print("Processed file saved as 'processed_test.json'")

Processed file saved as 'processed_test.json'


In [None]:
# --- Basic overview ---------------------------------------------------------
# Total number of records.
num_entries = len(data)

# Distinct values of each record field.
unique_hypotheses = {record['hyp'] for record in data}
unique_targets = {record['tgt'] for record in data}
unique_sources = {record['src'] for record in data}
unique_refs = {record['ref'] for record in data}
unique_tasks = {record['task'] for record in data}
unique_models = {record['model'] for record in data}

# Number of distinct values per field, taken from the sets built above.
num_hypotheses = len(unique_hypotheses)
num_targets = len(unique_targets)
num_sources = len(unique_sources)
num_refs = len(unique_refs)
num_tasks = len(unique_tasks)
num_models = len(unique_models)

# Text analysis (currently disabled)
# src_lengths = [len(entry['src'].split()) for entry in data]
# hyp_lengths = [len(entry['hyp'].split()) for entry in data]

print(num_entries, num_hypotheses, num_targets, num_sources, num_refs, num_tasks, num_models)

# One arbitrary sample element from each set (None when the set is empty).
# Sets are unordered, so which element comes back is not guaranteed.
(
    head_hypotheses,
    head_targets,
    head_sources,
    head_refs,
    head_tasks,
    head_models,
) = (
    next(iter(values), None)
    for values in (
        unique_hypotheses,
        unique_targets,
        unique_sources,
        unique_refs,
        unique_tasks,
        unique_models,
    )
)

print(head_hypotheses, head_targets, head_sources, head_refs, head_tasks, head_models)

3285 3049 2915 2978 1 1 1
What does your planet look like? Please answer! U+0412U+044b U+0434U+043eU+043bU+0436U+043dU+044b U+0438U+0445 U+0443U+0432U+0430U+0436U+0430U+0442U+044c. either MT 


In [None]:
import numpy as np

# Summary statistics over per-text word counts
def calculate_text_statistics(texts):
    """Summarise how many whitespace-separated words each text contains.

    Args:
        texts: Iterable of strings. Each string is split with ``str.split()``
            (any run of whitespace) and the number of pieces is its length.

    Returns:
        A 5-tuple ``(mean, median, 5th percentile, 95th percentile,
        standard deviation)`` of those lengths, each computed with NumPy.
    """
    counts = [len(t.split()) for t in texts]
    mean_len, median_len = np.mean(counts), np.median(counts)
    # Lower and upper percentile bounds, evaluated one after the other.
    low, high = (np.percentile(counts, q) for q in (5, 95))
    return mean_len, median_len, low, high, np.std(counts)

# Calculate word-count statistics for each category
stats_hypotheses = calculate_text_statistics(unique_hypotheses)
stats_targets = calculate_text_statistics(unique_targets)
stats_refs = calculate_text_statistics(unique_refs)
# Task and model labels are strings, so the sets are passed as they are.
# Calling " ".join() on a string inserts a space between every character,
# which would make the reported "word count" equal to the character count.
stats_tasks = calculate_text_statistics(unique_tasks)
stats_models = calculate_text_statistics(unique_models)

# Print the statistics as (mean, median, p5, p95, std) tuples
print("Hypotheses:", stats_hypotheses)
print("Targets:", stats_targets)
print("References:", stats_refs)
print("Tasks:", stats_tasks)
print("Models:", stats_models)

Hypotheses: (6.2226959658904555, 6.0, 3.0, 11.0, 3.1146133752455123)
Targets: (6.406518010291595, 6.0, 3.0, 11.0, 3.066069282184866)
References: (1.0, 1.0, 1.0, 1.0, 0.0)
Tasks: (2.0, 2.0, 2.0, 2.0, 0.0)
Models: (0.0, 0.0, 0.0, 0.0, 0.0)


In [None]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Distinct hypothesis strings across all records
# ('data' is assumed to be a list of dictionaries).
unique_hypotheses = {record['hyp'] for record in data}

# Repeat for unique_targets, unique_sources, etc.

# Normalise a text and split it into word tokens
def tokenize(text):
    """Lower-case ``text``, strip digits and punctuation, then tokenize it.

    Digit runs are removed first; afterwards every character that is neither
    a word character nor whitespace is removed. The cleaned string is passed
    to NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``word_tokenize`` for the cleaned text.
    """
    without_digits = re.sub(r'\d+', '', text.lower())
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    return word_tokenize(without_punct)

# One token list per distinct text
tokens_hypotheses = list(map(tokenize, unique_hypotheses))
# tokens_srcs = [tokenize(text) for text in unique_sources]
tokens_targets = list(map(tokenize, unique_targets))

# Word frequencies accumulated over all token lists of a category
word_counts_hypotheses = Counter()
for token_list in tokens_hypotheses:
    word_counts_hypotheses.update(token_list)
# word_counts_srcs = Counter(token for tokens in tokens_srcs for token in tokens)
word_counts_targets = Counter()
for token_list in tokens_targets:
    word_counts_targets.update(token_list)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Print the complete frequency tables: hypotheses first, then targets
for word_counts in (word_counts_hypotheses, word_counts_targets):
    print(word_counts)



In [None]:
import pandas as pd
from scipy.stats import pearsonr

# Build a frame from the first 1000 records and add the character length
# (len of the string) of each source and hypothesis as new columns
data_df = pd.DataFrame(data[0:1000]).assign(
    src_length=lambda frame: frame['src'].apply(len),
    hyp_length=lambda frame: frame['hyp'].apply(len),
)

# Pearson correlation between source length and hypothesis length
correlation, p_value = pearsonr(data_df.src_length, data_df.hyp_length)


# Report the coefficient and its p-value
print(f"Correlation coefficient: {correlation}")
print(f"P-value: {p_value}")

Correlation coefficient: 0.9120939581001378
P-value: 0.0


In [None]:
# Display the current contents of `unique_models`; when the cells are run in
# order this is the set of distinct 'model' field values from the records.
unique_models

{''}

In [None]:
import sys
!{sys.executable} -m pip install sentencepiece
!{sys.executable} -m pip install protobuf==3.20.0
import sentencepiece

from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase", cache_dir="new_cache_directory")

# Hugging Face identifiers of the models to fetch.
# Note: this rebinds `unique_models` to a hard-coded set.
unique_models = set((
    'facebook/nllb-200-distilled-600M',
    'ltg/flan-t5-definition-en-base',
    'tuner007/pegasus_paraphrase',
))

# Registries of loaded objects, keyed by model identifier
models, tokenizers = {}, {}

# Fetch the tokenizer and the model for every identifier and register both.
# `unique_models` is a set, so the iteration order is not fixed.
for model_name in unique_models:
    print(f"Downloading and loading model and tokenizer for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    models[model_name], tokenizers[model_name] = model, tokenizer

print("All models and tokenizers are successfully downloaded and loaded.")

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting protobuf==3.20.0
  Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelangu

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Downloading and loading model and tokenizer for ltg/flan-t5-definition-en-base


tokenizer_config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading and loading model and tokenizer for tuner007/pegasus_paraphrase


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusModel were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All models and tokenizers are successfully downloaded and loaded.


In [None]:
!pip install transformers
!pip install cpm_kernels
import tensorflow as tf
import keras

from transformers import pipeline
import keras
import transformers
# Define a generic input text for testing
test_input = "This is a test sentence."

unique_models = {
    'facebook/nllb-200-distilled-600M',
    'ltg/flan-t5-definition-en-base',
    'tuner007/pegasus_paraphrase'
}

!pip install torch torchvision
!brew install cmake pkg-config
transformers.utils.GENERATION_CONFIG_NAME = None
transformers.utils.cached_file = None
transformers.utils.download_url = None
transformers.utils.extract_commit_hash = None
# model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half()

# Perform a smoke test for each model
for model_name in unique_models:
    print(f"Testing model: {model_name}")

    # Select the appropriate task based on the model
    if "nllb-200-distilled" in model_name:
        task = "translation_en_to_fr"
    elif "flan-t5-definition-en-base" in model_name:
        task = "text2text-generation"
    elif "pegasus_paraphrase" in model_name:
        task = "summarization"
    else:
        print(f"Unknown task for model {model_name}")
        continue

    # Initialize the pipeline
    model_pipeline = pipeline(task, model=model_name, tokenizer=tokenizers[model_name])

    # Perform the task
    try:
        result = model_pipeline(test_input)
        print(f"Result for {model_name}: {result}")
    except Exception as e:
        print(f"Error testing {model_name}: {e}")

Collecting cpm_kernels
  Downloading cpm_kernels-1.0.11-py3-none-any.whl (416 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.6/416.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cpm_kernels
Successfully installed cpm_kernels-1.0.11
/bin/bash: line 1: brew: command not found
Testing model: facebook/nllb-200-distilled-600M


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Result for facebook/nllb-200-distilled-600M: [{'translation_text': 'fr This is a test sentence.'}]
Testing model: ltg/flan-t5-definition-en-base




Result for ltg/flan-t5-definition-en-base: [{'generated_text': 'A sentence that is a test of the meaning of a word or phrase .'}]
Testing model: tuner007/pegasus_paraphrase


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Your max_length is set to 60, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Result for tuner007/pegasus_paraphrase: [{'summary_text': 'This is a test.'}]
