In [None]:
import os
import numpy as np
import zipfile

# Unpack SOURCE4.zip into /content so the later cells can read it from local disk.
with zipfile.ZipFile('/content/drive/MyDrive/Datasets/SOURCE5/SOURCE4.zip', mode='r') as dataset_archive:
    dataset_archive.extractall(path='/content')

In [None]:
!pip install english-words

In [None]:
import pandas as pd

# Load the unigram frequency table; the 'word' and 'count' columns are used below.
# keep_default_na=False: by default pandas parses tokens such as "null", "nan"
# and "NA" as missing values, which would turn those words into float NaN and
# corrupt the word list taken from the 'word' column.
freq_df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/etc/unigram_freq.csv',
    keep_default_na=False,
)
# Highest count first, so slicing the word list later keeps the most frequent words.
freq_df = freq_df.sort_values(by=['count'], axis=0, ascending=False)
freq_words = freq_df['word'].to_numpy()
freq_df.head()

In [None]:
from english_words import get_english_words_set
import numpy as np

# Word set returned by english_words for the 'web2' source.
words_list = list(get_english_words_set(['web2']))

SOURCE2_WORDS_PATH = '/content/drive/MyDrive/Datasets/SOURCE2/words_new.txt'

# Collect the last space-separated field of every data line.
# NOTE(review): presumably the last field is the word transcription - confirm against the file format.
words_txt = []
with open(SOURCE2_WORDS_PATH, 'r') as w_f:
    for line in w_f:
        # Skip '#' comment lines, and blank lines (which would otherwise add '' as a word).
        if line.startswith("#") or not line.strip():
            continue
        words_txt.append(line.strip().split(" ")[-1])

unique_source2_words = np.unique(np.array(words_txt))

# Membership tests go through a set: `w in <numpy array>` rescans the whole
# array for every frequency word, which is quadratic over the two vocabularies.
source2_vocab = set(words_txt)
# Both lists keep the order of freq_words (most frequent first).
freq_without_source2 = [w for w in freq_words if w not in source2_vocab]
freq_source2_intersection = [w for w in freq_words if w in source2_vocab]


In [None]:
# Target vocabulary size, and how many frequency-list words still fit once
# every distinct SOURCE2 word has been accounted for.
model_vocab_size = 50250
freq_new_size = model_vocab_size - len(unique_source2_words)
cropped_freq = freq_without_source2[:freq_new_size]

# Distinct SOURCE2 words with their occurrence counts, ordered from the most
# to the least frequent (ties keep the order returned by np.unique).
unique_source2_words, source2_counts = np.unique(words_txt, return_counts=True)
sorted_pairs = sorted(
    zip(unique_source2_words, source2_counts),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_unique, sorted_counts = (list(column) for column in zip(*sorted_pairs))

# Relative frequency of each word, aligned index-by-index with sorted_unique.
total_counts = sum(sorted_counts)
probabilities = [word_count / total_counts for word_count in sorted_counts]

In [None]:
DATA_DIR = '/content/words'
# Sub-directories of DATA_DIR (the cells below treat each one as a writer).
# os.listdir() returns entries in arbitrary order, so sort them: the
# train/validation/test split is taken by position in this list and must be
# reproducible between runs.
DATA_WRITERS_DIRS = sorted(
    name for name in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, name))
)

In [None]:
# Maps writer directory name -> list of [first field, last field] pairs taken
# from the writer's contents files, restricted to lines whose last field is in
# the model vocabulary.
WRITERS_DATA_IN_VOCAB = {}

# Build the accepted vocabulary once as a set; testing membership against the
# two lists directly rescans them for every single line.
in_vocab_words = set(cropped_freq) | set(sorted_unique)

for d in DATA_WRITERS_DIRS:
  writer_dir = os.path.join(DATA_DIR, d)
  writer_sources_dirs = [s for s in os.listdir(writer_dir) if os.path.isdir(os.path.join(writer_dir, s))]

  writer_valid_data = []

  for source_d in writer_sources_dirs:
    # Each source directory has a sibling "<source>.txt" file inside the writer directory.
    contents_file_path = os.path.join(writer_dir, source_d + '.txt')

    with open(contents_file_path, 'r') as c_f:
      for line in c_f:
        splitted = line.strip().split()

        # A blank line has no fields; indexing it would raise IndexError.
        if not splitted:
          continue

        if splitted[-1] in in_vocab_words:
          writer_valid_data.append([splitted[0], splitted[-1]])

  WRITERS_DATA_IN_VOCAB[d] = writer_valid_data

In [None]:
# Number of in-vocabulary samples kept for each writer, in dictionary order.
print([len(samples) for samples in WRITERS_DATA_IN_VOCAB.values()])

In [None]:
# Preview only the first few samples of writer "0"; printing the full list
# floods the cell output.
print(WRITERS_DATA_IN_VOCAB["0"][:10])

In [None]:
# Total number of in-vocabulary samples across all writers.
valid_count = sum(len(samples) for samples in WRITERS_DATA_IN_VOCAB.values())
# NOTE(review): the second value uses 0.21 while the last one uses 0.20 - confirm
# which fraction is intended; 640 is an unexplained divisor.
print(valid_count, int(valid_count*0.13), int(valid_count*0.21), int(valid_count*0.13)/640, int(valid_count*0.20)/640)

In [None]:
# How many writers go to the test split and to the validation split.
test_writers_n, validation_writers_n = 8, 5

In [None]:
# Count every entry inside every source directory of every writer.
OVERALL_COUNT = 0
for d in DATA_WRITERS_DIRS:
    # writer_dir must be rebuilt for each writer. Without this line the loop
    # reuses whatever value an earlier cell left behind, so every iteration
    # lists the same writer's source directories.
    writer_dir = os.path.join(DATA_DIR, d)
    writer_sources_dirs = [s for s in os.listdir(writer_dir) if os.path.isdir(os.path.join(writer_dir, s))]
    for src_d in writer_sources_dirs:
        OVERALL_COUNT += len(os.listdir(os.path.join(writer_dir, src_d)))

print(f"Overall count: {OVERALL_COUNT}")
print(f"Percentage of data that intersects with model dict: {(valid_count/OVERALL_COUNT) * 100.0:.2f}%")

In [None]:
import random

# Writer-level split: the first `test_writers_n` writers go to test, the next
# `validation_writers_n` to validation, and all remaining writers to train.
# The validation slice must start at `test_writers_n`; starting it at 0 would
# put every test writer into the validation set as well.
writers_in_order = list(WRITERS_DATA_IN_VOCAB)
validation_start = test_writers_n
train_start = test_writers_n + validation_writers_n

WRITERS_TEST_DATA = {
    w: [[fp, transcription] for fp, transcription in WRITERS_DATA_IN_VOCAB[w]]
    for w in writers_in_order[:validation_start]
}

WRITERS_VALIDATION_DATA = {
    w: [[fp, transcription] for fp, transcription in WRITERS_DATA_IN_VOCAB[w]]
    for w in writers_in_order[validation_start:train_start]
}

WRITERS_TRAIN_DATA = {
    w: [[fp, transcription] for fp, transcription in WRITERS_DATA_IN_VOCAB[w]]
    for w in writers_in_order[train_start:]
}

In [None]:
from PIL import Image
import time
import shutil

def generate_filename(frmt='jpg'):
    """Return a file name of the form "<time.time()>_<3-digit random int>.<frmt>".

    Args:
        frmt: File extension to append, without the leading dot.

    Returns:
        The generated file name as a string.
    """
    timestamp = str(time.time())
    random_suffix = str(random.randint(100, 999))
    return '.'.join(['_'.join([timestamp, random_suffix]), frmt])

# Output image directory for each split.
out_train_dir = '/content/SOURCE4/train'
out_validation_dir = '/content/SOURCE4/validation'
out_test_dir = '/content/SOURCE4/test'

# CSV index file written for each split.
train_csv = '/content/SOURCE4/train.csv'
validation_csv = '/content/SOURCE4/validation.csv'
test_csv = '/content/SOURCE4/test.csv'

# Create the output directories; already-existing ones are left untouched.
for split_dir in (out_train_dir, out_validation_dir, out_test_dir):
    os.makedirs(split_dir, exist_ok=True)

# Module-level accumulators that process_images() appends to.
filenames_list, transcriptions_list = [], []

def process_images(writers_data, out_dir):
  """Convert every listed image to an RGB JPEG in `out_dir` under a generated name.

  For each converted image, the generated file name and its transcription are
  appended to the module-level `filenames_list` and `transcriptions_list`.

  Args:
    writers_data: Mapping of writer directory name to a list of
      (file path, transcription) pairs.
    out_dir: Directory the JPEG files are written to.
  """
  for w_dir, samples in writers_data.items():
    for fp, transcription in samples:
      # NOTE(review): assumes `fp` is a path relative to the writer directory - confirm.
      src_path = os.path.join(DATA_DIR, w_dir, fp)

      with Image.open(src_path) as img:
          out_filename = generate_filename()
          out_path = os.path.join(out_dir, out_filename)

          # On a name collision regenerate BOTH the name and the path, so the
          # name recorded below always matches the file actually written.
          while os.path.exists(out_path):
              out_filename = generate_filename()
              out_path = os.path.join(out_dir, out_filename)

          img.convert('RGB').save(out_path, 'JPEG')

          filenames_list.append(out_filename)
          transcriptions_list.append(transcription)

def save_to_csv_and_archive(filenames_list, transcriptions_list, csv_path, archive_path, out_dir):
    """Write the filename/transcription index as CSV and zip the image directory.

    Args:
        filenames_list: File names, one per sample.
        transcriptions_list: Transcriptions aligned with `filenames_list`.
        csv_path: Destination of the CSV file (columns: filename, transcription).
        archive_path: Archive path without extension; ".zip" is appended by shutil.
        out_dir: Directory whose contents are archived.
    """
    index_frame = pd.DataFrame(
        data={
            'filename': filenames_list,
            'transcription': transcriptions_list,
        }
    )
    index_frame.to_csv(csv_path, index=False)

    shutil.make_archive(base_name=archive_path, format="zip", root_dir=out_dir)


# Export the splits one after another (test, validation, train). The
# module-level accumulators are rebound to fresh lists before each split so
# that every CSV lists only the files of its own split.
for split_data, split_dir, split_csv, split_archive in (
    (WRITERS_TEST_DATA, out_test_dir, test_csv, "/content/SOURCE4/test"),
    (WRITERS_VALIDATION_DATA, out_validation_dir, validation_csv, "/content/SOURCE4/validation"),
    (WRITERS_TRAIN_DATA, out_train_dir, train_csv, "/content/SOURCE4/train"),
):
    filenames_list = []
    transcriptions_list = []

    process_images(split_data, split_dir)
    save_to_csv_and_archive(filenames_list, transcriptions_list, split_csv, split_archive, split_dir)