In [None]:
!pip install pytesseract
!pip install google-colab  # Only needed if running in Google Colab
!pip install -q gradio

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [None]:
import numpy as np
from PIL import Image
import os
import zipfile

# Image preprocessing
def preprocess_image(image):
    """Turn an image into a flat, normalised grayscale feature vector.

    Args:
        image: Object exposing ``convert`` and ``resize`` the way a PIL image
            does (the callers in this cell pass PIL images).

    Returns:
        A 1-D NumPy array holding the 28x28 grayscale pixels divided by 255.0.
    """
    thumbnail = image.convert('L').resize((28, 28))
    scaled_pixels = np.array(thumbnail) / 255.0
    return scaled_pixels.flatten()

# Character segmentation
def segment_characters(image, white_threshold=255):
    """Split an image into character crops at fully blank pixel columns.

    A column counts as blank when every pixel in it is at least
    ``white_threshold`` after conversion to 8-bit grayscale. Each maximal run
    of non-blank columns is cropped over the full image height and passed to
    ``preprocess_image``.

    Args:
        image: PIL image in any mode.
        white_threshold: Minimum grayscale value treated as background. The
            default of 255 keeps the original "pure white only" rule; a lower
            value tolerates near-white noise such as JPEG artefacts.

    Returns:
        List of preprocessed character vectors in left-to-right order; empty
        when every column is blank.
    """
    # Work on a single-channel copy. For multi-band images (e.g. RGB JPEGs)
    # pixel access yields tuples, so a direct comparison with 255 is never
    # true and no column would ever be recognised as blank.
    gray = image.convert('L')
    width, height = gray.size

    # The array view of a PIL image is indexed [row, column], so reducing
    # over axis 0 gives one flag per pixel column.
    blank_columns = (np.asarray(gray) >= white_threshold).all(axis=0)

    characters = []
    start_x = 0
    for x in range(width):
        if blank_columns[x]:
            # Close the current run, if any, just before this blank column.
            if start_x < x:
                character_image = gray.crop((start_x, 0, x, height))
                characters.append(preprocess_image(character_image))
            start_x = x + 1

    # Trailing run that reaches the right edge of the image.
    if start_x < width:
        character_image = gray.crop((start_x, 0, width, height))
        characters.append(preprocess_image(character_image))

    return characters

# Character recognition model
class CharacterRecognitionModel:
    """Single-layer softmax classifier trained with plain gradient descent.

    Attributes are ``weights`` of shape ``(input_dim, num_classes)`` and
    ``bias`` of shape ``(num_classes,)``.
    """

    def __init__(self, num_classes, input_dim=784):
        """Create a model with random normal weights and a zero bias.

        Args:
            num_classes: Number of output classes.
            input_dim: Length of each input feature vector. Defaults to 784,
                the size of a flattened 28x28 image.
        """
        self.num_classes = num_classes
        self.input_dim = input_dim
        self.weights = np.random.randn(input_dim, num_classes)
        self.bias = np.zeros(num_classes)

    def forward(self, x):
        """Return the raw class scores (logits) for one sample or a batch."""
        return np.dot(x, self.weights) + self.bias

    def backward(self, x, y, lr):
        """Apply one gradient-descent step on the softmax cross-entropy loss.

        Args:
            x: One feature vector ``(input_dim,)`` or a batch
                ``(n, input_dim)``.
            y: One-hot target(s) with the matching leading shape.
            lr: Learning rate.

        For a batch the gradient is averaged over the samples, so a single
        sample yields exactly the per-sample update.
        """
        x = np.atleast_2d(x)
        y = np.atleast_2d(y)
        probs = self.softmax(self.forward(x))
        # Gradient of cross-entropy w.r.t. the logits is (probs - target).
        delta = (probs - y) / x.shape[0]
        self.weights -= lr * np.dot(x.T, delta)
        self.bias -= lr * delta.sum(axis=0)

    def softmax(self, x):
        """Numerically stable softmax over the last axis.

        Normalising over the last axis makes every row of a batch sum to 1;
        1-D input behaves as before.
        """
        x = np.asarray(x, dtype=float)
        # Subtracting the row maximum avoids overflow in exp().
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / exp_x.sum(axis=-1, keepdims=True)

# Dataset loading
def load_dataset_from_zip(zip_path, extract_to='/tmp/extracted_dataset'):
    """Extract a zipped image set and return its segmented characters.

    The archive is unpacked into ``extract_to``. Every file named
    ``Invoice_*.jpg`` in the ``dataset`` sub-directory is opened and passed
    through ``segment_characters``.

    Args:
        zip_path: Path of the zip archive.
        extract_to: Directory the archive is extracted into; created when
            missing.

    Returns:
        NumPy array built from all character vectors of all matching images
        (empty when nothing matched).

    Raises:
        FileNotFoundError: If ``zip_path`` or the extracted ``dataset``
            directory does not exist.
    """
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    dataset_path = os.path.join(extract_to, 'dataset')

    dataset = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order reproducible between runs and machines.
    for image_file in sorted(os.listdir(dataset_path)):
        if image_file.startswith('Invoice_') and image_file.endswith('.jpg'):
            image_path = os.path.join(dataset_path, image_file)
            # The context manager releases the file handle as soon as the
            # image has been segmented.
            with Image.open(image_path) as image:
                dataset.extend(segment_characters(image))

    return np.array(dataset)

# Training loop
def train(model, dataset, labels, num_epochs, lr):
    """Train ``model`` one sample at a time and print the mean loss per epoch.

    Args:
        model: Object with ``num_classes``, ``forward(x)`` returning logits
            and ``backward(x, y, lr)`` performing one update step.
        dataset: Sequence of feature vectors.
        labels: Integer class index for each sample in ``dataset``.
        num_epochs: Number of passes over ``dataset``.
        lr: Learning rate forwarded to ``model.backward``.

    Returns:
        None. Prints a message and returns immediately when ``dataset`` is
        empty.
    """
    if len(dataset) == 0:
        print("No data to train on. Exiting.")
        return

    # One-hot rows, built once instead of once per sample.
    one_hot = np.eye(model.num_classes)

    for epoch in range(num_epochs):
        epoch_loss = 0.0

        for i in range(len(dataset)):
            x = dataset[i]
            model.backward(x, one_hot[labels[i]], lr)

            # Report the softmax cross-entropy, i.e. the quantity the update
            # step minimises. Comparing raw logits with the one-hot target by
            # squared error is unrelated to that objective and grows as the
            # logits grow.
            logits = model.forward(x)
            shifted = logits - np.max(logits)
            log_probs = shifted - np.log(np.sum(np.exp(shifted)))
            epoch_loss -= log_probs[labels[i]]

        epoch_loss /= len(dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# Evaluation
def evaluate(model, dataset, labels):
    """Print the fraction of samples whose arg-max prediction equals its label.

    Args:
        model: Object whose ``forward(x)`` returns one score per class.
        dataset: Sequence of samples.
        labels: Expected class index for each sample.

    Returns:
        None. An empty ``dataset`` prints a notice instead of an accuracy.
    """
    total = len(dataset)
    if total == 0:
        print("No data to evaluate. Exiting.")
        return

    hits = sum(
        1
        for idx, sample in enumerate(dataset)
        if np.argmax(model.forward(sample)) == labels[idx]
    )

    print(f"Accuracy: {hits / total:.4f}")

# OCR pipeline
def ocr_pipeline(image_path, model):
    """Run segmentation and classification on one image file.

    Args:
        image_path: Path of the image to read.
        model: Object whose ``forward(x)`` returns one score per class.

    Returns:
        String made of the predicted class index of every segmented
        character, concatenated in segmentation order.
    """
    # Segment inside the context manager so the underlying file handle is
    # closed as soon as the pixel data has been consumed.
    with Image.open(image_path) as image:
        characters = segment_characters(image)

    return ''.join(
        str(np.argmax(model.forward(char_image))) for char_image in characters
    )




# Main function
def main():
    """Train the softmax classifier on the zipped dataset and OCR an upload.

    Steps: load character vectors from ``dataset.zip``, train and evaluate
    the model, ask for a test image through the Colab upload widget, and
    print the predicted text. Each step that cannot proceed prints a message
    and returns.
    """
    zip_path = 'dataset.zip'  # Ensure the path is correct
    num_classes = 10  # Assuming digits 0-9 for OCR
    num_epochs = 15000
    lr = 0.01

    # Fail with a readable message instead of a FileNotFoundError traceback.
    if not os.path.isfile(zip_path):
        print(f"Dataset archive '{zip_path}' not found. Exiting.")
        return

    dataset = load_dataset_from_zip(zip_path)

    if len(dataset) == 0:
        print("No images found in the dataset. Exiting.")
        return

    # NOTE(review): the labels are drawn at random, so the model cannot learn
    # a meaningful character mapping; replace with real annotations.
    labels = np.random.randint(0, num_classes, len(dataset))  # Random labels for training

    model = CharacterRecognitionModel(num_classes)

    train(model, dataset, labels, num_epochs, lr)

    evaluate(model, dataset, labels)

    # Upload the test image (Colab-only module, hence the local import).
    from google.colab import files
    uploaded = files.upload()

    # Presumably the mapping is empty when the upload dialog is cancelled;
    # indexing its first key would then raise IndexError.
    if not uploaded:
        print("No test image uploaded. Exiting.")
        return
    test_image_name = next(iter(uploaded))  # Get the uploaded file name

    predicted_text = ocr_pipeline(test_image_name, model)
    print(f"Predicted text: {predicted_text}")



# Entry point: run the pipeline only when this code is executed as the main
# program rather than imported.
if __name__ == '__main__':
    main()


Streaming output truncated to the last 5000 lines.
Epoch [10002/15000], Loss: 165.0646
Epoch [10003/15000], Loss: 165.0742
Epoch [10004/15000], Loss: 165.0838
Epoch [10005/15000], Loss: 165.0934
Epoch [10006/15000], Loss: 165.1030
Epoch [10007/15000], Loss: 165.1126
Epoch [10008/15000], Loss: 165.1222
Epoch [10009/15000], Loss: 165.1318
Epoch [10010/15000], Loss: 165.1414
Epoch [10011/15000], Loss: 165.1511
Epoch [10012/15000], Loss: 165.1607
Epoch [10013/15000], Loss: 165.1703
Epoch [10014/15000], Loss: 165.1799
Epoch [10015/15000], Loss: 165.1896
Epoch [10016/15000], Loss: 165.1992
Epoch [10017/15000], Loss: 165.2088
Epoch [10018/15000], Loss: 165.2185
Epoch [10019/15000], Loss: 165.2281
Epoch [10020/15000], Loss: 165.2377
Epoch [10021/15000], Loss: 165.2474
Epoch [10022/15000], Loss: 165.2570
Epoch [10023/15000], Loss: 165.2667
Epoch [10024/15000], Loss: 165.2763
Epoch [10025/15000], Loss: 165.2860
Epoch [10026/15000], Loss: 165.2956
Epoch [10027/15000], Loss: 165.3053

In [None]:
import numpy as np
from PIL import Image
import os
import zipfile
import tensorflow as tf
from tensorflow.keras import layers
import pytesseract

# Image preprocessing
def preprocess_image(image):
    """Convert an image into a normalised single-channel array.

    Args:
        image: Object exposing ``convert`` and ``resize`` the way a PIL image
            does.

    Returns:
        NumPy array of the 800x800 grayscale pixels divided by 255.0, with a
        trailing channel axis of length 1 appended.
    """
    # 800x800 is the fixed input size; adjust it to suit the invoice images.
    grayscale = image.convert('L').resize((800, 800))
    normalised = np.array(grayscale) / 255.0
    return normalised[..., np.newaxis]

# Extract text from image using Tesseract OCR
def extract_text(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open through a context manager so the file handle is released once OCR
    # has finished instead of lingering until garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

# CNN model for text recognition
def create_model():
    """Build and compile the binary-classification CNN.

    The network takes 800x800 single-channel images, applies three
    convolution layers (the first two each followed by 2x2 max pooling),
    flattens, and ends in a 64-unit ReLU layer and one sigmoid unit.

    Returns:
        Keras ``Sequential`` model compiled with the Adam optimiser, binary
        cross-entropy loss and an accuracy metric.
    """
    feature_layers = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(800, 800, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
    ]
    classifier_layers = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]

    cnn = tf.keras.Sequential(feature_layers + classifier_layers)
    cnn.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return cnn

# Dataset loading
def load_dataset_from_zip(zip_path, extract_to='/tmp/extracted_dataset'):
    """Extract a zipped image set and return preprocessed images with labels.

    The archive is unpacked into ``extract_to``. Every file named
    ``Invoice_*.jpg`` in the ``dataset`` sub-directory is preprocessed and
    given the label 1.

    Args:
        zip_path: Path of the zip archive.
        extract_to: Directory the archive is extracted into; created when
            missing.

    Returns:
        Tuple ``(dataset, labels)`` of NumPy arrays with one entry per
        matching image (both empty when nothing matched).

    Raises:
        FileNotFoundError: If ``zip_path`` or the extracted ``dataset``
            directory does not exist.
    """
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    dataset_path = os.path.join(extract_to, 'dataset')

    dataset = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order reproducible between runs and machines.
    for image_file in sorted(os.listdir(dataset_path)):
        if image_file.startswith('Invoice_') and image_file.endswith('.jpg'):
            image_path = os.path.join(dataset_path, image_file)
            # preprocess_image copies the pixels into a NumPy array, so the
            # file can be closed right after it returns.
            with Image.open(image_path) as image:
                dataset.append(preprocess_image(image))
            labels.append(1)  # Assuming all images in the dataset are invoices

    return np.array(dataset), np.array(labels)

# Training
def train(model, dataset, labels, num_epochs):
    """Fit ``model`` on the dataset, or report that there is nothing to fit.

    Args:
        model: Object exposing ``fit(x, y, epochs=...)``.
        dataset: Training inputs.
        labels: Training targets.
        num_epochs: Value passed to ``fit`` as ``epochs``.

    Returns:
        None.
    """
    if len(dataset) > 0:
        model.fit(dataset, labels, epochs=num_epochs)
    else:
        print("No data to train on. Exiting.")

# Journaling
def journaling(text, output_file):
    """Append ``text`` followed by a newline to ``output_file``.

    Args:
        text: Entry to record.
        output_file: Path of the journal file; created when missing.
    """
    # OCR output routinely contains non-ASCII characters (currency symbols,
    # accented names). An explicit UTF-8 encoding keeps the write from
    # depending on the platform's default encoding.
    with open(output_file, 'a', encoding='utf-8') as file:
        file.write(text + '\n')

# Main function
def main():
    """Train the CNN on the zipped invoices, then OCR and journal an upload.

    Steps: load images from ``dataset.zip``, build and fit the model, ask for
    a test invoice through the Colab upload widget, extract its text with
    Tesseract and append it to ``journal.txt``. Each step that cannot proceed
    prints a message and returns.
    """
    zip_path = 'dataset.zip'  # Ensure the path is correct
    num_epochs = 10
    output_file = 'journal.txt'

    # A missing archive otherwise surfaces as an unhandled FileNotFoundError.
    if not os.path.isfile(zip_path):
        print(f"Dataset archive '{zip_path}' not found. Exiting.")
        return

    dataset, labels = load_dataset_from_zip(zip_path)

    if len(dataset) == 0:
        print("No images found in the dataset. Exiting.")
        return

    model = create_model()

    train(model, dataset, labels, num_epochs)

    # Upload the test invoice image (Colab-only module, hence the local import).
    from google.colab import files
    uploaded = files.upload()

    # Presumably the mapping is empty when the upload dialog is cancelled;
    # indexing its first key would then raise IndexError.
    if not uploaded:
        print("No test image uploaded. Exiting.")
        return
    test_image_name = next(iter(uploaded))  # Get the uploaded file name

    # Extract text from the test invoice image
    invoice_text = extract_text(test_image_name)

    # Perform journaling
    journaling(invoice_text, output_file)

    print("Journaling completed. Journal entries saved to", output_file)



# Entry point: run the pipeline only when this code is executed as the main
# program rather than imported.
if __name__ == '__main__':
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.zip'

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
import os

# Load the raw journal text file
def load_raw_journal(file_path):
    """Read a text file and return its content split on newline characters.

    Args:
        file_path: Path of the journal file.

    Returns:
        List of strings, one per ``'\\n'``-separated segment. Blank lines and
        the segment after a trailing newline appear as empty strings.
    """
    with open(file_path, 'r') as handle:
        return handle.read().split('\n')

# Create labeled data (dummy labels for illustration)
def create_labeled_data(raw_texts):
    """Pair every raw text with the same placeholder label string.

    The labels are dummy values for illustration only.

    Args:
        raw_texts: Iterable of text entries.

    Returns:
        Tuple ``(labeled_data, labels)``: a list copy of the texts and a list
        of equal length filled with the placeholder label.
    """
    placeholder = "Invoice Number, Date, Customer Name, Items, Total"
    labeled_data = list(raw_texts)
    labels = [placeholder] * len(labeled_data)
    return labeled_data, labels

# Preprocess text data
def preprocess_texts(texts):
    """Tokenise texts and pad them into one integer matrix.

    Args:
        texts: List of strings to fit the tokenizer on and to encode.

    Returns:
        Tuple ``(data, word_index, tokenizer)``: the post-padded sequences,
        the tokenizer's ``word_index`` mapping, and the fitted tokenizer.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    padded = pad_sequences(tokenizer.texts_to_sequences(texts), padding='post')
    return padded, tokenizer.word_index, tokenizer

# Build the RNN model
def build_rnn_model(vocab_size, embedding_dim, input_length):
    """Build and compile the bidirectional LSTM model.

    Args:
        vocab_size: Size of the embedding vocabulary and of the softmax
            output layer.
        embedding_dim: Width of the embedding vectors.
        input_length: Value passed to the embedding layer as
            ``input_length``.

    Returns:
        Keras ``Sequential`` model compiled with the Adam optimiser, sparse
        categorical cross-entropy loss and an accuracy metric.
    """
    encoder_layers = [
        Embedding(vocab_size, embedding_dim, input_length=input_length),
        Bidirectional(LSTM(128, return_sequences=True)),
        Bidirectional(LSTM(128)),
    ]
    head_layers = [
        Dense(64, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ]

    rnn = Sequential(encoder_layers + head_layers)
    rnn.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return rnn

# Function to organize and extract relevant data
def organize_data(model, texts, word_index, tokenizer):
    """Predict a token for every text and map it back to its word.

    Args:
        model: Trained model exposing ``input_shape`` and ``predict``.
        texts: Iterable of raw text entries.
        word_index: Word-to-id mapping produced by the tokenizer.
        tokenizer: Fitted tokenizer used to encode each text.

    Returns:
        List with one string per input text: the words for the arg-max class
        of each prediction row, joined by spaces. Ids without a word (such as
        the padding id 0) become empty strings.
    """
    # word_index maps word -> id, so looking an id up in it never matches.
    # Invert it once to translate predicted ids back into words.
    index_to_word = {index: word for word, index in word_index.items()}
    max_len = model.input_shape[1]

    organized_data = []
    for text in texts:
        sequence = tokenizer.texts_to_sequences([text])
        # Pad on the same side ('post') as the training data was padded.
        sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
        predictions = model.predict(sequence)
        predicted_labels = [int(np.argmax(pred)) for pred in predictions]
        organized_data.append(
            " ".join(index_to_word.get(index, '') for index in predicted_labels)
        )
    return organized_data

# Main function
def main():
    """Train the RNN on the raw journal and write the organised entries.

    Reads ``raw_journal.txt``, attaches placeholder labels, tokenises the
    texts, fits the model, runs ``organize_data`` over the raw entries and
    writes one result per line to ``processed_journal.txt``.
    """
    # File locations and hyper-parameters.
    raw_journal_path = 'raw_journal.txt'
    processed_journal_path = 'processed_journal.txt'
    embedding_dim, num_epochs, batch_size = 100, 10, 32

    # Load and encode the journal entries.
    raw_texts = load_raw_journal(raw_journal_path)
    labeled_data, labels = create_labeled_data(raw_texts)
    data, word_index, tokenizer = preprocess_texts(labeled_data)

    # Index 0 is reserved for padding, hence the +1 on the vocabulary size.
    model = build_rnn_model(len(word_index) + 1, embedding_dim, data.shape[1])

    # Convert labels to sequences for training.
    # NOTE(review): the targets are padded token sequences while the model
    # emits one softmax vector per sample; confirm these shapes are meant to
    # be compatible.
    label_data = pad_sequences(tokenizer.texts_to_sequences(labels), padding='post')

    model.fit(data, label_data, epochs=num_epochs, batch_size=batch_size)

    organized_texts = organize_data(model, raw_texts, word_index, tokenizer)

    with open(processed_journal_path, 'w') as file:
        file.writelines(text + '\n' for text in organized_texts)

    print(f"Organized journal entries saved to {processed_journal_path}")

# Entry point: run the pipeline only when this code is executed as the main
# program rather than imported.
if __name__ == '__main__':
    main()


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Organized journal entries saved to processed_journal.txt
