# DATA522 Final Exploration

In [None]:
# Install libraries for reading in Parquet and OCR
%pip install pyarrow
%pip install ocrmac
# %pip install pytesseract
%pip install easyocr

# Pre-trained sentiment analysis model
%pip install transformers

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import easyocr
from ocrmac import ocrmac
import cv2
import PIL
from transformers import pipeline

In [38]:
# Get all data Parquets
def read_dataset(dirname):
    training_filenames = [
        filename for filename in os.listdir(dirname) if filename.endswith('.parquet') and filename.beginswith('train') 
    ]
    return [
        pd.read_parquet(f'{dirname}/{filename}').drop(columns = ['caption']) for filename in training_filenames
    ]

In [5]:
state_easyocr = {
    "reader": easyocr.Reader(['en'])
}

state_sentiment = {
    "pipeline": pipeline("sentiment-analysis")
}

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [39]:
def bytes2cv2(image):
    img_bytes = image['bytes']
    return cv2.imdecode(
        np.frombuffer(
            img_bytes, 
            np.uint8
        ),
        cv2.IMREAD_COLOR_RGB
    )
# https://stackoverflow.com/questions/17170752/python-opencv-load-image-from-byte-string
def imshow(image):
    decoded_image = bytes2cv2(image)
    plt.imshow(decoded_image)

def analyze_sentiment(text, state_sentiment):
    sentiment = state_sentiment["pipeline"](text)[0]
    if sentiment["label"] == "NEGATIVE":
        return -1 * sentiment["score"]
    return sentiment["score"]

def easyocr_filtertext(read_text):
    return ' '.join(read_text).upper()
    
def ocr_easyocr(image, state):
    decoded_image = bytes2cv2(image)
    return easyocr_filtertext(state['reader'].readtext(decoded_image, detail = 0))

def ocr_ocrmac(image):
    return ' '.join(
        [
            ocr_tuple[0] 
            for ocr_tuple 
            in ocrmac.text_from_image(
                PIL.Image.fromarray(
                    bytes2cv2(image)
                )
            )
        ]
    )

def text_sentiment_row(row):
    text_ocrmac = ocr_ocrmac(row.image)
    sentiment_ocrmac = analyze_sentiment(text_ocrmac, state_sentiment)
    return pd.Series(
        {
            "text_ocrmac": text_ocrmac,
            "sentiment_ocrmac": sentiment_ocrmac
        }
    )

# Create pared-down dataset
def text_sentiment(df):
    df_ocr = pd.concat([df, df.apply(
        text_sentiment_row,
        axis = 1,
        result_type = 'expand'
    )], axis = 1)
    return df_ocr

def create_pared_dataset(dirname):
    training_parquets = read_dataset(dirname)
    
    for i, training_parquet in enumerate(training_parquets):
        text_sentiment(training_parquet).drop(columns = 'image').to_parquet(f'{dirname}/parquet_{i}-sentiment.parquet')

def read_pared_dataset(dirname):
    sentiment_filenames = [
        filename for filename in os.listdir(dirname) if filename.endswith('.parquet') and not filename.beginswith('train')
    ]
    print(sentiment_filenames)
    sentiment_parquets = [
        pd.read_parquet(f'{dirname}/{filename}') for filename in sentiment_filenames
    ]
    return pd.concat(sentiment_parquets)
    

In [40]:
# create_pared_dataset('./data')
sentiment_dataset = read_pared_dataset('./data')

AttributeError: 'str' object has no attribute 'beginswith'

In [None]:
"".endswith