# Notebook for text extraction from images
Inga Ulusoy, SSC, July 2022

In [None]:
# Setup that only runs when the notebook is executed on Google Colab
# flake8-noqa-cell
import os

# The branch is taken when the IPython shell's string form contains "google.colab".
if "google.colab" in str(get_ipython()):
    # NOTE(review): this used to say "update python version", but no command in
    # this cell changes the Python version - confirm whether a step is missing.
    # install setuptools pinned to version 61 (-qqq keeps pip quiet)
    !pip install setuptools==61 -qqq
    # install the misinformation package straight from its GitHub repository
    !pip install git+https://github.com/ssciwr/misinformation.git -qqq
    # mount google drive for data and API key
    from google.colab import drive

    # the drive becomes available under the absolute path /content/drive
    drive.mount("/content/drive")

In [None]:
import os
from IPython.display import Image, display
import misinformation

# download the models if they are not there yet
!python -m spacy download en_core_web_md
!python -m textblob.download_corpora

In [None]:
# Collect the image files found under ../data/images-little-text/.
# limit=1000 is passed to find_files - presumably an upper bound on the number
# of files returned; confirm against the misinformation package.
images = misinformation.find_files(path="../data/images-little-text/", limit=1000)

In [None]:
# Render the first three images inline as a quick visual check.
for image_path in images[:3]:
    display(Image(filename=image_path))

In [None]:
# Initialise the analysis dictionary from the first ten images only.
mydict = misinformation.utils.initialize_dict(images[:10])

# Google Cloud Vision API
The first 1000 images per month are free.

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file.
# The path is relative to the notebook's working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "../data/misinformation-campaign-981aa55a3b13.json"
)

## Inspect the elements per image

In [None]:
# Hand the dictionary to explore_analysis, selecting the "text-on-image"
# analysis through the identify argument. As the last expression of the cell,
# whatever the call returns is displayed.
misinformation.explore_analysis(mydict, identify="text-on-image")

## Or directly analyze for further processing

In [None]:
# Run the text detector (with text analysis switched on) for every entry and
# store the result back under the same key. The key is printed as progress output.
for key in mydict:
    print(key)
    detector = misinformation.text.TextDetector(mydict[key], analyse_text=True)
    mydict[key] = detector.analyse_image()

## Convert to a dataframe and write a CSV file

In [None]:
# Two-step conversion using the package helpers: append_data_to_dict reshapes
# the results held in mydict, and dump_df turns that output into `df`
# (a dataframe, judging by the .head()/.to_csv() calls made on it below).
outdict = misinformation.utils.append_data_to_dict(mydict)
df = misinformation.utils.dump_df(outdict)

In [None]:
# Check the dataframe: display its first ten rows
df.head(10)

In [None]:
# Write the dataframe to a CSV file.
# NOTE(review): this path is relative to the current working directory, whereas
# the Colab drive is mounted at the absolute path /content/drive - it only lands
# on the mounted drive when the working directory is /content; confirm.
df.to_csv("drive/MyDrive/misinformation-data/data_out.csv")

## Topic analysis visual output

In [None]:
import spacy
from bertopic import BERTopic

In [None]:
# Gather the keys of all analysed images and, in the same order, the
# "text_english" entry stored for each of them.
list_keys = list(mydict)
list_text_english = [mydict[key]["text_english"] for key in mydict]

In [None]:
# Show which image keys were collected
print(list_keys)

In [None]:
# Combine the English texts of all images into one string. A single space is used
# as the separator so that the last word of one text does not fuse with the first
# word of the next (an empty separator would glue them together).
all_texts = " ".join(list_text_english)
# Print the individual texts for inspection
print(list_text_english)

In [None]:
# Load the spaCy pipeline without the components named below. In this notebook
# the pipeline is only passed to BERTopic as its embedding model.
excluded_components = ["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_md", exclude=excluded_components)

# Fit the topic model on the English texts collected per image.
topic_model = BERTopic(embedding_model=nlp)
topics, probs = topic_model.fit_transform(list_text_english)

In [None]:
# Display the overview returned by get_topic_info() for the fitted model
topic_model.get_topic_info()

In [None]:
# Display what get_topic returns for topic 0
topic_model.get_topic(0)