# Notebook for text extraction on image
Inga Ulusoy, SSC, July 2022

In [None]:
# Setup that is only needed on Google Colab; the body of the `if` below is
# skipped everywhere else.
# flake8-noqa-cell
import os

# get_ipython() is injected by IPython; the check looks for "google.colab"
# in its string representation to detect a Colab runtime.
if "google.colab" in str(get_ipython()):
    # install a pinned setuptools version
    # NOTE(review): the reason for pinning setuptools==61 is not recorded here
    !pip install setuptools==61 -qqq
    # install the misinformation package straight from GitHub
    !pip install git+https://github.com/ssciwr/misinformation.git -qqq
    # mount google drive for data and API key
    from google.colab import drive

    drive.mount("/content/drive")

In [None]:
import os
from IPython.display import Image, display
import misinformation
import tensorflow as tf

# Show which GPU devices TensorFlow reports as available.
print(tf.config.list_physical_devices("GPU"))

In [None]:
# download the models if they are not there yet:
# - the spaCy model "en_core_web_md" (loaded further below with spacy.load)
# - the TextBlob corpora
!python -m spacy download en_core_web_md
!python -m textblob.download_corpora

In [None]:
# Collect the image files to work on from "../data/all/".
# NOTE(review): `limit=1000` presumably caps the number of files returned — confirm.
images = misinformation.find_files(path="../data/all/", limit=1000)

In [None]:
# Preview the first three images inline in the notebook.
for image_path in images[:3]:
    preview = Image(filename=image_path)
    display(preview)

In [None]:
# Set up the analysis dictionary for the first three images only;
# widen the slice to process more of `images`.
mydict = misinformation.utils.initialize_dict(images[0:3])

# Google Cloud Vision API
The first 1000 images per month are free.

In [None]:
# Tell the Google Cloud client where to find the service-account key file.
credentials_file = "../data/misinformation-campaign-981aa55a3b13.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file

## Inspect the elements per image

In [None]:
# Explore the "text-on-image" analysis for the images held in `mydict`.
misinformation.explore_analysis(mydict, identify="text-on-image")

## Or directly analyze for further processing

In [None]:
# Run the text detector (including text analysis) on every image and store
# the returned result back under the same key.
for image_key, image_entry in mydict.items():
    print(image_key)
    detector = misinformation.text.TextDetector(image_entry, analyse_text=True)
    mydict[image_key] = detector.analyse_image()

## Convert to dataframe and write csv

In [None]:
# Convert the per-image results in `mydict` into a dataframe in two steps:
# first restructure the dictionary, then dump it into `df`.
# NOTE(review): `df` is presumably a pandas DataFrame (head/to_csv are used below) — confirm.
outdict = misinformation.utils.append_data_to_dict(mydict)
df = misinformation.utils.dump_df(outdict)

In [None]:
# check the dataframe: show its first 10 rows
df.head(10)

In [None]:
# Write the dataframe to "data_out.csv" in the current directory.
# Option 2 of the topic analysis below reads this file back in.
df.to_csv("./data_out.csv")

# Topic analysis
The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html).

BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling.
### Option 1: Use the dictionary as obtained from the above analysis.

In [None]:
# Run the topic analysis on the analysed images held in `mydict`.
# NOTE(review): the post-processor presumably collects the text_english
# entries of each image internally — confirm against misinformation.text.
dict_postprocessor = misinformation.text.PostprocessText(mydict=mydict)
topic_df, most_frequent_topics = dict_postprocessor.analyse_topic()

### Option 2: Read in a csv
To avoid analysing too many images with Google Cloud Vision, use the csv output to obtain the text when rerunning already analysed images.

In [None]:
# Run the topic analysis on the text stored in the csv written earlier,
# asking for 10 topics to be returned.
input_file_path = "data_out.csv"
csv_postprocessor = misinformation.text.PostprocessText(
    use_csv=True,
    csv_path=input_file_path,
)
topic_df, most_frequent_topics = csv_postprocessor.analyse_topic(return_topics=10)

In [None]:
# Debug output: the topic table first, then one line per frequent topic.
print(topic_df)
for frequent_topic in most_frequent_topics:
    print("Topic:", frequent_topic)

## Compute the topics
Now load the spaCy pipeline and perform BERTopic topic modeling, using the spaCy pipeline as the embedding model.

In [None]:
# spaCy and BERTopic are used in this cell but were not imported before it,
# so import them here to avoid a NameError.
import spacy
from bertopic import BERTopic

# Collect the English text of every analysed image as input for BERTopic.
# Entries that have no "text_english" field are skipped.
# NOTE(review): the key name "text_english" is taken from the Option 1
# description above — confirm it matches the TextDetector output.
list_text_english = [
    entry["text_english"] for entry in mydict.values() if "text_english" in entry
]

# Load the spaCy model without the listed pipeline components; the pipeline
# is only handed to BERTopic as its embedding model.
nlp = spacy.load(
    "en_core_web_md",
    exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
)

# Use `BERTopic()` without arguments instead to fall back on BERTopic's
# default embedding model.
topic_model = BERTopic(embedding_model=nlp)
topics, probs = topic_model.fit_transform(list_text_english)

### Access frequent topics
A topic of `-1` stands for an outlier and should be ignored. Topic count is the number of occurrences of that topic. The output is structured from most frequent to least frequent topic.

In [None]:
# Show the overview of all topics found by the fitted model.
topic_model.get_topic_info()

### Get information for specific topic
The most frequent topic can be accessed using the index "0".

In [None]:
# Show the details of the topic with index 0.
topic_model.get_topic(0)

In [None]:
# Show the details of the topic with index 1.
topic_model.get_topic(1)

### Extract document level info
Further information about the analysed texts can be extracted as a dataframe (and then exported to csv if one wishes to).

In [None]:
# Show the document-level information for the texts the model was fitted on.
topic_model.get_document_info(list_text_english)

### Topic visualization
The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality).

In [None]:
# Render the topic visualization of the fitted model.
topic_model.visualize_topics()

### Visualize documents
You can also visualize the documents for debugging.

In [None]:
from IPython.display import display
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# NOTE: `list_text_english` (the list of texts to model) must already exist
# from an earlier cell.

# Prepare embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(list_text_english, show_progress_bar=False)

# Train BERTopic on the precomputed embeddings.
# This rebinds `topic_model`, replacing any model fitted in an earlier cell.
topic_model = BERTopic().fit(list_text_english, embeddings)

# Run the visualization with the original embeddings.
# A notebook cell only renders its last expression automatically, so this
# figure has to be displayed explicitly or it is computed and thrown away.
display(topic_model.visualize_documents(list_text_english, embeddings=embeddings))

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
).fit_transform(embeddings)
# Last expression of the cell: rendered automatically as the cell output.
topic_model.visualize_documents(
    list_text_english, reduced_embeddings=reduced_embeddings
)

In [None]:
# Render the bar chart visualization of the fitted model's topics.
topic_model.visualize_barchart()

### Save the model
The model can be saved for future use.

In [None]:
# Save the fitted topic model under the name "misinfo_posts"
# (a relative path, so it lands in the current working directory).
topic_model.save("misinfo_posts")