# Imports

In [1]:
import io
import json
import os
import pickle
import re
import string
from collections import defaultdict
from datetime import datetime
from pprint import pprint

import cv2
import emoji
import numpy as np
import pandas as pd
import pymongo
import pytesseract
from dotenv import load_dotenv
from google.cloud import vision
from google.oauth2 import service_account
from pytesseract import Output

from helpers import (account_info, connect_to_db, extract_from_json,
                     extract_hashtags, extract_text, extract_text_from_image)

# Connection to DB

In [2]:
project_folder = os.path.expanduser('../../')
load_dotenv(os.path.join(project_folder, '.env'))


# Google Vision API
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
credentials = service_account.Credentials.from_service_account_file(project_folder + GOOGLE_APPLICATION_CREDENTIALS)
client_options = {'api_endpoint': 'eu-vision.googleapis.com'}
google_client = vision.ImageAnnotatorClient(client_options=client_options, credentials=credentials)

# MongoDB
client = connect_to_db()
db = client.accounts # connect to my database
posts = db['posts'] # collection of posts
authors = db['authors'] # collection of authors

### Load to pandas

In [3]:
cursor = posts.find()
entries = list(cursor)

df = pd.DataFrame(entries)
df.head()

Unnamed: 0,_id,account,post_id,likes,comments,date,content,hashtags,number_hashtags,img_text,number_emojis,mentions,emoji_terms,pre_cleaned_text,url_email
0,5f34706c8091264adc7b3c0f,mindfulmft,ierq6xRnBL,55,1,2013-12-28 15:53:14,#therapy #positivequotes #quotes #marriage #re...,guidance challenge mentalhealth quotes positiv...,14,true humility is staying teachabl,0.0,,,,
1,5f34706d8091264adc7b3c11,mindfulmft,ietVO_RnD_,33,0,2013-12-28 16:07:45,#motivation #love #power #encourage #journey #...,journey power forward relationship encourage a...,14,YOU ARE FAR Coo. pnart- TO BE THE ONLY THING S...,0.0,,,,
2,5f34706d8091264adc7b3c12,mindfulmft,ietrkpRnEl,65,1,2013-12-28 16:10:48,#storms #accomplishments #roots #strength #liv...,storms wisdom forward relationship encourageme...,15,Storms make trees take deeper root ss - Dolly ...,0.0,,,,
3,5f34706d8091264adc7b3c13,mindfulmft,ieum0hxnF-,39,1,2013-12-28 16:18:53,#words #self #life #MINDFULMFT #mindfulness #c...,wisdom control encouragement lessons motivatio...,13,- Let anyone determine your self-worth. = Spea...,0.0,,,,
4,5f34706d8091264adc7b3c14,mindfulmft,jH1aVfxnDG,61,1,2014-01-13 15:27:13,#truth #wisdom #wise #think #act #motivation #...,think wisdom wise family counseling encourage ...,16,Most of the problems in life are because of tw...,0.0,,,,


```python
for folder in accounts: # folder == username
    query = {'account': account}
    for row in authors.find(query):
        print(pprint(row))
```

# GOOGLE VISION API

# Fix entries with missing text

In [4]:
df.img_text.isna().sum()

27

In [5]:
missing_img_text = df[df.img_text.isna()]
missing_img_text

Unnamed: 0,_id,account,post_id,likes,comments,date,content,hashtags,number_hashtags,img_text,number_emojis,mentions,emoji_terms,pre_cleaned_text,url_email
152,5f3470858091264adc7b3ca8,mindfulmft,mAipZuxnGT,210,7,2014-03-26 10:02:45,Time\n\nSs\nnothing\nunless\n\nyou move\n\nwit\n,,0,,0.0,,,Time Ss nothing unless you move wit,
237,5f3470928091264adc7b3cfd,mindfulmft,oqa0JUxnBH,349,15,2014-05-31 09:25:12,YOUR LARGEST\nFEAR CARRIES\n\nYOUR GREATEST\nG...,,0,,0.0,,,YOUR LARGEST FEAR CARRIES YOUR GREATEST GROWTH.,
243,5f3470938091264adc7b3d03,mindfulmft,o5-DqDxnPm,443,42,2014-06-06 10:21:46,The best kind of people are\nthe ones that com...,,0,,0.0,,,The best kind of people are the ones that come...,
323,5f34709e8091264adc7b3d53,mindfulmft,sNKuUXRnG4,644,27,2014-08-27 10:52:12,“And suddenly\nyou know... It’s\ntime to start...,,0,,0.0,,,And suddenly you know It is time to start some...,
1755,5f34717d8091264adc7b42ec,minaa_b,BNFLBMgghX4,161,10,2016-11-21 12:36:51,\n,,0,,0.0,,,,
1785,5f3471828091264adc7b430a,minaa_b,BNmqLA-gBv3,167,7,2016-12-04 12:44:01,"darling,\n\nyou were created\nmagnificently.\n...",,0,,0.0,,,"darling, you were created magnificently. flee ...",
1834,5f3471898091264adc7b433b,minaa_b,BOfzKoxATmT,297,8,2016-12-26 17:19:19,do what's hard.\n\ndo what's unexpected.\n\ndo...,,0,,0.0,,,do what is hard. do what is unexpected. do wha...,
2012,5f3471a68091264adc7b43ed,minaa_b,BR0sByPDtyX,200,2,2017-03-19 10:35:10,,,0,,0.0,,,,
2459,5f3471ec8091264adc7b45ad,nedratawwab,BaRpSzgg_6E,18,1,2017-10-15 12:40:25,\n,,0,,0.0,,,,
3045,5f3472478091264adc7b47f7,minaa_b,BgjBXM6APou,547,16,2018-03-20 09:46:11,GIVE YovRselh peRmission To FEEL.\nort pont iv...,,0,,0.0,,,GIVE YovRselh peRmission To FEEL. ort pont ive...,


In [None]:
entries_to_update

In [62]:
entries_to_delete = missing_img_text[['_id']].to_dict(orient='records')

In [64]:
ids = [obj_id['_id'] for obj_id in entries_to_delete]
ids

[ObjectId('5f3470858091264adc7b3ca8'),
 ObjectId('5f3470928091264adc7b3cfd'),
 ObjectId('5f3470938091264adc7b3d03'),
 ObjectId('5f34709e8091264adc7b3d53'),
 ObjectId('5f34717d8091264adc7b42ec'),
 ObjectId('5f3471828091264adc7b430a'),
 ObjectId('5f3471898091264adc7b433b'),
 ObjectId('5f3471a68091264adc7b43ed'),
 ObjectId('5f3471ec8091264adc7b45ad'),
 ObjectId('5f3472478091264adc7b47f7'),
 ObjectId('5f3472568091264adc7b484a'),
 ObjectId('5f34726f8091264adc7b48f3'),
 ObjectId('5f3472708091264adc7b48fb'),
 ObjectId('5f3473148091264adc7b4cef'),
 ObjectId('5f3473a68091264adc7b5081'),
 ObjectId('5f3473ac8091264adc7b50a5'),
 ObjectId('5f3473ae8091264adc7b50b5'),
 ObjectId('5f3473b08091264adc7b50c1'),
 ObjectId('5f3473b18091264adc7b50c9'),
 ObjectId('5f3473b38091264adc7b50d4'),
 ObjectId('5f3473b68091264adc7b50e2'),
 ObjectId('5f3473ba8091264adc7b50ee'),
 ObjectId('5f3473f98091264adc7b526b'),
 ObjectId('5f34743d8091264adc7b5410'),
 ObjectId('5f347c6d8091264adc7b5d82'),
 ObjectId('5f347ca5809126

In [67]:
type(ids[0])

bson.objectid.ObjectId

In [68]:
result = posts.delete_many({'_id': {'$in': ids}})

In [69]:
result

<pymongo.results.DeleteResult at 0x14a10b800>

In [70]:
cursor = posts.find()
entries = list(cursor)

df = pd.DataFrame(entries)
df.head()

Unnamed: 0,_id,account,post_id,likes,comments,date,content,hashtags,number_hashtags,img_text,number_emojis,mentions,emoji_terms,pre_cleaned_text,url_email
0,5f34706c8091264adc7b3c0f,mindfulmft,ierq6xRnBL,55,1,2013-12-28 15:53:14,#therapy #positivequotes #quotes #marriage #re...,guidance challenge mentalhealth quotes positiv...,14,true humility is staying teachabl,0.0,,,,
1,5f34706d8091264adc7b3c11,mindfulmft,ietVO_RnD_,33,0,2013-12-28 16:07:45,#motivation #love #power #encourage #journey #...,journey power forward relationship encourage a...,14,YOU ARE FAR Coo. pnart- TO BE THE ONLY THING S...,0.0,,,,
2,5f34706d8091264adc7b3c12,mindfulmft,ietrkpRnEl,65,1,2013-12-28 16:10:48,#storms #accomplishments #roots #strength #liv...,storms wisdom forward relationship encourageme...,15,Storms make trees take deeper root ss - Dolly ...,0.0,,,,
3,5f34706d8091264adc7b3c13,mindfulmft,ieum0hxnF-,39,1,2013-12-28 16:18:53,#words #self #life #MINDFULMFT #mindfulness #c...,wisdom control encouragement lessons motivatio...,13,- Let anyone determine your self-worth. = Spea...,0.0,,,,
4,5f34706d8091264adc7b3c14,mindfulmft,jH1aVfxnDG,61,1,2014-01-13 15:27:13,#truth #wisdom #wise #think #act #motivation #...,think wisdom wise family counseling encourage ...,16,Most of the problems in life are because of tw...,0.0,,,,


In [71]:
df[df.img_text.isna()]

Unnamed: 0,_id,account,post_id,likes,comments,date,content,hashtags,number_hashtags,img_text,number_emojis,mentions,emoji_terms,pre_cleaned_text,url_email
