In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import pytesseract
import cv2
import sys, os

# Make the directory two levels up and the sibling `scripts` directory
# importable; both are resolved against the current working directory.
for _relative_dir in ("../..", "../scripts"):
    sys.path.append(os.path.abspath(_relative_dir))

In [2]:
# Import the project's text-extraction helper under an alias, then bind the
# notebook-wide name `extract_text` to the object it returns when called.
# Later cells call `extract_text.text_to_df(...)` on that object.
from text_extraction import extract_text as _extract_text_factory

extract_text = _extract_text_factory()

In [90]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`, failing fast if unset.

    os.getenv returns None for an unset variable. The later cells build file
    paths with `value + 'file.csv'`, so a missing variable would otherwise
    surface as an unrelated-looking TypeError. For `output_home` that would
    only happen at the final to_csv, after all the extraction work is done.

    Raises:
        RuntimeError: if the variable is not set.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable '{name}' is not set; define it in .env "
            "or in the shell before running this notebook."
        )
    return value


# These values are used as string prefixes (joined to file names with `+`),
# so each one should end with a path separator.
path_to_dataset = _require_env('path_to_provided_data')
path_to_extracted = _require_env('path_to_extracted_data')
output_path = _require_env('output_home')

In [4]:
# Read the per-game performance table from the provided-data location and
# print its schema (columns, non-null counts, dtypes).
performance_csv = path_to_dataset + 'performance_data.csv'
data = pd.read_csv(performance_csv)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       905 non-null    object 
 1   preview_link  905 non-null    object 
 2   ER            905 non-null    float64
 3   CTR           905 non-null    float64
dtypes: float64(2), object(2)
memory usage: 28.4+ KB


In [5]:
# Build a table with, for every game_id, the paths of its start-frame and
# end-frame images under `path_to_extracted`, and save that table as a CSV.

def _frame_path_builder(suffix):
    """Return a function that maps a game_id to `path_to_extracted + game_id + suffix`."""
    return lambda game_id: path_to_extracted + game_id + suffix


df = pd.DataFrame({
    'game_id': data['game_id'],
    'start_frame': data['game_id'].map(_frame_path_builder('/start_frame.png')),
    'end_frame': data['game_id'].map(_frame_path_builder('/end_frame.png')),
})

df.to_csv('../data/path_to_creative_assets.csv', index=False)

## Use pytesseract and OpenCV to extract text from the start frames

In [6]:
# Game identifiers and start-frame image paths handed to the text extractor.
game_ids = df['game_id']
pics = df['start_frame']

In [7]:
# Start from the game ids, then attach whatever the project's extractor
# returns for the start-frame image paths.
# NOTE(review): `writeout=True` / `file_name` presumably make the helper persist
# its results somewhere — confirm against text_extraction's text_to_df.
output_df = pd.DataFrame({'game_id': game_ids})

start_frame_text = extract_text.text_to_df(pics, writeout=True, file_name='start_frame.png')
output_df['text_start_frame'] = start_frame_text

In [9]:
# Print a summary of output_df (index range, columns, non-null counts, dtypes)
# to check that every game_id received a text_start_frame value.
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   game_id           905 non-null    object
 1   text_start_frame  905 non-null    object
dtypes: object(2)
memory usage: 14.3+ KB


## Use pytesseract and OpenCV to extract text from the end frames

In [10]:
# Game identifiers and end-frame image paths handed to the text extractor.
game_ids = df['game_id']
pics = df['end_frame']

In [11]:
# Start from the game ids, then attach whatever the project's extractor
# returns for the end-frame image paths.
# NOTE(review): `writeout=True` / `file_name` presumably make the helper persist
# its results somewhere — confirm against text_extraction's text_to_df.
end_df = pd.DataFrame({'game_id': game_ids})

end_frame_text = extract_text.text_to_df(pics, writeout=True, file_name='end_frame.png')
end_df['text_end_frame'] = end_frame_text

In [12]:
# Print a summary of end_df (index range, columns, non-null counts, dtypes)
# to check that every game_id received a text_end_frame value.
end_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   game_id         905 non-null    object
 1   text_end_frame  905 non-null    object
dtypes: object(2)
memory usage: 14.3+ KB


# Extracting Engagement text

In [14]:
# Load the table that pairs each game_id with an engagement_instruction_path,
# then print its schema (columns, non-null counts, dtypes).
engagement_csv = '../data/engagement_instructions.csv'
data_eng = pd.read_csv(engagement_csv)
data_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 2 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   game_id                      638 non-null    object
 1   engagement_instruction_path  638 non-null    object
dtypes: object(2)
memory usage: 10.1+ KB


In [15]:
# Some game_ids have more than one engagement-instruction file, so game_id
# values repeat in data_eng; count how many distinct game_ids there are.
data_eng['game_id'].nunique()

587

In [40]:
# Split the engagement rows by how often their game_id has already appeared:
#   data_eng1 -> first row of every game_id
#   data_eng2 -> second row of a game_id (where one exists)
#   data_eng3 -> third and any later rows
#
# Everything is derived from `duplicated()`, which works in row order. The
# earlier set-difference version iterated over a Python set, whose order is not
# guaranteed, so the row treated as a game's "second" file could actually be
# its third. It also assumed the index labels were exactly 0..len-1.
is_repeat = data_eng.duplicated(subset='game_id')        # True for 2nd+ rows of a game_id
repeats = data_eng[is_repeat]                             # keeps the original row order
is_later_repeat = repeats.duplicated(subset='game_id')   # True for 3rd+ rows of a game_id

indx1 = data_eng[~is_repeat].index.to_list()
indx = repeats.index.to_list()
indx2 = repeats[~is_later_repeat].index.to_list()
indx3 = repeats[is_later_repeat].index.to_list()


data_eng1 = data_eng.loc[indx1]

data_eng2 = data_eng.loc[indx2]

data_eng3 = data_eng.loc[indx3]

In [80]:
# Empty results table: a game_id column plus three engagement-instruction
# text columns, populated by the cells that follow.
eng_columns = [
    'game_id',
    'text_eng_instruction_1',
    'text_eng_instruction_2',
    'text_eng_instruction_3',
]
eng_df = pd.DataFrame(columns=eng_columns)

In [83]:
# Rows from data_eng1 (one per game_id): they define eng_df's game_id column,
# and the extractor's result for their paths fills text_eng_instruction_1.
game_ids = data_eng1['game_id'].values
pics = data_eng1['engagement_instruction_path'].values

eng_df['game_id'] = game_ids
eng_df['text_eng_instruction_1'] = extract_text.text_to_df(pics, writeout=False)

In [85]:
# Rows from data_eng2: run the extractor on their paths and store each result
# on the eng_df row(s) with the matching game_id.
game_ids = data_eng2['game_id'].values
pics = data_eng2['engagement_instruction_path'].values

output = extract_text.text_to_df(pics, writeout=False)

for position, current_id in enumerate(game_ids):
    row_mask = eng_df['game_id'] == current_id
    eng_df.loc[row_mask, 'text_eng_instruction_2'] = output.iloc[position]

In [87]:
# Rows from data_eng3: run the extractor on their paths and store each result
# on the eng_df row(s) with the matching game_id. If data_eng3 holds several
# rows for one game_id, each assignment overwrites the previous one.
game_ids = data_eng3['game_id'].values
pics = data_eng3['engagement_instruction_path'].values

output = extract_text.text_to_df(pics, writeout=False)

for position, current_id in enumerate(game_ids):
    row_mask = eng_df['game_id'] == current_id
    eng_df.loc[row_mask, 'text_eng_instruction_3'] = output.iloc[position]

In [89]:
# Print a summary of eng_df; the non-null counts show how many games have a
# first, second and third engagement-instruction text.
eng_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   game_id                 587 non-null    object
 1   text_eng_instruction_1  587 non-null    object
 2   text_eng_instruction_2  47 non-null     object
 3   text_eng_instruction_3  4 non-null      object
dtypes: object(4)
memory usage: 18.5+ KB


In [92]:
# Save the engagement-instruction text table under the configured output location.
engagement_text_csv = output_path + 'text_engagement_instructions.csv'
eng_df.to_csv(engagement_text_csv, index=False)