In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import yaml
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import datetime
from datetime import datetime
from datetime import date

import nltk
from nltk import word_tokenize

# Google Cloud Language Translation API
# We're using the basic version here == "v2" 
from google.cloud import translate_v2

import timeit

In [2]:
# Setup walkthrough that was very helpful: https://www.youtube.com/watch?v=YapTts_An9A
# @RJ make sure that the json file is in the /code folder
# The path is relative, so it is resolved against the notebook's working directory.
# NOTE(review): presumably this is the key file the Google Cloud client reads for
# authentication -- confirm it is excluded from version control.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'kv-translate-2023396507e3.json'

# 1. Create function for performing the translations

In [3]:
def google_translate_messages(one_row, translate_client=None, target='en'):
    '''
    Translate the message held in one dataframe row with the
    Google Cloud Translation API (basic edition, "v2").

    Parameters
    ----------
    one_row : pandas.Series
        A df row. The message is read from its 'content' entry.
    translate_client : object, optional
        Anything exposing ``translate(text, format_=..., target_language=...)``.
        When omitted, a ``translate_v2.Client`` is created on the first call
        and reused afterwards instead of being rebuilt for every row.
    target : str, default 'en'
        Language code to translate into.

    Returns
    -------
    list
        ``[translatedText, detectedSourceLanguage, input]`` in that fixed
        order. If the API call fails, or the response lacks one of those
        keys, all three entries are the string 'translation_error'.
    '''
    one_message = one_row.loc['content']

    if translate_client is None:
        # Client creation stays outside the try block on purpose: a credential
        # or configuration problem should fail loudly, not fill the dataframe
        # with 'translation_error' rows.
        translate_client = getattr(google_translate_messages, '_shared_client', None)
        if translate_client is None:
            translate_client = translate_v2.Client()
            google_translate_messages._shared_client = translate_client

    try:
        # apply the translation
        output = translate_client.translate(one_message,
                                            format_='html',
                                            target_language=target)
        # Pick the fields by name so the column order downstream never
        # depends on the key order of the response dict.
        return [output['translatedText'],
                output['detectedSourceLanguage'],
                output['input']]
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
        # still stop a long-running batch.
        return ['translation_error'] * 3

# 2. Prepare dataset for translation

In [3]:
# All parent-school messages, output from Script 06_no_ra_status_osse_merge
# NOTE: unpickling can execute arbitrary code, so only load pickles produced by our own pipeline.
all_msgs = pd.read_pickle('../data/analysis_data/messages_w_demographics_osse6_schools_pickle.pkl')
# Shown as the cell output: (n_rows, n_columns)
all_msgs.shape

(375417, 67)

## 2.1. Create id for deduplicated content (without removing personalization) 

In [4]:
# Build one id per distinct message text (personalization is left in place,
# so two messages only share an id when their content is identical)
new_id_creation = all_msgs.loc[:, ['content']].drop_duplicates()
new_id_creation.shape


# ids run id_1 ... id_N in order of first appearance
new_id_creation['id_content_deduped_no_personalization'] = [
    "id_{}".format(position)
    for position in range(1, new_id_creation.shape[0] + 1)
]

# Attach the new id to every message row via its content
all_msgs_new_id = all_msgs.merge(new_id_creation,
                                 on = "content",
                                 how = "left")

(95300, 1)

## 2.2. Cut down the number of characters we'll run through the translation function

We care about this because Google Cloud charges per character, and at the same rate for language detection as for translation. Since a translation call also returns the detected source language, we only want to send each message through the API once.

In [5]:
# Number of characters in each text message -- Google Cloud bills by character count
all_msgs_new_id['content_len'] = all_msgs_new_id['content'].str.len()

In [6]:
# One row per distinct message: id, text and character count
deduped_msgs = all_msgs_new_id.loc[:, ['id_content_deduped_no_personalization',
                                       'content',
                                       'content_len']].drop_duplicates()

deduped_msgs.shape

# Total characters across the distinct messages.
# 16.5million characters might be too much, so we'll cut it down
deduped_msgs['content_len'].sum()

(95300, 3)

16549892.0

In [7]:
# Common phrases to look for in our text messages, used to cut down on the messages we translate.
# A message containing any of these is treated as English and skipped by the translator.
# The phrases are joined with '|' and passed to str.contains, so matching is
# case-sensitive (hence the several capitalisation variants) and by substring
# (e.g. 'fail' also matches 'failing').
# NOTE(review): 'parent/teacher' is listed twice; the duplicate is harmless.

common_phrases = ['no problem', 'No problem',
                  'Hello', 'thank you', 'Thank you', 'Thank You', 'Thanks', 'thanks',
                  "You're welcome", "You are very welcome",
                  'English teacher', 
                  'attendance', 
                  'was absent from',
                  'was not in class',
                  'was late to',
                  'I will let you know',
                  'Please reply to this message',
                  'Good Afternoon', 'Good afternoon', 'good afternoon', 
                  'Good Morning', 'Good morning', 'good morning',
                  'Please provide', 'Please respond', 'please check out', 'please contact me',
                  'assignment',
                  'detention', 'suspended',
                  'Good Evening', 'Good evening',
                  'Please make sure', 'Please be sure', 'Please send',
                  'Please Join',
                  "That's great",
                  'hall sweep', 'The message was',
                  'Parent teacher', 'parent teacher', 'parent-teacher',
                  'parent/teacher', 'parent/teacher',
                  'conferences', 'Conferences', "That's awesome",
                  'failing', 'fail',
                  'I am sending this message to inform ',
                  'This message is from',
                  'I want to say thank you for your support this first week of distance-learning',
                  'did not turn in', 
                  'First, I would like to say we made it','Early Release Day','Uber','Idgaf', 'presentation',
                  'Let us have a great week',
                  'Wishing you', 
                  'Dear Parent']

In [8]:
# Join the common phrases into a single regex alternation.
# re.escape keeps every phrase literal, so a phrase containing regex
# metacharacters (e.g. '?', '(' or '.') cannot silently change the pattern.
common_phrase_pattern = '|'.join(re.escape(phrase) for phrase in common_phrases)

# If phrase in content, give the message a 0, else, leave blank.
# Because the alternative value is '', np.where stores the 0 as the string '0';
# later cells therefore compare non_english against '0'.
deduped_msgs['non_english'] = np.where(deduped_msgs.content.str.contains(common_phrase_pattern),
                                       0, 
                                       '')

deduped_msgs.shape

deduped_msgs.non_english.value_counts()

(95300, 4)

0    55677
     39623
Name: non_english, dtype: int64

In [9]:
# One parent writes mostly in Spanish but includes "Thanks" in the text, so the
# phrase filter flags those messages as English (a false positive).
# Bring StudentID / broad_type onto each deduplicated message so that this
# parent's messages can be un-flagged by id.
sender_lookup = all_msgs_new_id[['id_content_deduped_no_personalization',
                                 'StudentID',
                                 'broad_type']]

deduped_msgs_parent_check = deduped_msgs.merge(sender_lookup,
                                               how = 'left',
                                               on = 'id_content_deduped_no_personalization')

# Messages sent by that parent go back to "not known to be English" ('')
sent_by_that_parent = (deduped_msgs_parent_check['StudentID'].eq(9209061)
                       & deduped_msgs_parent_check['broad_type'].eq('parent_sent'))

deduped_msgs_parent_check['non_english'] = np.where(sent_by_that_parent,
                                                    '',
                                                    deduped_msgs_parent_check['non_english'])

deduped_msgs_parent_check.shape

(375417, 6)

In [10]:
# The merge expanded the frame back to one row per message,
# so keep the four message-level columns and deduplicate again
deduped_msgs_rm_enes = deduped_msgs_parent_check.loc[:, ['id_content_deduped_no_personalization',
                                                         'content',
                                                         'content_len',
                                                         'non_english']].drop_duplicates()

deduped_msgs_rm_enes.shape

deduped_msgs_rm_enes['non_english'].value_counts()

# Characters in the messages that are not flagged as English ('0')
still_to_translate = deduped_msgs_rm_enes['non_english'].ne('0')
print('N chars to run through translator:',
      deduped_msgs_rm_enes.loc[still_to_translate, 'content_len'].sum())

(95300, 4)

0    55669
     39631
Name: non_english, dtype: int64

N chars to run through translator: 4652525.0


# 3. Run the messages that did not have common phrases above through the translation function

In [15]:
# Keep only the messages not flagged as English (English == '0')
df_to_translate = deduped_msgs_rm_enes.loc[deduped_msgs_rm_enes['non_english'] != '0'].copy()
df_to_translate.shape

# Break the work into 20 chunks so we can pick up again if the code breaks or the internet drops
split_df = np.array_split(df_to_translate, 20)

# Report (rows, columns) for each chunk
for i, chunk in enumerate(split_df):
    print('df', i+1, ':', chunk.shape)

(39631, 4)

df 1 : (1982, 4)
df 2 : (1982, 4)
df 3 : (1982, 4)
df 4 : (1982, 4)
df 5 : (1982, 4)
df 6 : (1982, 4)
df 7 : (1982, 4)
df 8 : (1982, 4)
df 9 : (1982, 4)
df 10 : (1982, 4)
df 11 : (1982, 4)
df 12 : (1981, 4)
df 13 : (1981, 4)
df 14 : (1981, 4)
df 15 : (1981, 4)
df 16 : (1981, 4)
df 17 : (1981, 4)
df 18 : (1981, 4)
df 19 : (1981, 4)
df 20 : (1981, 4)


In [None]:
# Set up to run translation

# output path
path = '../data/gcloud_translation_results/'
filename = 'translated_msgs_'
ext = '.pkl'

# make sure the output folder exists before spending API calls on a chunk
os.makedirs(path, exist_ok = True)


# For every dataframe in split_df, 
# run through the translation, unpack the results, 
# and save as pickles.
# Chunk i is written to <path>translated_msgs_<i>.pkl (0-based), while the
# progress messages count from 1.

for i in range(len(split_df)):
    out_file = path + filename + str(i) + ext

    # Resume support: the data was split precisely so a crashed run can be
    # revisited. A chunk whose pickle already exists is skipped so it is not
    # translated (and billed) a second time. Delete the file to force a redo.
    if os.path.exists(out_file):
        print("skipping df ", i + 1, " - results already saved")
        continue

    # Work on a copy so the new columns are added to an independent frame
    # rather than to a slice of df_to_translate.
    one_df = split_df[i].copy()
    
    start_translation_time = timeit.default_timer() #time start
    
    # run translation
    one_df['output_list'] = one_df.apply(google_translate_messages, axis = 1)

    stop_translation_time = timeit.default_timer() #time end
    
    time_lapse = stop_translation_time - start_translation_time
    print("took " + str(time_lapse) + " seconds to run")
    
    # unpack the translation results into their own columns
    one_df[['translatedText', 'detectedSourceLanguage', 'orig_content']] = \
        pd.DataFrame(one_df.output_list.to_list(),  
                     index = one_df.index)

    one_df.to_pickle(out_file)
    
    print("wrote results for df ", i + 1)

took 540.9612335229999 seconds to run
wrote results for df  1
took 630.955311149 seconds to run
wrote results for df  2
took 601.288407005 seconds to run
wrote results for df  3
took 665.6133545510002 seconds to run
wrote results for df  4
took 619.1940848019999 seconds to run
wrote results for df  5
took 549.7868174780001 seconds to run
wrote results for df  6
took 547.2950779530001 seconds to run
wrote results for df  7
took 596.2183222570002 seconds to run
wrote results for df  8
took 567.3641524929999 seconds to run
wrote results for df  9
took 565.2665151170004 seconds to run
wrote results for df  10
took 570.3391485190004 seconds to run
wrote results for df  11
took 558.8706188649994 seconds to run
wrote results for df  12
took 600.4019234949992 seconds to run
wrote results for df  13
took 661.2173991979998 seconds to run
wrote results for df  14
took 662.4690210930003 seconds to run
wrote results for df  15
took 629.1659048500005 seconds to run
wrote results for df  16
took 637.

# 4. Combine msgs back together

## 4.1. Read in the translated pickles and stitch back together

In [16]:
# Location and naming scheme of the saved translation chunks
path = '../data/gcloud_translation_results/'
filename = 'translated_msgs_'
ext = '.pkl'

# Load chunk 0 first; the remaining chunks are read separately
translated_msgs_init = pd.read_pickle(''.join([path, filename, '0', ext]))
print(translated_msgs_init.shape)

(1982, 8)


In [17]:
# Collect the remaining chunks (1 .. last) as dataframes
pickles = []

# Read each saved chunk, report its shape, and keep it in the list
for i in range(1, len(split_df)):
    df = pd.read_pickle("{}{}{}{}".format(path, filename, i, ext))
    print(df.shape)
    pickles += [df]

(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1982, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)
(1981, 8)


In [18]:
# Stack chunk 0 and all remaining chunks into one dataframe
translated_msgs = pd.concat([translated_msgs_init] + pickles)

print('Does the shape of the new dataframe match the one pre-translation? ')
len(translated_msgs) == len(df_to_translate)

Does the shape of the new dataframe match the one pre-translation? 


True

In [19]:
# How many messages were detected per source language
translated_msgs['detectedSourceLanguage'].value_counts()

en    27125
es    11821
fr      108
ca       59
pt       57
      ...  
et        1
ta        1
ga        1
uz        1
lv        1
Name: detectedSourceLanguage, Length: 71, dtype: int64

In [20]:
# Full list of detected language codes (value_counts truncates its display)
translated_msgs['detectedSourceLanguage'].unique()

array(['en', 'es', 'ro', 'el', 'tl', 'ca', 'ta', 'hu', 'zh-CN', 'ar',
       'pl', 'fr', 'so', 'az', 'gd', 'vi', 'it', 'pt', 'haw', 'id', 'cs',
       'tr', 'gl', 'sn', 'hi', 'sw', 'ht', 'ja', 'is', 'fi', 'da', 'ml',
       'cy', 'st', 'lt', 'sq', 'sk', 'hmn', 'la', 'sv', 'de', 'no', 'ha',
       'ms', 'eo', 'bg', 'bs', 'yo', 'hr', 'co', 'mt', 'mr', 'ru', 'ga',
       'nl', 'uz', 'ku', 'zu', 'bn', 'ceb', 'af', 'sl', 'mg', 'kn', 'ig',
       'te', 'mi', 'ny', 'gu', 'lv', 'et'], dtype=object)

In [21]:
# Explore a couple of messages detected as Spanish.
# A fixed seed (the same value this notebook uses for its 100-message API test
# sample) makes the rows shown reproducible on re-run.
translated_msgs[translated_msgs.detectedSourceLanguage == 'es'].sample(n=10, random_state=1118)

Unnamed: 0,id_content_deduped_no_personalization,content,content_len,non_english,output_list,translatedText,detectedSourceLanguage,orig_content
356951,id_77723,Gracias y nueva mente ofrezco mis disculpas,44.0,,"[Thank you and again I offer my apologies, es,...",Thank you and again I offer my apologies,es,Gracias y nueva mente ofrezco mis disculpas
363217,id_83517,Espero en Dios Que de encuentre bien,37.0,,"[I hope in God that you find yourself well, es...",I hope in God that you find yourself well,es,Espero en Dios Que de encuentre bien
289826,id_67546,Mrs. Dinora Ramirez quiero felicitarla por el ...,278.0,,[Mrs. Dinora Ramirez I want to congratulate yo...,Mrs. Dinora Ramirez I want to congratulate you...,es,Mrs. Dinora Ramirez quiero felicitarla por el ...
358712,id_79300,Ok gracias lo hare,18.0,,"[Ok thanks i will, es, Ok gracias lo hare]",Ok thanks i will,es,Ok gracias lo hare
56796,id_15016,Le agradesere mucho su informacion maestra,42.0,,"[I really appreciate your master information, ...",I really appreciate your master information,es,Le agradesere mucho su informacion maestra
61353,id_17625,"Buenas tardes sr Administrador, mi hijo es Bry...",130.0,,"[Good afternoon Mr. Administrator, my son is B...","Good afternoon Mr. Administrator, my son is Br...",es,"Buenas tardes sr Administrador, mi hijo es Bry..."
79767,id_27565,Estimada familia. Este S√°bado 8 de febrero Gr...,231.0,,"[Dear family. This Saturday, February 8, Grelv...","Dear family. This Saturday, February 8, Grelvi...",es,Estimada familia. Este S√°bado 8 de febrero Gr...
366879,id_87034,Entonces miguel va bien en sus trabajos y √°re...,60.0,,[So miguel is doing well in his jobs and areas...,So miguel is doing well in his jobs and areas ...,es,Entonces miguel va bien en sus trabajos y √°re...
356749,id_77549,Buenos dias \nMuchas gracias,29.0,,"[Good morning thank you very much, es, Buenos ...",Good morning thank you very much,es,Buenos dias \nMuchas gracias
368501,id_88604,Buenas tardes es una gran\nMaestra se prenupa ...,94.0,,"[Good afternoon, it&#39;s a great Teacher, be ...","Good afternoon, it&#39;s a great Teacher, be v...",es,Buenas tardes es una gran\nMaestra se prenupa ...


## 4.2. Add the translated text back to the non-translated deduped messages

In [22]:
# Grab the english ones in our original based on the common phrases
deduped_msgs_en = deduped_msgs_rm_enes[deduped_msgs_rm_enes.non_english=='0'].copy()


# Stack the untranslated English rows and the translated rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
deduped_msgs_w_translation = pd.concat([deduped_msgs_en, translated_msgs], ignore_index = True)

# In the old script (non-Google Cloud API), if we translated an English text with incorrect spelling, the 
# translation would do spell-check/correct. Looks like this isn't the case here
deduped_msgs_w_translation[(deduped_msgs_w_translation.detectedSourceLanguage == 'en') &
                           (deduped_msgs_w_translation.orig_content != deduped_msgs_w_translation.translatedText)]

Unnamed: 0,id_content_deduped_no_personalization,content,content_len,non_english,output_list,translatedText,detectedSourceLanguage,orig_content


## 4.3. Clean up deduped messages

In [23]:
# Drop the helper columns that are no longer needed
deduped_msgs_w_translation.drop(['content_len', 'output_list', 'orig_content'],
                                axis = 'columns',
                                inplace = True)

In [24]:
import html2text

def fix_html(one_row):
    '''
    Convert the html in one row's translated message to plain text.

    The message is read from the row's 'translatedText' entry and passed
    through html2text, which also decodes html entities.

    Example input:  Hello, I am Ms. Johnson. Vicky&#39;s teacher
    Example output: Hello, I am Ms. Johnson. Vicky's teacher

    Parameters
    ----------
    one_row : pandas.Series
        A df row with a 'translatedText' entry.

    Returns
    -------
    The converted text (html2text appends a trailing '\\n\\n'), or the
    original value unchanged when it cannot be converted (e.g. a missing
    value).
    '''
    
    message = one_row.loc['translatedText']
    
    html_conv = html2text.HTML2Text()
    # html2text hard-wraps its output at 78 characters by default, which
    # inserts newlines in the middle of longer messages; 0 disables wrapping.
    html_conv.body_width = 0
    
    try:
        converted_msg = html_conv.handle(message)
    except Exception:
        # if errors, e.g. with phone numbers, just go with the original message.
        # `Exception` rather than a bare except so an interrupt still stops the run.
        converted_msg = message
    
    return converted_msg

In [25]:
# Run every row through fix_html to get plain-text translations
deduped_msgs_w_translation['translatedText_rm_html_init'] = \
    deduped_msgs_w_translation.apply(fix_html, axis = 'columns')

# The conversion adds '\n\n', so trim newline characters from both ends of the text
deduped_msgs_w_translation['translatedText_rm_html'] = (
    deduped_msgs_w_translation['translatedText_rm_html_init'].str.strip('\n')
)

# Check results on the rows that have a translation
deduped_msgs_w_translation[deduped_msgs_w_translation['translatedText'].notna()].head()

# The intermediate column is only needed for the check above
deduped_msgs_w_translation.drop(columns = 'translatedText_rm_html_init',
                                inplace = True)

Unnamed: 0,id_content_deduped_no_personalization,content,non_english,translatedText,detectedSourceLanguage,translatedText_rm_html_init,translatedText_rm_html
55669,id_187,"ok, sent",,"ok, sent",en,"ok, sent\n\n","ok, sent"
55670,id_261,Ok cool! Marvin’s a great kid. I’ve known him ...,,Ok cool! Marvin’s a great kid. I’ve known him ...,en,Ok cool! Marvin’s a great kid. I’ve known him ...,Ok cool! Marvin’s a great kid. I’ve known him ...
55671,id_336,"I understand! If possible, can she complete he...",,"I understand! If possible, can she complete he...",en,"I understand! If possible, can she complete he...","I understand! If possible, can she complete he..."
55672,id_408,Will do!,,Will do!,en,Will do!\n\n,Will do!
55673,id_409,I am looking forward to being his teacher!,,I am looking forward to being his teacher!,en,I am looking forward to being his teacher!\n\n,I am looking forward to being his teacher!


In [34]:
# Rows whose API call failed carry the placeholder 'translation_error' in
# translatedText / detectedSourceLanguage. They were NOT translated, so they
# must not be flagged as translated or have their content replaced by the
# placeholder text.
translation_failed = deduped_msgs_w_translation.detectedSourceLanguage == 'translation_error'

# flag for whether the message was translated or not 
# for the english ones that were run through the translator, we'll keep that as no.
deduped_msgs_w_translation['translated'] = np.where((~deduped_msgs_w_translation.translatedText.isna()) &\
                                                    (deduped_msgs_w_translation.detectedSourceLanguage != 'en') &\
                                                    (~translation_failed),
                                                    1, 0
                                                    )

# new column with either the original content or the translated content
# (failed translations fall back to the original content because translated == 0)
deduped_msgs_w_translation['content_w_translation'] = np.where(deduped_msgs_w_translation.translated == 0, 
                                                               deduped_msgs_w_translation.content, 
                                                               deduped_msgs_w_translation.translatedText_rm_html)


# 1 when the message was sent to the translation API, 0 when the phrase filter
# had already marked it as English ('0')
deduped_msgs_w_translation['run_thru_translate'] = np.where(deduped_msgs_w_translation.non_english == '0', 
                                                            0, 1)

In [36]:
# Spot-check translated rows. A fixed seed (the same value this notebook uses for
# its 100-message API test sample) keeps the rows shown reproducible on re-run.
deduped_msgs_w_translation[deduped_msgs_w_translation.translated==1].sample(n=10, random_state=1118) 

Unnamed: 0,id_content_deduped_no_personalization,content,non_english,translatedText,detectedSourceLanguage,translatedText_rm_html,translated,content_w_translation,run_thru_translate
82070,id_77942,Ok si el lo puede reponer entre semana mejor y...,,"Ok if he can replace it during the week, I bet...",es,"Ok if he can replace it during the week, I bet...",1,"Ok if he can replace it during the week, I bet...",1
63037,id_27498,Estimada familia. Este S√°bado 8 de febrero Ka...,,"Dear family. This Saturday, February 8 Katerin...",es,"Dear family. This Saturday, February 8 Katerin...",1,"Dear family. This Saturday, February 8 Katerin...",1
93886,id_92964,Como lo puedo ayudar,,How can I help you,es,How can I help you,1,How can I help you,1
87866,id_85381,Quisiera saber por favor,,I would like to know please,es,I would like to know please,1,I would like to know please,1
57796,id_14771,Buenos días J. Warren. Muchas gracias por ésta...,,Good morning J. Warren. Thank you very much fo...,es,Good morning J. Warren. Thank you very much fo...,1,Good morning J. Warren. Thank you very much fo...,1
92715,id_91457,No puedo ingresar ala reuni√≥n,,I can not enter the meeting,es,I can not enter the meeting,1,I can not enter the meeting,1
87852,id_85364,Tengo malo el internet anbenido muchos tennico...,,I have bad the internet and many tennics have ...,es,I have bad the internet and many tennics have ...,1,I have bad the internet and many tennics have ...,1
72231,id_51284,"Buenas tardes, soy Mr. L√≥pez el maestro de qu...",,"Good afternoon, I am Mr. L√≥pez, Miguel&#39;s ...",es,"Good afternoon, I am Mr. L√≥pez, Miguel's chem...",1,"Good afternoon, I am Mr. L√≥pez, Miguel's chem...",1
92615,id_91329,En zoom abuses el tel√©fono no meda,,"In zoom you abuse the phone, I don&#39;t know",es,"In zoom you abuse the phone, I don't know",1,"In zoom you abuse the phone, I don't know",1
82745,id_78828,Hable con cindy por que razon llego tarde seg...,,I talked to cindy why I was late for second an...,es,I talked to cindy why I was late for second an...,1,I talked to cindy why I was late for second an...,1


## 4.4. Add back to the original set of messages w demographics

In [37]:
# Row/column count of the per-message table before the translation columns are merged in
all_msgs_new_id.shape

(375417, 69)

In [38]:
# Preview the first rows.
# NOTE(review): the rendered preview shows names and student ids -- clear the output before sharing.
all_msgs_new_id.head()

Unnamed: 0,school_name,role,sender_full_name,relationship,receiver_full_name,StudentID,student_name,date,time,sms_delivery_status,...,gender,ell,farms,at-risk,highest_swd_level,ward,osse_school_merge,source_of_match,id_content_deduped_no_personalization,content_len
0,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Arcenia Melendez,9218141,Estephanie Melendez,29/08/19,10:49:34 PM,delivered,...,Female,NO,CEP,NO,-,Ward 4,CHEC,osse_sy1920,id_1,278.0
1,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Loleata Griffin,9221959,Paige Griffin,29/08/19,10:49:34 PM,delivered,...,Female,NO,CEP,NO,-,Ward 4,CHEC,osse_sy1920,id_2,268.0
2,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Parent,Walter Mundo,9321550,Walter Mundo-Barillas,29/08/19,10:49:35 PM,delivered,...,Male,NO,CEP,NO,-,Ward 1,CHEC,osse_sy1920,id_3,270.0
3,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Marta Cecilia Martinez,9324834,Katerin Hernandez Martinez,29/08/19,10:49:35 PM,delivered,...,Female,YES,CEP,NO,-,Ward 4,CHEC,osse_sy1920,id_4,272.0
4,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Ilia Perez,9207914,Tania Mendez-Perez,29/08/19,10:49:35 PM,delivered,...,Female,NO,CEP,NO,-,Ward 5,CHEC,osse_sy1920,id_5,268.0


In [39]:
# Translation columns keyed by the deduplicated-content id; 'content' and
# 'non_english' are left out so they are not duplicated in the merge
translation_columns = deduped_msgs_w_translation.drop(columns = ['content', 'non_english'])

# Left join keeps every original message row
all_msgs_wtranslation = all_msgs_new_id.merge(translation_columns,
                                              how = 'left',
                                              on = 'id_content_deduped_no_personalization')

all_msgs_wtranslation.shape

(375417, 75)

In [40]:
# Preview the merged result.
# NOTE(review): the rendered preview shows names and student ids -- clear the output before sharing.
all_msgs_wtranslation.head()

Unnamed: 0,school_name,role,sender_full_name,relationship,receiver_full_name,StudentID,student_name,date,time,sms_delivery_status,...,osse_school_merge,source_of_match,id_content_deduped_no_personalization,content_len,translatedText,detectedSourceLanguage,translatedText_rm_html,translated,content_w_translation,run_thru_translate
0,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Arcenia Melendez,9218141,Estephanie Melendez,29/08/19,10:49:34 PM,delivered,...,CHEC,osse_sy1920,id_1,278.0,,,,0,Hello! This is Mr. Talarico and I'm excited to...,0
1,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Loleata Griffin,9221959,Paige Griffin,29/08/19,10:49:34 PM,delivered,...,CHEC,osse_sy1920,id_2,268.0,,,,0,Hello! This is Mr. Talarico and I'm excited to...,0
2,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Parent,Walter Mundo,9321550,Walter Mundo-Barillas,29/08/19,10:49:35 PM,delivered,...,CHEC,osse_sy1920,id_3,270.0,,,,0,Hello! This is Mr. Talarico and I'm excited to...,0
3,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Marta Cecilia Martinez,9324834,Katerin Hernandez Martinez,29/08/19,10:49:35 PM,delivered,...,CHEC,osse_sy1920,id_4,272.0,,,,0,Hello! This is Mr. Talarico and I'm excited to...,0
4,Columbia Heights EC (CHEC),Teacher,Joseph P. Talarico,Mother,Ilia Perez,9207914,Tania Mendez-Perez,29/08/19,10:49:35 PM,delivered,...,CHEC,osse_sy1920,id_5,268.0,,,,0,Hello! This is Mr. Talarico and I'm excited to...,0


In [41]:
# Earlier output path, kept for reference:
#all_msgs_wtranslation.to_pickle('../data/analysis_data/msgs_wdem_wtrans_1124.pkl')

# Write the messages with their translation columns to disk
all_msgs_wtranslation.to_pickle('../data/analysis_data/msgs_wdem_wtrans_1203.pkl')

# To delete

In [56]:
# RJ's suggestion to test out on 100 to see 
# Scratch cell: translates a fixed random sample of 100 messages and pickles the result.
# NOTE(review): `original_msgs_all` is not defined in any visible cell of this notebook,
# so this cell raises NameError on a fresh kernel -- confirm where it came from or remove the cell.
gtrans_api_sample100 = original_msgs_all.sample(n = 100, 
                                                random_state = 1118)

# one API call per sampled row; each result is a 3-item list
gtrans_api_sample100['output_list'] = gtrans_api_sample100.apply(google_translate_messages,
                                                                 axis = 1)

# unpack the 3-item lists into their own columns
gtrans_api_sample100[['translatedText', 'detectedSourceLanguage', 'orig_content']] = \
    pd.DataFrame(gtrans_api_sample100.output_list.to_list(),  
                 index = gtrans_api_sample100.index)

gtrans_api_sample100.to_pickle('../data/analysis_data/sample100_to_translate1119.pkl')

Unnamed: 0,id,content,output_list,translatedText,detectedSourceLanguage,orig_content
356019,id_313838,Good afternoon \nIf you want to ensure that y...,[Good afternoon \nIf you want to ensure that ...,Good afternoon \nIf you want to ensure that y...,en,Good afternoon \nIf you want to ensure that y...
333435,id_294083,https://calendly.com/david-perez-5/ptc-mr-pere...,[https://calendly.com/david-perez-5/ptc-mr-per...,https://calendly.com/david-perez-5/ptc-mr-pere...,en,https://calendly.com/david-perez-5/ptc-mr-pere...
421426,id_369171,Yes,"[Yes, en, Yes]",Yes,en,Yes
85532,id_75815,"Join us for coffee, this morning between 11am-...","[Join us for coffee, this morning between 11am...","Join us for coffee, this morning between 11am-...",en,"Join us for coffee, this morning between 11am-..."
78360,id_69096,"Hello parents, we are starting Spanish class f...","[Hello parents, we are starting Spanish class ...","Hello parents, we are starting Spanish class f...",en,"Hello parents, we are starting Spanish class f..."
...,...,...,...,...,...,...
352960,id_311057,Hello! We are at the last week of school! Deon...,[Hello! We are at the last week of school! Deo...,Hello! We are at the last week of school! Deon...,en,Hello! We are at the last week of school! Deon...
173286,id_153177,"Buenas tardes estudiantes, para ver c√≥mo ingr...","[Good afternoon students, to see how to enter ...","Good afternoon students, to see how to enter t...",es,"Buenas tardes estudiantes, para ver c√≥mo ingr..."
32711,id_29346,"Good morning,\r\n\r\nThis is just to confirm t...","[Good morning,\r\n\r\nThis is just to confirm ...","Good morning,\r\n\r\nThis is just to confirm t...",en,"Good morning,\r\n\r\nThis is just to confirm t..."
198255,id_174259,"Class, due to timing issues, I will have offic...","[Class, due to timing issues, I will have offi...","Class, due to timing issues, I will have offic...",en,"Class, due to timing issues, I will have offic..."
