# Data Augmentation

## Back-translation with open-source Hugging Face transformer models for NMT

In [1]:
from fastai2.text.all import *

In [2]:
from transformers import MarianMTModel, MarianTokenizer

In [3]:
path = Path('.')
path.ls()

(#20) [Path('Tweets_valid_translated.csv'),Path('training.1600000.processed.noemoticon.csv'),Path('tweets_ULMFit_QRNN_subword.ipynb'),Path('dls_lm.pkl'),Path('dls_lm_qrnn.pkl'),Path('Tweets_valid_full.csv'),Path('tweets_QRNN_Final.ipynb'),Path('.ipynb_checkpoints'),Path('tweets_Data-Aug.ipynb'),Path('tmp')...]

## Data loading

In [4]:
df = pd.read_csv(path/'Tweets_valid.csv')

In [7]:
# Tweets may span several lines; flatten each one onto a single line by
# turning every newline character into a space.
def _flatten(tweet): return tweet.replace('\n', ' ')
df['text'] = df['text'].map(_flatten)

## Back translation part 1: English-German-English

Define source and target language and model name

In [4]:
src = 'en'
trg = 'de'

In [9]:
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'

In [10]:
mname

'Helsinki-NLP/opus-mt-en-de'

Download the pretrained NMT model and the corresponding tokenizer from the Hugging Face model hub

In [11]:
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)



Put model on GPU to speed up inference

In [12]:
model.cuda();

Translation loop for translating from EN to DE:
- loops through the dataset in batches of size 32
- tokenizes the input texts
- runs inference on the pre-trained NMT model
- decodes the translated outputs from indices to words

In [13]:
# Forward half of the back-translation: send every tweet in df['text']
# through the currently loaded model (EN -> DE), one mini-batch at a time,
# and collect the decoded outputs in `translated` in row order.
bs = 32  # number of tweets passed to the model per generate() call
translated = []
# Batches are derived from their start offsets instead of a pre-computed
# batch count. This covers every row exactly once: the final batch is simply
# shorter, and no empty batch is produced when len(df) is an exact multiple
# of bs.
for start in range(0, len(df), bs):
    stop = min(start + bs, len(df))
    # Both bounds in the message are inclusive row positions.
    print(f'Selecting reviews {start} until {stop-1}')
    # Positional slicing, so the loop does not depend on df having a 0..n-1 index.
    src_texts = df['text'].iloc[start:stop]
    batch = tokenizer.prepare_translation_batch(src_texts=src_texts)
    print('translating...')
    gen = model.generate(batch.input_ids.cuda())
    print('decoding...')
    decoded = [tokenizer.decode(g, skip_special_tokens=True) for g in gen]
    translated.extend(decoded)

Selecting reviews 0 until 31
translating...
decoding...
Selecting reviews 32 until 63
translating...
decoding...
Selecting reviews 64 until 95
translating...
decoding...
Selecting reviews 96 until 127
translating...
decoding...
Selecting reviews 128 until 159
translating...
decoding...
Selecting reviews 160 until 191
translating...
decoding...
Selecting reviews 192 until 223
translating...
decoding...
Selecting reviews 224 until 255
translating...
decoding...
Selecting reviews 256 until 287
translating...
decoding...
Selecting reviews 288 until 319
translating...
decoding...
Selecting reviews 320 until 351
translating...
decoding...
Selecting reviews 352 until 383
translating...
decoding...
Selecting reviews 384 until 415
translating...
decoding...
Selecting reviews 416 until 447
translating...
decoding...
Selecting reviews 448 until 479
translating...
decoding...
Selecting reviews 480 until 511
translating...
decoding...
Selecting reviews 512 until 543
translating...
decoding...
Selec

decoding...
Selecting reviews 4352 until 4383
translating...
decoding...
Selecting reviews 4384 until 4415
translating...
decoding...
Selecting reviews 4416 until 4447
translating...
decoding...
Selecting reviews 4448 until 4479
translating...
decoding...
Selecting reviews 4480 until 4511
translating...
decoding...
Selecting reviews 4512 until 4543
translating...
decoding...
Selecting reviews 4544 until 4575
translating...
decoding...
Selecting reviews 4576 until 4607
translating...
decoding...
Selecting reviews 4608 until 4639
translating...
decoding...
Selecting reviews 4640 until 4671
translating...
decoding...
Selecting reviews 4672 until 4703
translating...
decoding...
Selecting reviews 4704 until 4735
translating...
decoding...
Selecting reviews 4736 until 4767
translating...
decoding...
Selecting reviews 4768 until 4799
translating...
decoding...
Selecting reviews 4800 until 4831
translating...
decoding...
Selecting reviews 4832 until 4863
translating...
decoding...
Selecting re

decoding...
Selecting reviews 8672 until 8703
translating...
decoding...
Selecting reviews 8704 until 8735
translating...
decoding...
Selecting reviews 8736 until 8767
translating...
decoding...
Selecting reviews 8768 until 8799
translating...
decoding...
Selecting reviews 8800 until 8831
translating...
decoding...
Selecting reviews 8832 until 8863
translating...
decoding...
Selecting reviews 8864 until 8895
translating...
decoding...
Selecting reviews 8896 until 8927
translating...
decoding...
Selecting reviews 8928 until 8959
translating...
decoding...
Selecting reviews 8960 until 8991
translating...
decoding...
Selecting reviews 8992 until 9023
translating...
decoding...
Selecting reviews 9024 until 9055
translating...
decoding...
Selecting reviews 9056 until 9087
translating...
decoding...
Selecting reviews 9088 until 9119
translating...
decoding...
Selecting reviews 9120 until 9151
translating...
decoding...
Selecting reviews 9152 until 9183
translating...
decoding...
Selecting re

decoding...
Selecting reviews 12896 until 12927
translating...
decoding...
Selecting reviews 12928 until 12959
translating...
decoding...
Selecting reviews 12960 until 12991
translating...
decoding...
Selecting reviews 12992 until 13023
translating...
decoding...
Selecting reviews 13024 until 13055
translating...
decoding...
Selecting reviews 13056 until 13087
translating...
decoding...
Selecting reviews 13088 until 13119
translating...
decoding...
Selecting reviews 13120 until 13151
translating...
decoding...
Selecting reviews 13152 until 13183
translating...
decoding...
Selecting reviews 13184 until 13215
translating...
decoding...
Selecting reviews 13216 until 13247
translating...
decoding...
Selecting reviews 13248 until 13279
translating...
decoding...
Selecting reviews 13280 until 13311
translating...
decoding...
Selecting reviews 13312 until 13343
translating...
decoding...
Selecting reviews 13344 until 13375
translating...
decoding...
Selecting reviews 13376 until 13407
transla

In [14]:
len(translated), len(df)

(14640, 14640)

In [15]:
df['text_de'] = translated

In [6]:
df[['text', 'text_de']].tail()

Unnamed: 0,text,text_de
14635,@SouthwestAir How do I stop getting credit card apps? I already have a card!,"@SouthwestAir Wie höre ich auf, Kreditkarten-Apps zu bekommen? Ich habe bereits eine Karte!"
14636,@united I was sincerely thanking the pilot of flight 4461 of braving the snow and getting me home amongst many other Cancelled Flightlations.,"@united Ich bedankte mich aufrichtig bei dem Piloten des Fluges 4461, der den Schnee ausbrütete und mich unter vielen anderen Cancelled Flightlations nach Hause brachte."
14637,"@united and btw, the @Virgin and @JetBlue managed on time departures same time same destination- &amp; wouldn't take away a 9year old kids bag!","@united und btw, die @Virgin und @JetBlue verwaltet auf Zeit Abfahrten gleichzeitig gleiche Ziel- &amp; würde nicht wegnehmen eine 9 Jahre alte Kindertasche!"
14638,"@USAirways #not happy, wife 40th Bday trip. would like to be fully compensated for both flights","@USAirways #nicht glücklich, Frau 40. Bday Reise. möchte für beide Flüge vollständig entschädigt werden"
14639,@Delta @JetBlue received a voucher but if you want to improve relations be up front with passengers.,"@Delta @JetBlue erhielt einen Gutschein, aber wenn Sie die Beziehungen zu den Passagieren verbessern wollen, sind Sie vorne."


Reversing source and target language to translate back from German to English

In [7]:
mname = f'Helsinki-NLP/opus-mt-{trg}-{src}'
mname

'Helsinki-NLP/opus-mt-de-en'

In [8]:
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)



In [9]:
model.cuda();

Now translating the German translations back to English

In [10]:
# Backward half of the back-translation: send the German texts in
# df['text_de'] through the currently loaded model (DE -> EN), one mini-batch
# at a time, and collect the decoded outputs in `translated_bwd` in row order.
bs = 32  # number of texts passed to the model per generate() call
translated_bwd = []
# Batches are derived from their start offsets instead of a pre-computed
# batch count. This covers every row exactly once: the final batch is simply
# shorter, and no empty batch is produced when len(df) is an exact multiple
# of bs.
for start in range(0, len(df), bs):
    stop = min(start + bs, len(df))
    # Both bounds in the message are inclusive row positions.
    print(f'Selecting reviews {start} until {stop-1}')
    # Positional slicing, so the loop does not depend on df having a 0..n-1 index.
    src_texts = df['text_de'].iloc[start:stop]
    batch = tokenizer.prepare_translation_batch(src_texts=src_texts)
    print('translating...')
    gen = model.generate(batch.input_ids.cuda())
    print('decoding...')
    decoded = [tokenizer.decode(g, skip_special_tokens=True) for g in gen]
    translated_bwd.extend(decoded)

Selecting reviews 0 until 31
translating...
decoding...
Selecting reviews 32 until 63
translating...
decoding...
Selecting reviews 64 until 95
translating...
decoding...
Selecting reviews 96 until 127
translating...
decoding...
Selecting reviews 128 until 159
translating...
decoding...
Selecting reviews 160 until 191
translating...
decoding...
Selecting reviews 192 until 223
translating...
decoding...
Selecting reviews 224 until 255
translating...
decoding...
Selecting reviews 256 until 287
translating...
decoding...
Selecting reviews 288 until 319
translating...
decoding...
Selecting reviews 320 until 351
translating...
decoding...
Selecting reviews 352 until 383
translating...
decoding...
Selecting reviews 384 until 415
translating...
decoding...
Selecting reviews 416 until 447
translating...
decoding...
Selecting reviews 448 until 479
translating...
decoding...
Selecting reviews 480 until 511
translating...
decoding...
Selecting reviews 512 until 543
translating...
decoding...
Selec

decoding...
Selecting reviews 4352 until 4383
translating...
decoding...
Selecting reviews 4384 until 4415
translating...
decoding...
Selecting reviews 4416 until 4447
translating...
decoding...
Selecting reviews 4448 until 4479
translating...
decoding...
Selecting reviews 4480 until 4511
translating...
decoding...
Selecting reviews 4512 until 4543
translating...
decoding...
Selecting reviews 4544 until 4575
translating...
decoding...
Selecting reviews 4576 until 4607
translating...
decoding...
Selecting reviews 4608 until 4639
translating...
decoding...
Selecting reviews 4640 until 4671
translating...
decoding...
Selecting reviews 4672 until 4703
translating...
decoding...
Selecting reviews 4704 until 4735
translating...
decoding...
Selecting reviews 4736 until 4767
translating...
decoding...
Selecting reviews 4768 until 4799
translating...
decoding...
Selecting reviews 4800 until 4831
translating...
decoding...
Selecting reviews 4832 until 4863
translating...
decoding...
Selecting re

decoding...
Selecting reviews 8672 until 8703
translating...
decoding...
Selecting reviews 8704 until 8735
translating...
decoding...
Selecting reviews 8736 until 8767
translating...
decoding...
Selecting reviews 8768 until 8799
translating...
decoding...
Selecting reviews 8800 until 8831
translating...
decoding...
Selecting reviews 8832 until 8863
translating...
decoding...
Selecting reviews 8864 until 8895
translating...
decoding...
Selecting reviews 8896 until 8927
translating...
decoding...
Selecting reviews 8928 until 8959
translating...
decoding...
Selecting reviews 8960 until 8991
translating...
decoding...
Selecting reviews 8992 until 9023
translating...
decoding...
Selecting reviews 9024 until 9055
translating...
decoding...
Selecting reviews 9056 until 9087
translating...
decoding...
Selecting reviews 9088 until 9119
translating...
decoding...
Selecting reviews 9120 until 9151
translating...
decoding...
Selecting reviews 9152 until 9183
translating...
decoding...
Selecting re

decoding...
Selecting reviews 12896 until 12927
translating...
decoding...
Selecting reviews 12928 until 12959
translating...
decoding...
Selecting reviews 12960 until 12991
translating...
decoding...
Selecting reviews 12992 until 13023
translating...
decoding...
Selecting reviews 13024 until 13055
translating...
decoding...
Selecting reviews 13056 until 13087
translating...
decoding...
Selecting reviews 13088 until 13119
translating...
decoding...
Selecting reviews 13120 until 13151
translating...
decoding...
Selecting reviews 13152 until 13183
translating...
decoding...
Selecting reviews 13184 until 13215
translating...
decoding...
Selecting reviews 13216 until 13247
translating...
decoding...
Selecting reviews 13248 until 13279
translating...
decoding...
Selecting reviews 13280 until 13311
translating...
decoding...
Selecting reviews 13312 until 13343
translating...
decoding...
Selecting reviews 13344 until 13375
translating...
decoding...
Selecting reviews 13376 until 13407
transla

In [11]:
len(df), len(translated_bwd)

(14640, 14640)

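A few pandas operations to build a DataFrame of the back-translated tweets: keep the label columns, replace the text with its back-translation, and add a `translated` flag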

In [49]:
# Keep only the columns needed for the augmented rows. The explicit .copy()
# makes df_new an independent frame, so the column assignments in the cells
# below modify df_new itself and do not trigger pandas' SettingWithCopyWarning.
df_new = df[['airline_sentiment', 'airline_sentiment_confidence', 'text', 'is_valid', 'name']].copy()

In [50]:
# Overwrite the original tweets with their back-translated versions.
# translated_bwd was filled batch by batch in row order, so entry i belongs to row i.
df_new.loc[:,'text'] = translated_bwd

In [51]:
# The 'name' column is reused as a flag column: every row of df_new is set
# to True here, and the column is renamed to 'translated' in a later cell.
df_new.loc[:,'name'] = True

In [54]:
# Give the column its final name, 'translated' (it was called 'name' so far).
df_new = df_new.rename(columns={'name': 'translated'})

In [55]:
df_new.head()

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,text,is_valid,translated
0,negative,1.0,@united sit on UAL 683 - a comedy of mistakes. UAL is incompetent,False,True
1,neutral,0.66,@SouthwestAir When is the last chance to get #DestinationDragons tickets? I would die of luck if I won -,False,True
2,negative,1.0,@united Why the hell are my miles running out? Was really excited enough for a free flight in 10 years.,False,True
3,neutral,0.6694,@SouthwestAir when we can expect customer service to be available in Dallas,False,True
4,negative,1.0,We waited 40 min for our bags after a 45 min flight #nomorecheckedbags,False,True


In [56]:
df_new.to_csv('Tweets_valid_translated.csv', index=False)

In [58]:
df_full = pd.concat([df, df_new]).reset_index(drop=True)

In [62]:
len(df_full)

29280

In [64]:
df_full[df_full.translated==True].head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text_old,tweet_coord,tweet_created,tweet_location,user_timezone,is_valid,text,text_de,translated
14640,,negative,1.0,,,,,,,,,,,,,False,@united sit on UAL 683 - a comedy of mistakes. UAL is incompetent,,True
14641,,neutral,0.66,,,,,,,,,,,,,False,@SouthwestAir When is the last chance to get #DestinationDragons tickets? I would die of luck if I won -,,True
14642,,negative,1.0,,,,,,,,,,,,,False,@united Why the hell are my miles running out? Was really excited enough for a free flight in 10 years.,,True
14643,,neutral,0.6694,,,,,,,,,,,,,False,@SouthwestAir when we can expect customer service to be available in Dallas,,True
14644,,negative,1.0,,,,,,,,,,,,,False,We waited 40 min for our bags after a 45 min flight #nomorecheckedbags,,True


In [65]:
df_full.to_csv('Tweets_valid_full.csv', index=False)

## Back translation part 2: English-French-English

Doing exactly the same as above, but using French instead of German as the intermediate language. Note that this second round of back-translation is only used to further increase the number of neutral and positive training examples, in order to obtain a balanced training dataset. Negative examples and the validation set are excluded from this step.

In [4]:
df = pd.read_csv('Tweets_valid_full.csv')

In [5]:
# Every row whose 'translated' value is not exactly True (including missing
# values) is marked as not translated.
idx = df.index[df.translated != True]
df.loc[idx, 'translated'] = False

In [6]:
# Class distribution of the training portion (validation rows excluded).
cnt = Counter(df.loc[df.is_valid == False, 'airline_sentiment'])
cnt

Counter({'negative': 14624, 'neutral': 5018, 'positive': 3782})

In [7]:
# Target size for each minority class is half the number of negative
# examples; the differences are the number of extra examples still needed.
half_neg = cnt['negative'] / 2
diff_neu = int(half_neg - cnt['neutral'])
diff_pos = int(half_neg - cnt['positive'])
diff_neu, diff_pos

(2294, 3530)

In [8]:
# Draw the training rows that will receive a second (French) back-translation.
# A fixed random_state makes the selection, and therefore the final augmented
# dataset, reproducible across kernel restarts.
df_transl_neu = df[(df.is_valid==False) & (df.translated==False) & (df.airline_sentiment=='neutral')].sample(n=diff_neu, random_state=42)
# NOTE(review): unlike the neutral pool, the positive pool is not restricted
# to translated==False, so rows that are already back-translations can be
# drawn too. Presumably this is needed because diff_pos exceeds the number of
# original positive tweets; confirm this is intended.
df_transl_pos = df[(df.is_valid==False) & (df.airline_sentiment=='positive')].sample(n=diff_pos, random_state=42)
len(df_transl_neu), len(df_transl_pos)

(2294, 3530)

In [9]:
df_transl = pd.concat([df_transl_neu, df_transl_pos]).reset_index(drop=True)
len(df_transl)

5824

In [10]:
src = 'en'
trg = 'fr'

In [11]:
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
mname

'Helsinki-NLP/opus-mt-en-fr'

In [12]:
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)



In [13]:
model.cuda();

In [14]:
# Forward half of the second back-translation: send every tweet in
# df_transl['text'] through the currently loaded model (EN -> FR), one
# mini-batch at a time, and collect the decoded outputs in `translated`.
bs = 32  # number of tweets passed to the model per generate() call
translated = []
# Batches are derived from their start offsets instead of a pre-computed
# batch count, so a trailing partial batch is translated as well. With a
# count of int(len(df_transl)/bs) the rows after the last full batch are
# skipped whenever the length is not an exact multiple of bs.
for start in range(0, len(df_transl), bs):
    stop = min(start + bs, len(df_transl))
    # Both bounds in the message are inclusive row positions.
    print(f'Selecting reviews {start} until {stop-1}')
    # Positional slicing, so the loop does not depend on a 0..n-1 index.
    src_texts = df_transl['text'].iloc[start:stop]
    batch = tokenizer.prepare_translation_batch(src_texts=src_texts)
    print('translating...')
    gen = model.generate(batch.input_ids.cuda())
    print('decoding...')
    decoded = [tokenizer.decode(g, skip_special_tokens=True) for g in gen]
    translated.extend(decoded)

Selecting reviews 0 until 31
translating...
decoding...
Selecting reviews 32 until 63
translating...
decoding...
Selecting reviews 64 until 95
translating...
decoding...
Selecting reviews 96 until 127
translating...
decoding...
Selecting reviews 128 until 159
translating...
decoding...
Selecting reviews 160 until 191
translating...
decoding...
Selecting reviews 192 until 223
translating...
decoding...
Selecting reviews 224 until 255
translating...
decoding...
Selecting reviews 256 until 287
translating...
decoding...
Selecting reviews 288 until 319
translating...
decoding...
Selecting reviews 320 until 351
translating...
decoding...
Selecting reviews 352 until 383
translating...
decoding...
Selecting reviews 384 until 415
translating...
decoding...
Selecting reviews 416 until 447
translating...
decoding...
Selecting reviews 448 until 479
translating...
decoding...
Selecting reviews 480 until 511
translating...
decoding...
Selecting reviews 512 until 543
translating...
decoding...
Selec

decoding...
Selecting reviews 4352 until 4383
translating...
decoding...
Selecting reviews 4384 until 4415
translating...
decoding...
Selecting reviews 4416 until 4447
translating...
decoding...
Selecting reviews 4448 until 4479
translating...
decoding...
Selecting reviews 4480 until 4511
translating...
decoding...
Selecting reviews 4512 until 4543
translating...
decoding...
Selecting reviews 4544 until 4575
translating...
decoding...
Selecting reviews 4576 until 4607
translating...
decoding...
Selecting reviews 4608 until 4639
translating...
decoding...
Selecting reviews 4640 until 4671
translating...
decoding...
Selecting reviews 4672 until 4703
translating...
decoding...
Selecting reviews 4704 until 4735
translating...
decoding...
Selecting reviews 4736 until 4767
translating...
decoding...
Selecting reviews 4768 until 4799
translating...
decoding...
Selecting reviews 4800 until 4831
translating...
decoding...
Selecting reviews 4832 until 4863
translating...
decoding...
Selecting re

In [15]:
len(translated), len(df_transl)

(5824, 5824)

In [16]:
translated[:10]

['@SouthwestAir a réservé la 8ème.... Sauvez-moi une issue de secours...',
 "@united oui! Je vais envoyer un e-mail au service à la clientèle aujourd'hui.",
 '@united Était sur NH10 sur United ticket, réacheminé vers IAD en raison de la météo à JFK. Pouvez-vous nous ramener à la maison sur United 5713 ou 3277?',
 '@united pls suivent pour DM.',
 "@Delta @JetBlue s'il vous plaît commencer à voler à Guyana bientôt",
 "@Vol d'avant-vente vers #miami depuis #EWR. Voyage annuel #SOBEWFF. Maintenant.",
 "@Delta @JetBlue The Opal Dragon book The Dragon (ALI) a tissé ses voies de meurtre des Philippines à l'Australie http://t.co/ltwhmOL1Dr",
 '@USAirways Oui, je comprends cela. Je demande: Plus précisément, que ferez-vous des commentaires que je vous ai donnés?',
 "@united se produit à chaque fois à l'intérieur et à la sortie de Newark.",
 "@united J'ai envoyé un email décrivant mon expérience. J'ai hâte d'avoir un suivi pour le rendre bien."]

In [17]:
df_transl['text_fr'] = translated

In [18]:
df_transl[['text', 'text_fr']].tail()

Unnamed: 0,text,text_fr
5819,@SouthwestAir yes. Thank you. Oct 25-oct 31,@SouthwestAir oui. Merci. 25 oct. 31
5820,@SouthwestAir thank you for your help resolving my problem Shannon ROCKS - even though Rhonda didn't !!,@SouthwestAir merci pour votre aide pour résoudre mon problème Shannon ROCKS - même si Rhonda ne l'a pas fait!!
5821,"@united does a good thing, keeping flight for a few for 11 peeps on late flight connecting flight. We will still destination on time","@united fait une bonne chose, en gardant le vol pour quelques-uns pour 11 peeps sur le vol de correspondance en retard. Nous serons toujours destination à l'heure"
5822,@united thanks for leaving our 3 year old in his own row flight 360 LAX-IAD,@united merci d'avoir laissé notre 3 ans dans son propre vol de ligne 360 LAX-IAD
5823,@SouthwestAir great example of customer service this morning at MSY went to ATL. Alison and Bobbi were fantastic! Gate B8. Thank you.,@SouthwestAir excellent exemple de service à la clientèle ce matin à MOY est allé à ATL. Alison et Bobbi étaient fantastiques! Porte B8. Merci.


In [19]:
mname = f'Helsinki-NLP/opus-mt-{trg}-{src}'
mname

'Helsinki-NLP/opus-mt-fr-en'

In [20]:
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)



In [21]:
model.cuda();

In [22]:
# Backward half of the second back-translation: send the French texts in
# df_transl['text_fr'] through the currently loaded model (FR -> EN), one
# mini-batch at a time, and collect the decoded outputs in `translated_bwd`.
bs = 32  # number of texts passed to the model per generate() call
translated_bwd = []
# Batches are derived from their start offsets instead of a pre-computed
# batch count, so a trailing partial batch is translated as well. With a
# count of int(len(df_transl)/bs) the rows after the last full batch are
# skipped whenever the length is not an exact multiple of bs.
for start in range(0, len(df_transl), bs):
    stop = min(start + bs, len(df_transl))
    # Both bounds in the message are inclusive row positions.
    print(f'Selecting reviews {start} until {stop-1}')
    # Positional slicing, so the loop does not depend on a 0..n-1 index.
    src_texts = df_transl['text_fr'].iloc[start:stop]
    batch = tokenizer.prepare_translation_batch(src_texts=src_texts)
    print('translating...')
    gen = model.generate(batch.input_ids.cuda())
    print('decoding...')
    decoded = [tokenizer.decode(g, skip_special_tokens=True) for g in gen]
    translated_bwd.extend(decoded)

Selecting reviews 0 until 31
translating...
decoding...
Selecting reviews 32 until 63
translating...
decoding...
Selecting reviews 64 until 95
translating...
decoding...
Selecting reviews 96 until 127
translating...
decoding...
Selecting reviews 128 until 159
translating...
decoding...
Selecting reviews 160 until 191
translating...
decoding...
Selecting reviews 192 until 223
translating...
decoding...
Selecting reviews 224 until 255
translating...
decoding...
Selecting reviews 256 until 287
translating...
decoding...
Selecting reviews 288 until 319
translating...
decoding...
Selecting reviews 320 until 351
translating...
decoding...
Selecting reviews 352 until 383
translating...
decoding...
Selecting reviews 384 until 415
translating...
decoding...
Selecting reviews 416 until 447
translating...
decoding...
Selecting reviews 448 until 479
translating...
decoding...
Selecting reviews 480 until 511
translating...
decoding...
Selecting reviews 512 until 543
translating...
decoding...
Selec

decoding...
Selecting reviews 4352 until 4383
translating...
decoding...
Selecting reviews 4384 until 4415
translating...
decoding...
Selecting reviews 4416 until 4447
translating...
decoding...
Selecting reviews 4448 until 4479
translating...
decoding...
Selecting reviews 4480 until 4511
translating...
decoding...
Selecting reviews 4512 until 4543
translating...
decoding...
Selecting reviews 4544 until 4575
translating...
decoding...
Selecting reviews 4576 until 4607
translating...
decoding...
Selecting reviews 4608 until 4639
translating...
decoding...
Selecting reviews 4640 until 4671
translating...
decoding...
Selecting reviews 4672 until 4703
translating...
decoding...
Selecting reviews 4704 until 4735
translating...
decoding...
Selecting reviews 4736 until 4767
translating...
decoding...
Selecting reviews 4768 until 4799
translating...
decoding...
Selecting reviews 4800 until 4831
translating...
decoding...
Selecting reviews 4832 until 4863
translating...
decoding...
Selecting re

In [23]:
len(df_transl), len(translated_bwd)

(5824, 5824)

In [24]:
df_transl['text'] = translated_bwd

In [25]:
df_transl['translated'] = True

In [26]:
df_new = pd.concat([df, df_transl]).reset_index(drop=True)

In [27]:
len(df_new)

35104

In [28]:
# Class distribution of the training portion after both augmentation rounds.
Counter(df_new.loc[df_new.is_valid == False, 'airline_sentiment'])

Counter({'negative': 14624, 'neutral': 7312, 'positive': 7312})

In [29]:
df_new.to_csv('Tweets_final_aug.csv', index=False)