## BERT Sentiment Analysis

In [1]:
import pandas as pd
import spacy
from transformers import pipeline
import re

In [2]:
data = pd.read_csv("data/refugee_coca_foranalysis.csv")

In [3]:
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

def sent_tokenize(word_list, model=nlp):
    doc = model(word_list)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences

In [4]:
data['sentences'] = data['text'].apply(sent_tokenize)

In [5]:
#5-year periods
def get_period(year, startyr, endyr, n=5):
    period_start = []
    for i in range(startyr, endyr+1, n):
        period_start.append(i)
    for index, p in enumerate(period_start):
        if year >= p:
            period = index
            continue
        else:
            break
    return period  
    
data['period'] = data['year'].apply(lambda x: get_period(x, 1991, 2015, n=5) if x>=1991 else 0)

In [6]:
# Write a function that let us conveniently label ideology: Left, Neutral, Right
media_ideology = {}

def label_ideology(media_title, ideology, media_ideology = media_ideology):
    media_ideology[media_title] = ideology
    
label_ideology('Money', 'Neutral')
label_ideology('MotherEarth', 'Neutral')
label_ideology('MotherJones', 'Left')
label_ideology('AmHeritage', 'Neutral')
label_ideology('AmSpect', 'Right')
label_ideology('Forbes', 'Right')
label_ideology('NatlReview', 'Right')
label_ideology('Newsweek', 'Left')
label_ideology('ScienceNews', 'Neutral')
label_ideology('Smithsonian', 'Neutral')
label_ideology('USNWR', 'Left')
label_ideology('WashMonth', 'Left')
label_ideology('ChangingTimes', 'Right')
label_ideology('HistoryToday', 'Neutral')
label_ideology('Omni', 'Neutral')
label_ideology('Wilderness', 'Neutral')
label_ideology('TIME', 'Left')
label_ideology('NatlParks', 'Neutral')
label_ideology('AmerArtist', 'Neutral')
label_ideology('RollingStone', 'Left')
label_ideology('Americas', 'Neutral')
label_ideology('SportsIll', 'Neutral')
label_ideology('Ms', 'Left')
label_ideology('PopScience', 'Neutral')
label_ideology('Futurist', 'Neutral')
label_ideology('HarpersMag', 'Left')
label_ideology('Fortune', 'Right')
label_ideology('USAToday', 'Left')
label_ideology('America', 'Left')
label_ideology('ChristCentury', 'Right')
label_ideology('People', 'Left')
label_ideology('Jet', 'Left')
label_ideology('Aging', 'Neutral')
label_ideology('Horticulture', 'Neutral')
label_ideology('NewRepublic', 'Left')
label_ideology('Conservation', 'Left')
label_ideology('NaturalHist', 'Neutral')
label_ideology('Atlantic', 'Left')
label_ideology('Inc.', 'Neutral')
label_ideology('ChildrenToday', 'Neutral')
label_ideology('Ebony', 'Left')
label_ideology('ConsumResrch', 'Neutral')
label_ideology('SatEvenPost', 'Neutral')
label_ideology('ChristToday', 'Right')
label_ideology('Backpacker', 'Neutral')
label_ideology('AmericanCraft', 'Neutral')
label_ideology('ArtAmerica', 'Neutral')
label_ideology('SportingNews', 'Neutral')
label_ideology('MensHealth', 'Neutral')
label_ideology('Antiques', 'Neutral')
label_ideology('Parenting', 'Neutral')
label_ideology('Essence', 'Neutral')
label_ideology('Environmental', 'Neutral')
label_ideology('USCatholic', 'Right')
label_ideology('MilitaryHist', 'Neutral')
label_ideology('PsychToday', 'Neutral')
label_ideology('Cosmopolitan', 'Left')
label_ideology('Redbook', 'Neutral')
label_ideology('Bazaar', 'Left')
label_ideology('ChildDigest', 'Neutral')
label_ideology('Bicycling', 'Neutral')
label_ideology('Shape', 'Neutral')
label_ideology('NatGeog', 'Neutral')
label_ideology('Entertainment', 'Neutral')
label_ideology('Astronomy', 'Neutral')
label_ideology('TownCountry', 'Neutral')
label_ideology('TotalHealth', 'Neutral')
label_ideology('Esquire', 'Left')
label_ideology('FieldStream', 'Neutral')
label_ideology('TechReview', 'Neutral')
label_ideology('CountryLiving', 'Neutral')
label_ideology('VegTimes', 'Neutral')
label_ideology('SouthernLiv', 'Neutral')
label_ideology('Skiing', 'Neutral')
label_ideology('ConsumRep', 'Neutral')
label_ideology('Sunset', 'Neutral')
label_ideology('HarpersBazaar', 'Neutral')
label_ideology('AmericanSpectator', 'Right')
label_ideology('GoodHousekeeping', 'Neutral')
label_ideology('PopMech', 'Neutral')
label_ideology('MHQTheQuarterly', 'Neutral')
label_ideology('TodaysParent', 'Neutral')
label_ideology('NationalGeographic', 'Neutral')
label_ideology('EEnvironmental', 'Neutral')
label_ideology('ParentingEarly', 'Neutral')
label_ideology('ABC', 'Left')
label_ideology('CNN', 'Left')
label_ideology('PBS', 'Left')
label_ideology('CBS', 'Left')
label_ideology('Ind', 'Left')
label_ideology('NPR', 'Left')
label_ideology('NBC', 'Left')
label_ideology('Fox', 'Right')
label_ideology('MSNBC', 'Left')
label_ideology('NYTimes', 'Left')
label_ideology('CSMonitor', 'Neutral')
label_ideology('AssocPress', 'Neutral')
label_ideology('WashPost', 'Left')
label_ideology('SanFranChron', 'Left')
label_ideology('Atlanta', 'Left') #Atlanta Journal Constitution
label_ideology('Houston', 'Left') #Houston Chronicle
label_ideology('Chicago', 'Left') #Chicago Sun-Times
label_ideology('Denver', 'Left') #Denver Post
label_ideology('GolfMag', 'Neutral')
label_ideology('NewStatesman', 'Left')
label_ideology('Austin', 'Left') #Austin American Statesman
label_ideology('STLouis', 'Left') #St Louis Post_Dispatch
label_ideology('Pittsburgh', 'Right') #Pittsburgh Post-Gazette
label_ideology('OrangeCR', 'Right') #Orange County Register

#add political leaning label
def add_ideology(x, media_ideology=media_ideology):
    try_split = re.split("_|: | ",x)
    if len(try_split)>1:
        x = try_split[0]
    if x in media_ideology:
        return media_ideology[x]
    else:
        print('{} does not exists'.format(x))

data["ideology"] = data['source'].apply(add_ideology)

In [7]:
data.head()

Unnamed: 0,text_id,text,word_count,year,genre,subgen,source,title,publication_info,sentences,period,ideology
0,2018849,""" bums . "" that 's what radio havana called ...",2950,1990,MAG,124.0,Money,This is the land of opportunity.,"Vol. 19 Issue 8, p98, 8p, 1 chart, 3c, 4bw\r\n","["" bums ., "" that 's what radio havana called ...",0,Neutral
1,2018850,section : investing expanding petrochemical ...,2514,1990,MAG,124.0,Money,Betting on regional booms.,"Vol. 19 Issue 8, p110, 5p, 1 chart, 2c\r\n",[section : investing expanding petrochemical ...,0,Neutral
2,2019006,section : clothes that work american history ...,1667,1990,MAG,130.0,MotherEarth,The evolution of jeans.,"p60, 4p, 5c, 2bw\r\n","[section :, clothes that work american history...",0,Neutral
3,2019061,section : movements from socialist to republi...,1754,1990,MAG,123.0,MotherJones,Serve the people.,"Vol. 15 Issue 5, p18, 3p, 1 illustration\r\n","[section :, movements from socialist to republ...",0,Left
4,2019063,inside a dusty cement-block house with worn ...,6032,1990,MAG,123.0,MotherJones,No road to Tahuanti.,"Vol. 15 Issue 5, p36, 11p, 8bw\r\n",[inside a dusty cement-block house with worn l...,0,Left


In [8]:
# Allocate a pipeline for sentiment-analysis
nlp_sentiment = pipeline('sentiment-analysis')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [11]:
periods = data.period.unique()

sentences = {}
sentiment_over_period = {}
sentiment_scores = []
for i in ['Left', 'Right']:
    for p in periods:
        data_period = data[(data['period'] == p) & (data['ideology']==i)]
        for sent in data_period['sentences'].sum():
            if 'refugee' in sent: 
                if '{}_{}'.format(p,i) not in sentences:
                    sentences['{}_{}'.format(p,i)] = []
                sentences['{}_{}'.format(p,i)].append(sent)
                sentiment = nlp_sentiment(sent)
                polarity = sentiment[0]['label']
                score = sentiment[0]['score']

                if polarity=='NEGATIVE':
                    score = -score

                sentiment_scores.append(score)
        avg = sum(sentiment_scores) / len(sentiment_scores)
        sentiment_over_period['{}_{}'.format(p,i)] = avg

In [12]:
sentences

{'0_Left': ['later , they met three teenage refugees from honduras , the last surviving males from their village , who described seeing friends killed by the military or ground down by poverty .',
  "when the refugees brought out guitars , the students led off with the only songs they could think of , the themes from gilligan 's island and the brady           with loss , offered their own ballads , about homes left behind and their hopes for justice .",
  "as we pass a family of refugee indians from the highlands , dressed in traditional clothes and paddling downstream , mishari puts a sharp point on all the talk we 've been hearing from local indians about the threat posed by intruders .",
  'poland and czechoslovakia are turning former soviet army barracks into emergency refugee camps .',
  'refugees from a destabilized soviet union , " said a government spokesman , " could overwhelm our ability to absorb them .',
  "germany , lying farthest east and absorbingmore than half of easter

In [13]:
sentiment_over_period

{'0_Left': -0.35015882635449386,
 '1_Left': -0.3316215306730695,
 '2_Left': -0.3188904572965666,
 '3_Left': -0.31331060445336034,
 '4_Left': -0.29713114655137407,
 '0_Right': -0.2998039719408499,
 '1_Right': -0.29781253679066527,
 '2_Right': -0.29802910279604344,
 '3_Right': -0.2976948994699104,
 '4_Right': -0.2977983650316607}

## BERT Text Generation

In [14]:
from transformers import AutoModelWithLMHead, AutoTokenizer

In [15]:
def textgen_results(sequence):
    refugees_textgen = {}
    tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")
    model_gpt = AutoModelWithLMHead.from_pretrained("gpt2")

    input = tokenizer_gpt.encode(sequence, return_tensors="pt")
    generated = model_gpt.generate(input, max_length=50)
    resulting_string = tokenizer_gpt.decode(generated.tolist()[0])
    refugees_textgen['gpt'] = resulting_string

    for ideology in ['left','right']:
        tokenizer = AutoTokenizer.from_pretrained("bertresults/output_gpt_{}".format(ideology))
        model = AutoModelWithLMHead.from_pretrained("bertresults/output_gpt_{}".format(ideology))

        input = tokenizer.encode(sequence, return_tensors="pt")
        generated = model.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)
        resulting_string = tokenizer.decode(generated.tolist()[0])
        refugees_textgen['all_{}'.format(ideology)] = resulting_string
        
        for period in range(0,5):
            tokenizer = AutoTokenizer.from_pretrained("bertresults/output_gpt_period{}_{}".format(period, ideology))
            model = AutoModelWithLMHead.from_pretrained("bertresults/output_gpt_period{}_{}".format(period, ideology))

            input = tokenizer.encode(sequence, return_tensors="pt")
            generated = model.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)
            resulting_string = tokenizer.decode(generated.tolist()[0])

            refugees_textgen['period{}_{}'.format(period, ideology)] = resulting_string
    return refugees_textgen

In [16]:
refugees_textgen = textgen_results("Refugees are")
refugees_textgen

{'gpt': 'Refugees are not allowed to enter the country.\n\nThe government has said it will not allow refugees to enter the country.\n\nThe government has said it will not allow refugees to enter the country.\n\nThe government has said!',
 'all_left': 'Refugees are being held in camps in the middle east. the united states has been sending troops to the region. the united states has been sending troops to the region. and the united states is sending troops to the region. and the united"',
 'period0_left': 'Refugees are being held in camps in the middle east, and the u.n. has been trying to get them out of there.!mr-lehrer : well, i think that the u.n. has been trying"',
 'period1_left': 'Refugees are being held in camps in the north of the country. the government has said that they will be released if they are found guilty.  "" we\'re not going to let them go, "" said a senior government official."',
 'period2_left': 'Refugees are being held in the camp of the refugees, where they are be

In [17]:
textgen_results("Israeli refugees are")

{'gpt': 'Israeli refugees are being held in detention centers in Turkey, where they are being held in a detention center for the first time since the war began.\n\nThe Turkish government has said it will not allow the refugees to return to Turkey, and has!',
 'all_left': 'Israeli refugees are being held in camps in the middle east. the united states has been trying to get them to return to their homes. but the united states has not been able to do that.!mr-macneil : and the united"',
 'period0_left': 'Israeli refugees are being held in camps in the middle east. the united states has been trying to get them out of the camps. the united states has been trying to get them out of the camps. the united states has been trying to get them"',
 'period1_left': 'Israeli refugees are being held in camps in the north of the country.  "" we\'re not going to let them go, "" said a senior official in the camp.  "" we\'re going to take them to the border. """',
 'period2_left': 'Israeli refugees are b

In [18]:
textgen_results("Syrian refugees are")

{'gpt': 'Syrian refugees are being sent to Syria to fight for their country.\n\nThe Syrian government has been accused of using the refugees as human shields, and has been accused of using them as human shields in the past.\n\nThe UN refugee agency!',
 'all_left': 'Syrian refugees are being held in camps in the middle east. the united states has been trying to get the refugees to return to their homes in the middle east. but the united states has not been able to do that.!mr-macne"',
 'period0_left': 'Syrian refugees are being held in camps in the middle east. the united states has been sending troops to the camps to help them. the united states has also been sending troops to the camps to help the refugees. the united states has been sending troops"',
 'period1_left': 'Syrian refugees are being held in camps in the north of the country.              , the refugees are being held in camps in the north of the country.      "',
 'period2_left': 'Syrian refugees are being held in camps in

In [19]:
textgen_results("Muslim refugees are")

{'gpt': 'Muslim refugees are being held in detention centres in Turkey, as well as in Iraq and Syria, according to the United Nations.\n\nThe UN refugee agency said on Tuesday that more than 1,000 people had been held in detention centres in Turkey!',
 'all_left': 'Muslim refugees are being held in camps in the middle east. the united states has been trying to get them to return to their homes. but the united states has not been able to do that.!mr-macneil : and the united"',
 'period0_left': 'Muslim refugees are being held in camps in the middle east. the united states has been sending troops to the camps, but the refugees are being held in camps in the middle east. the united states has been sending troops to the camps, but the"',
 'period1_left': 'Muslim refugees are being held in camps in the north of the country.  "" we\'re not going to let them go, "" said a refugee who asked not to be identified. "" we\'re going to take them to the camps."',
 'period2_left': 'Muslim refugees are