## BERT Sentiment Analysis

In [12]:
import pandas as pd
import spacy
from transformers import pipeline

In [13]:
# Load the corpus for analysis; later cells read its 'text' and 'year' columns.
data = pd.read_csv("data/refugee_coca_foranalysis.csv")

In [14]:
# Load the English spaCy pipeline: try the "en" shortcut name first and fall
# back to the full "en_core_web_sm" package name if that raises OSError.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

def sent_tokenize(word_list, model=nlp):
    """Split a text into sentences using a spaCy pipeline.

    Args:
        word_list: The text to segment; it is passed directly to ``model``.
        model: A callable returning an object with a ``sents`` iterable
            (defaults to the module-level ``nlp`` pipeline).

    Returns:
        A list with the whitespace-stripped text of each detected sentence.
    """
    doc = model(word_list)
    # Use Span.text rather than Span.string: the latter was removed in
    # spaCy v3, while .text is available in both v2 and v3. After .strip()
    # the two yield the same sentence text.
    return [sent.text.strip() for sent in doc.sents]

In [None]:
# Sentence-split every document; each row of 'sentences' holds a list of strings.
data['sentences'] = data['text'].apply(sent_tokenize)

In [None]:
#5-year periods
def get_period(year, startyr, endyr, n=5):
    """Return the zero-based index of the n-year period that contains *year*.

    Periods start at ``startyr``, ``startyr + n``, ... up to ``endyr``.

    Args:
        year: The year to classify.
        startyr: First year of the first period.
        endyr: Last year covered; no period starts after it.
        n: Length of each period in years.

    Returns:
        The index of the last period whose start is <= ``year``. Years after
        ``endyr`` fall into the final period. Years before ``startyr`` (or
        values that compare false against every start, such as NaN) map to
        period 0 instead of raising ``UnboundLocalError``.

    Raises:
        ValueError: If the arguments describe no period at all
            (e.g. ``startyr > endyr``).
    """
    period_starts = range(startyr, endyr + 1, n)
    if not period_starts:
        raise ValueError(
            "no periods between {} and {} with step {}".format(startyr, endyr, n)
        )

    # Default to the first period so the result is always defined.
    period = 0
    for index, start in enumerate(period_starts):
        if year >= start:
            period = index
        else:
            # Starts are ascending, so no later period can match either.
            break
    return period
    
# Assign each row its 5-year period index (periods start at 1991, 1996, ...,
# 2011); years before 1991 are folded into period 0.
data['period'] = data['year'].apply(lambda x: get_period(x, 1991, 2015, n=5) if x>=1991 else 0)

In [None]:
# Preview the first rows to check the new 'sentences' and 'period' columns.
data.head()

In [None]:
# Allocate a pipeline for sentiment-analysis.
# No model name is given, so the library's default sentiment model is used;
# the aggregation cell below relies on its results carrying a 'label'
# (compared against 'NEGATIVE') and a 'score'.
nlp_sentiment = pipeline('sentiment-analysis')

In [None]:
# Average signed sentiment of the sentences mentioning "refugee", per period.
# Scores are positive for non-NEGATIVE labels and negated for NEGATIVE ones.
periods = data.period.unique()

sentiment_over_period = {}
sentiment_scores = []  # signed scores of every matching sentence, all periods
for p in periods:
    data_period = data[data['period'] == p]

    # Collect this period's scores separately so that each average reflects
    # only its own period rather than everything processed so far.
    period_scores = []

    # Walk the per-document sentence lists directly instead of concatenating
    # them with Series.sum(), which re-copies the growing list at every row.
    for doc_sentences in data_period['sentences']:
        for sent in doc_sentences:
            # NOTE(review): case-sensitive substring match, so "Refugees"
            # at the start of a sentence is skipped unless the corpus text
            # is already lower-cased -- confirm this is intended.
            if 'refugee' not in sent:
                continue

            # The pipeline returns a list with one result dict per input
            # text; take the single result for this sentence.
            sentiment = nlp_sentiment(sent)[0]
            score = sentiment['score']
            if sentiment['label'] == 'NEGATIVE':
                score = -score

            period_scores.append(score)

    sentiment_scores.extend(period_scores)

    # A period without any matching sentence has no defined average; record
    # NaN rather than dividing by zero.
    if period_scores:
        sentiment_over_period[p] = sum(period_scores) / len(period_scores)
    else:
        sentiment_over_period[p] = float('nan')

In [None]:
# Display the mapping of period index -> average signed sentiment score.
sentiment_over_period

## GPT-2 Text Generation

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer

In [None]:
def _generate_text(model_path, sequence, max_length=50):
    """Load the tokenizer and LM at *model_path* and return its continuation of *sequence*."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelWithLMHead.from_pretrained(model_path)

    input_ids = tokenizer.encode(sequence, return_tensors="pt")
    generated = model.generate(input_ids, max_length=max_length)
    # generate() returns a batch; the single prompt's output is the first row.
    return tokenizer.decode(generated.tolist()[0])


def textgen_results(sequence, *, periods=range(5), ideologies=('left', 'right'),
                    max_length=50):
    """Generate continuations of *sequence* with the base and fine-tuned models.

    The prompt is completed once with the stock "gpt2" model and once with
    each model stored under
    ``bertresults/output_gpt_period{period}_{ideology}``.

    Args:
        sequence: Prompt text to continue.
        periods: Period indices whose fine-tuned models are loaded.
        ideologies: Ideology labels whose fine-tuned models are loaded.
        max_length: ``max_length`` passed to ``generate`` for every model.

    Returns:
        A dict mapping ``'gpt'`` and ``'period{period}_{ideology}'`` keys to
        the decoded generated text of the corresponding model.
    """
    refugees_textgen = {'gpt': _generate_text("gpt2", sequence, max_length)}

    for ideology in ideologies:
        for period in periods:
            model_path = "bertresults/output_gpt_period{}_{}".format(period, ideology)
            key = 'period{}_{}'.format(period, ideology)
            refugees_textgen[key] = _generate_text(model_path, sequence, max_length)
    return refugees_textgen

In [None]:
# Compare how each model continues the same prompt, then display the results.
refugees_textgen = textgen_results("Refugees are")
refugees_textgen