<h1 align="center"> Sentiment Analysis </h1> 


In [1]:
# load the small embedding file
import gensim
small_model = gensim.models.KeyedVectors.load_word2vec_format('/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)

In [2]:
#general pipeline + embedd



import codecs, nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string, you convert text in a doc-embedding
def text_embedding(text):
    
    #it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step
    text = text.lower()
    
    text = nltk.word_tokenize(text)
  
    # remove numbers
    text = [token for token in text if token not in exclude and token.isalpha()]

    # remove stopwords (not essential)
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    # you take all embeddings
    for word in text:
            try:
                embed_word = small_model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue
    
    # then you average them
    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    
    return avg

In [None]:
#if we want to take a look using pandas - just for visualization
import pandas as pd
sentiment = pd.read_csv ("/Users/Ashrakat/Downloads/yelp-test.tsv", sep="\t",header=None)
sentiment=sentiment[1:]
sentiment.head()

In [None]:
sentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv', index= False,sep="\t")

In [None]:
# open YELP product reviews dataset
# we are using only the "small" test-set, you can also train on the large training set if you'd like
import codecs

sentiment_dataset = codecs.open("/Users/Ashrakat/Downloads/yelp-test.tsv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[0])
print (" ")
print (sentiment_dataset[1])

In [None]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[1:1000]:
    #its a tab seperated file
    text = line.split("\t")[1].replace('"','')# remove the  - replace with nothing
    label = line.split("\t")[0].replace('"','').replace("1","-1").replace("2","1") #change values
    
    corpus.append(text)
    labels.append(label)

In [None]:
corpus

In [None]:
labels

# Sentiment Analysis using Word list based approaches

"One of the simplest sentiment analysis approaches:
- compares the words of a text against a labeled word list
- where each word has been scored for valence, — **a “sentiment lexicon”** "

Check Paper by Finn ˚Arup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf

In [None]:
# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn
#https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt

#!pip install afinn

from afinn import Afinn

afinn = Afinn()

print (afinn.score("This is bad fake news"))

print (afinn.score("The sun is shining, what a beautiful day"))

print (afinn.score("That movie is horrible and beautiful at the same time"))

In [None]:
pred = []

#for sentence in the corpus predict its scores
for review in corpus:
    score = afinn.score(review)
    
    #I only want two labels for each of my sentences
    #if the score is below 0 give me a value -1, and if over 0 give me the value +1
    if score < 0.0:
        pred.append("-1")
    else:
        pred.append("1")

In [None]:
pred

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
print (precision_recall_fscore_support(labels, pred, average="macro"))

# Sentiment Analysis as a Classification Problem



In [None]:
import codecs

sentiment_dataset = codecs.open("/Users/Ashrakat/Downloads/yelp-test.tsv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[0])
print (" ")
print (sentiment_dataset[1])

In [None]:
corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[1:1000]:
    
    #its a tab seperated file
    text = line.split("\t")[1].replace('"','')# remove the  - replace with nothing
    label = line.split("\t")[0].replace('"','').replace("1","-1").replace("2","1") #change values
    
    # as usual, we use text-embeddings
    text = text_embedding(text)
    
    if len(text)>0:
        corpus.append(text)
        labels.append(label)
print ("ready!")


In [None]:
import numpy as np

X = np.array(corpus)
y = np.array(labels)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [None]:
from sklearn.linear_model import LogisticRegression
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)
y_pred_log_g = reg_log.predict(X_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

reg_nb = GaussianNB() 
reg_nb.fit(X_train, y_train)
y_pred_nb_g = reg_nb.predict(X_test)

In [None]:
from sklearn.svm import SVC

reg_svc = SVC()
reg_svc.fit(X_train, y_train)
y_pred_svm_g = reg_svc.predict(X_test)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
reg_knn = KNeighborsClassifier(n_neighbors = 5)
reg_knn.fit(X_train, y_train)
y_pred_kc_g = reg_knn.predict(X_test)

In [None]:
from sklearn import metrics
print("Logistic Regression")
print(metrics.classification_report(y_test, y_pred_log_g))
print("SVM")
print(metrics.classification_report(y_test, y_pred_svm_g))
print("Naive Bayes")
print(metrics.classification_report(y_test, y_pred_nb_g))
print("K-Nearest Neighbor")
print(metrics.classification_report(y_test, y_pred_kc_g))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred_svm_g, )
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',cmap="Accent")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

## KEEP only nouns

In [None]:
#pipeline + embedd
import codecs, nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string, you convert text in a doc-embedding
def text_embedding(text):
    
    #it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step
    text = text.lower()
    
    text = nltk.word_tokenize(text)
    
    text = nltk.pos_tag(text)
    
    text=[word for word,pos in text if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    
    # remove numbers
    text = [token for token in text if token not in exclude and token.isalpha()]

    # remove stopwords (not essential)
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    # you take all embeddings
    for word in text:
            try:
                embed_word = small_model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue
    
    # then you average them
    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    
    return avg

In [None]:
corpus22 = []
labels22 = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[1:1000]:
    #its a tab seperated file
    text = line.split("\t")[1].replace('"','')# remove the  - replace with nothing
    label = line.split("\t")[0].replace('"','').replace("1","-1").replace("2","1") #change values
    # as usual, we use text-embeddings
    text = text_embedding(text)
    
    if len(text)>0:
        corpus22.append(text)
        labels22.append(label)
print ("ready!")


In [None]:
import numpy as np

X1 = np.array(corpus22)
y1 = np.array(labels22)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.2, random_state = 20)

In [None]:
#Models

from sklearn.linear_model import LogisticRegression
reg_log_n = LogisticRegression()
reg_log_n.fit(X_train, y_train)
y_pred_log_n = reg_log_n.predict(X_test)

from sklearn.naive_bayes import GaussianNB
reg_nb_n = GaussianNB() 
reg_nb_n.fit(X_train, y_train)
y_pred_nb_n = reg_nb_n.predict(X_test)


from sklearn.svm import SVC
reg_svc_n = SVC()
reg_svc_n.fit(X_train, y_train)
y_pred_svm_n = reg_svc_n.predict(X_test)

from sklearn.neighbors import KNeighborsClassifier
reg_knn_n = KNeighborsClassifier(n_neighbors = 5)
reg_knn_n.fit(X_train, y_train)
y_pred_kc_n = reg_knn_n.predict(X_test)


In [None]:
from sklearn import metrics
print("Logistic Regression")
print(metrics.classification_report(y_test, y_pred_log_n))
print("SVM")
print(metrics.classification_report(y_test, y_pred_svm_n))
print("Naive Bayes")
print(metrics.classification_report(y_test, y_pred_nb_n))
print("K-Nearest Neighbor")
print(metrics.classification_report(y_test, y_pred_kc_n))


# Excercises

**Excercise 1**

- Use the Yelp review dataset, its huge, you can work with a part of it (ex. 1000 articles)
- Check our example when we analyzed the sentiment using word-based approches employing Afinn.
- Instead of taking the entire text, now you will process it (e.g., lemmat, pos, keep stopwords, etc) and see if you can improve the performance of the analysis
- Now try assigning neutral sentences as negative and see if this will improve your results

Extra:

- maybe try to see if different pipelines will improve your results


In [None]:

from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()


# input should be a string
def nlp_pipeline(text):
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token.lower(),"v")if "V" in pos else wordnet_lemmatizer.lemmatize(token.lower()) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = " ".join([token for token in text if token not in stop_word_list])

    return text

In [None]:
import codecs

sentiment_dataset = codecs.open("/Users/Ashrakat/Downloads/yelp-test.tsv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[0])
print (" ")
print (sentiment_dataset[1])

In [None]:
corpus1 = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[1:1000]:
    text = line.split("\t")[1].replace('"','')
    label = line.split("\t")[0].replace('"','').replace("1","-1").replace("2","1")
    text = nlp_pipeline(text)
    
    corpus1.append(text)
    labels.append(label)

In [None]:
corpus1

In [None]:
pred1 = []

for review in corpus1:
    score = afinn.score(review)
    
    if score < 0.0:
        pred1.append("-1")
    else:
        pred1.append("1")

In [None]:
print (precision_recall_fscore_support(labels, pred, average="macro"))
print (precision_recall_fscore_support(labels, pred1, average="macro"))

In [None]:
pred2 = []

for review in corpus1:
    score = afinn.score(review)
    #assign neutral to negative!
    if score < 0.1:
        pred2.append("-1")
    else:
        pred2.append("1")

In [None]:
print (precision_recall_fscore_support(labels, pred, average="macro"))
print (precision_recall_fscore_support(labels, pred1, average="macro"))
print (precision_recall_fscore_support(labels, pred2, average="macro"))

** Excercise 2 - Dictionary based Sentiment Analysis with German Data**

1. load data (sputnikgerman20.tsv)
2. keep pnly date,title,content {2,3,4}
3. check if there are any missing values and remove them
4. create new column that includes title and content

Now you have dictionaries of positive and negative words for the sentiment analysis (SentiWS_v1.8c_Negative.txt and SentiWS_v1.8c_Positive.txt)

5. please use your dictionaries to create 2 columns that count the number of positive words and the second one counts the number of positive words

6. create a last column that will mark if the sentiment overall was positive or negative


You can use Pandas or anything else you prefer


In [None]:
sputnik = pd.read_csv("/Users/Ashrakat/Desktop/sputnikgerman20.tsv",header=None, encoding='utf-8',delimiter='\t',error_bad_lines=False)
sputnik=sputnik[[2,3,4]]
sputnik

In [None]:
sputnik2 = sputnik[sputnik.isna().any(axis=1)]
sputnik2

In [None]:
#create a content column that includes titke and content
sputnik.columns = ['date', 'title',"content1"]
sputnik["content"] = sputnik["title"] +[" "]+ sputnik["content1"] 
sputnik.head()

In [7]:

exclude = set(string.punctuation)
stop_word_list = stopwords.words('german')

# input should be a string
def nlp_pipeline(text):
    
    text = nltk.word_tokenize(text) #tokenization 
     #word_lower
    text = [word.lower() for word in text]
    # remove punctuation and numbers
    text = [token for token in text if token not in exclude and token.isalpha()] 
    # remove stopwords - be careful with this step
    text = [token for token in text if token not in stop_word_list]
    # lemmatizer - remove
    #text=[wordnet_lemmatizer.lemmatize(word) for word in text]
    
    # the output is text
    return text



In [None]:
sputnik["nlpprocessed"]=sputnik['content'].apply(nlp_pipeline)
sputnik.head()

In [None]:
sputnik['liststring'] = [','.join(map(str, l)) for l in sputnik['nlpprocessed']]
sputnik.head()

In [None]:
#load neg and positive dictionaries
negative = pd.read_csv("/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/SentiWS_v1.8c_Negative.txt", encoding='utf-8', delimiter='\t',header=None)
positive=pd.read_csv("/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/SentiWS_v1.8c_Positive.txt", encoding='utf-8', delimiter='\t',header=None)

negative.columns = ['main', 'value',"words"]
positive.columns = ['main', 'value',"words"]

positive

In [None]:
#split
negative1=(negative.main.apply(lambda x: pd.Series(str(x).split("|",2)))) 
positive1=(positive.main.apply(lambda x: pd.Series(str(x).split("|",2)))) 

negative1.columns = ['main', 'notneeded']
positive1.columns = ['main', 'notneeded']
positive1



In [None]:

del positive1['notneeded']
del negative1['notneeded']
negative1

In [None]:

#text preprocessing
positive1["nlpprocessed"]=positive1['main'].apply(nlp_pipeline)
negative1["nlpprocessed"]=negative1['main'].apply(nlp_pipeline)
negative1

positive1['nlpprocessed']=positive1['nlpprocessed'].apply(str)
positive1['nlpprocessed'] = positive1['nlpprocessed'].str[2:-2]

positive1

negative1['nlpprocessed']=negative1['nlpprocessed'].apply(str)
negative1['nlpprocessed'] = negative1['nlpprocessed'].str[2:-2]
negative1

positive1['nlpprocessed'][0] = "abmachung"

del positive1['main']
del negative1['main']

In [None]:



#convert pandas to list
positive_list= positive1.values.tolist()
negative_list= negative1.values.tolist()
positive_list

negative_list

#flatten
from functools import reduce #python 3

positive_list1=reduce(lambda x,y: x+y,positive_list)
negative_list1=reduce(lambda x,y: x+y,negative_list)

negative_list1
#positive_list1 = lambda l: [item for positive_list in l for item in positive_list]
#print(positive_list1)

In [None]:
sputnik['positive words'] = sputnik['liststring'].str.count('|'.join(positive_list1))
sputnik['negative words'] = sputnik['liststring'].str.count('|'.join(negative_list1))


conditions = [
(sputnik['positive words'] > sputnik['negative words']),
(sputnik['negative words'] > sputnik['positive words']),
(sputnik['negative words'] == sputnik['positive words'])
]

choices = [
'positive',
'negative',
'neutral'
]

sputnik['overall'] = np.select(conditions, choices, default = '')

sputnik.head()


** Excercise 3 - Using Cosine Similarity to check sentiment**


1. create a small dictionary of words that are related to good descriiption of refugees - positive
2. create a small dictionary of words that are related to bad description of refugees - negative
3. load rt_data
4. choose a random sample of one or more articles
5. Check the cosine similarity of this one or more articles with the good and the bad dictionary
6. Is this a good measure to assess the sentiment of an article? Can you think of applications of using this?

In [3]:
import codecs, nltk, string
import spacy
from nltk.corpus import stopwords
nlp = spacy.load('de_core_news_sm')


exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string - we need a simple pipeline for getting word embeddings
def nlp_simple_pipeline(text):
    
    #it depends if the words have been lowercased or not
    text = text.lower()
    
    text = nltk.word_tokenize(text)
        
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    text = [token for token in text if token not in stop_word_list]

    return text

import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


exclude = set(string.punctuation)

# this represent any text as a single "doc-embedding" we use it both for the query and the sentences
# input should be a string
def text_embedding(text):
    
    #you should check in the embeddings you use if the words have been lowercased or not. 
    #try ask the embedding for "barack" and for "Barack"
    # if the Barack works, then comment the following line
    text = text.lower()
    
    # we tokenize the text in single words
    text = nltk.tokenize.WordPunctTokenizer().tokenize(text)
    
    # we remove numbers and punctuation
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    
    doc_embed = []
    
    # for each word we get the embedding and we append it to a list
    for word in text:
            try:
                embed_word = small_model[word]
                doc_embed.append(embed_word)
            except KeyError:
                continue
    # we average the embeddings of all the words, getting an overall doc embedding
    if len(doc_embed)>0:
        avg = [float(sum(col))/len(col) for col in zip(*doc_embed)]

        avg = np.array(avg).reshape(1, -1)

        # the output is a doc-embedding
        return avg
    else:
        return "Empty"

In [4]:
import codecs
rt = codecs.open("/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/rt_dataset.tsv","r","utf-8").read().strip().split("\n")

In [8]:
good=["good","helpful","productive","devine","delightful","approve","accomplished","courageous","independent",\
     "innovative"]
bad=["bad","criminal","damage","harmful","annoy","angry"]

query_good = [" ".join(nlp_pipeline(" ".join(good)))]
emb_query_good = text_embedding(" ".join(query_good))

In [9]:
query_bad = [" ".join(nlp_pipeline(" ".join(bad)))]
emb_query_bad = text_embedding(" ".join(query_bad))

In [10]:
refugee_articles=[k for k in rt if 'refugee' in k]
refugee_article=refugee_articles[0]
refugee_article

'30 Jan, 2016 21:02 \tGermany accused of ‘paying’ African countries to take back ‘foreign’ asylum seekers\tnews\tBerlin has been adding specific “Readmission Agreements” to development aid accords with African countries in order to deport asylum seekers to these countries – regardless of their actual country of origin, “Pro Azyl” (For Asylum), a German refugee rights organization, alleges. “These agreements commit the African countries to readmitting their own citizens who have had asylum turned down by Germany, but it also allows them to readmit rejected asylum seekers from other countries, who travelled through these transit states,” Max Pichl, a member of Pro Azyl, told The Local. 40% of Germans want Merkel to resign over #refugees – poll https://t.co/fqin2O1fjwpic.twitter.com/6i4jj5kRji According to the relief organization’s data, if German authorities are unable to deport a rejected asylum seeker because they lack sufficient information and documents to determine or confirm their 

In [11]:
embs_corpus = text_embedding(refugee_article)


In [12]:
embs_corpus

array([[ 0.37329175,  0.05515771,  0.0271666 , -0.13462886,  0.23954936,
         0.15727955, -0.3521518 , -0.02903198, -0.05244827, -0.15272749,
         0.14220338, -0.08416153, -0.17582534, -0.11561452,  0.4303607 ,
         0.08351724,  0.03738566, -0.20569329, -0.13680462, -0.18129114,
         0.15322843,  0.07453755,  0.11037618,  0.03810774, -0.04981496,
        -1.52829004, -0.1232073 , -0.14362997, -0.0472891 , -0.01746286,
         2.98388645,  0.10590136, -0.34872489, -0.37058112, -0.05044259,
        -0.07058127,  0.05080798, -0.02663002, -0.14169129, -0.06504141,
        -0.11223021,  0.07423452,  0.26998135,  0.0368143 , -0.03500096,
        -0.06604766, -0.31665879,  0.18765902, -0.0174849 , -0.26387939]])

In [13]:
cosgood=cosine_similarity(embs_corpus, emb_query_good)
cosbad=cosine_similarity(embs_corpus, emb_query_bad)

In [14]:
if cosgood > cosbad:
    print("good")
else:
    print("bad")

bad


### with more articles

In [15]:
import csv

tsv_file = open("/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/rt_dataset.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")
all_lines=[]
for line in read_tsv:
    print(line)
    all_lines.append(line)
tsv_file.close()

['date ', 'title ', 'topic ', 'content']
['16 Sep, 2016 14:08 ', 'Putin: We don’t approve of WADA hackers, but information they leaked raises questions', 'news', 'We don’t approve of what hackers do, but what they’ve done is definitely of interest to the international community, especially the sports community,” the Russian president said, as cited by RIA Novosti. “It raises a lot of questions. It turns out that healthy athletes legally take medications that are prohibited for others, and the people who obviously suffer from serious illnesses and severe disabilities are being banned from the Paralympics only on suspicion of using some kind of drugs,” he added. Earlier this week, hacktivists from the cyber group ‘Fancy Bears’ released files revealing that top world athletes had received the green light from WADA to take banned substances. Prominent US sports stars - including tennis players Serena and Venus Williams, multiple Olympic gymnastics champion Simone Biles, and basketball play

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


['16 Aug, 2016 17:49 ', 'Muslim women call out Western feminists for silence over ‘misogynist’ burkini ban', 'viral', 'Following bans by the French Riviera towns of Cannes and Villeneuve-Loubet citing “hygienic reasons” and linking it to terrorism, Sisco’s mayor enacted the restriction after a major brawl this past weekend in his village over the controversial swimsuit. @pparaman I don\'t think they for a minute see the irony. The #burkini ban highlights both deeply rooted racism within french elites & their collective irresponsibility in further igniting conflicts I am  JUST AS disgusted by France banning the burkini as I am by countries that make wearing it an obligation on women. READ MORE: Burqinis banned by one more French resort, mayor cites ‘hygienic reasons’ The debate has gripped France with Islamophobes, socialists, and feminists, among others, seemingly placed on the same side on the issue. “Since when did wearing a burkini, in most cases a loose fitting nylon version of a w

In [16]:
all_lines=all_lines[1:20]

In [17]:
#keywords = ['refugee', 'Refugees']
#l = map(lambda sub_list: list(filter(lambda element: any(map(lambda keyword: keyword in element, keywords)), sub_list)), all_lines)
#l

In [18]:
#refugee_article=list(l)
#refugee_article = [x for x in refugee_article if x != []]
#refugee_article

In [19]:
embs_corpus = [x+[text_embedding(x[3])] for x in all_lines] #what is x

In [20]:
embs_corpus[1]

['11 Sep, 2016 22:33 ',
 "Hillary Clinton diagnosed with pneumonia, cancels California campaign trip, 'Ellen' appearance",
 'usa',
 ' Dr. Lisa Bardack, Clinton’s personal doctor since 2001, released a statement through the Clinton campaign which said the former secretary of state had been diagnosed with pneumonia during a follow-up examination regarding her prolonged cough. Dr. Lisa R. Bardack, M.D., Clinton\'s doctor, says the Democratic nominee has pneumonia. Full statement: pic.twitter.com/qloLbhjdZy Clinton has been “advised to rest and modify her schedule,” and was put on antibiotics on Friday, Bardack said. “She is now re-hydrated and recovering nicely,” said the doctor, referring to the earlier explanation of “overheating and dehydration” given for Clinton’s abrupt departure from the 9/11 commemoration event. The media expressed suspicion over Clinton’s exit from the event after there were accounts that she had stumbled and had to be helped into her car. Video footage later emer

In [21]:


#def article_embedding(cleaned_article):
    
#    article_embedd = []
#    # for each word in the article, you take the embeddings
#    for word in cleaned_article:
#        try:
#            embed_word = small_model[word]
#            article_embedd.append(embed_word)
#        except KeyError:
#            continue
    
#    # average vectors of all words
#    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
#    avg = np.array(avg).reshape(1, -1)
#    return avg



#query = [" ".join(nlp_pipeline(" ".join(text_query)))]

#emb_query = nlp_simple_pipeline(" ".join(text_query))
#emb_query = article_embedding(" ".join(text_query))

#emb_query

In [22]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

scores_pos = [x + [cosine_similarity(x[4], emb_query_good)[0]] for x in embs_corpus]

In [23]:
scores_neg = [x + [cosine_similarity(x[4], emb_query_bad)[0]] for x in scores_pos]
scores_neg

[['16 Sep, 2016 14:08 ',
  'Putin: We don’t approve of WADA hackers, but information they leaked raises questions',
  'news',
  'We don’t approve of what hackers do, but what they’ve done is definitely of interest to the international community, especially the sports community,” the Russian president said, as cited by RIA Novosti. “It raises a lot of questions. It turns out that healthy athletes legally take medications that are prohibited for others, and the people who obviously suffer from serious illnesses and severe disabilities are being banned from the Paralympics only on suspicion of using some kind of drugs,” he added. Earlier this week, hacktivists from the cyber group ‘Fancy Bears’ released files revealing that top world athletes had received the green light from WADA to take banned substances. Prominent US sports stars - including tennis players Serena and Venus Williams, multiple Olympic gymnastics champion Simone Biles, and basketball player Elena Delle Donne - were mentio

In [24]:
for l in scores_neg:
    if l[5] > l[6] :
        print("GOOD",l[3])
        print(" ")
    else:
        print("BAD",l[3])
        print(" ")


GOOD We don’t approve of what hackers do, but what they’ve done is definitely of interest to the international community, especially the sports community,” the Russian president said, as cited by RIA Novosti. “It raises a lot of questions. It turns out that healthy athletes legally take medications that are prohibited for others, and the people who obviously suffer from serious illnesses and severe disabilities are being banned from the Paralympics only on suspicion of using some kind of drugs,” he added. Earlier this week, hacktivists from the cyber group ‘Fancy Bears’ released files revealing that top world athletes had received the green light from WADA to take banned substances. Prominent US sports stars - including tennis players Serena and Venus Williams, multiple Olympic gymnastics champion Simone Biles, and basketball player Elena Delle Donne - were mentioned among those who had received exemptions from WADA. The group later expanded its list of those who had been allowed to ta

In [25]:
new = []
for l in scores_neg:
    if l[5] > l[6] :
        new.append([l[3]+"GOOD"])
    else:
        new.append([l[3]+"BAD"])

In [26]:
new

[['We don’t approve of what hackers do, but what they’ve done is definitely of interest to the international community, especially the sports community,” the Russian president said, as cited by RIA Novosti. “It raises a lot of questions. It turns out that healthy athletes legally take medications that are prohibited for others, and the people who obviously suffer from serious illnesses and severe disabilities are being banned from the Paralympics only on suspicion of using some kind of drugs,” he added. Earlier this week, hacktivists from the cyber group ‘Fancy Bears’ released files revealing that top world athletes had received the green light from WADA to take banned substances. Prominent US sports stars - including tennis players Serena and Venus Williams, multiple Olympic gymnastics champion Simone Biles, and basketball player Elena Delle Donne - were mentioned among those who had received exemptions from WADA. The group later expanded its list of those who had been allowed to take