In [139]:

import pandas as pd
import re

import numpy as np
import pandas as pd
from pprint import pprint
from collections import Counter

#nltk stopwords
from nltk.corpus import stopwords


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

#TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Read Posts

In [140]:
name = "LatinosForBiden"
terms = "biden|democrats|democrat"
candidate = pd.read_csv(name+".csv")
candidate.shape

(4971, 31)

In [141]:
candidate.columns

Index(['Group Name', 'User Name', 'Facebook Id', 'Likes at Posting', 'Created',
       'Type', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
       'Angry', 'Care', 'Video Share Status', 'Post Views', 'Total Views',
       'Total Views For All Crossposts', 'Video Length', 'URL', 'Message',
       'Link', 'Final Link', 'Image Text', 'Link Text', 'Description',
       'Sponsor Id', 'Sponsor Name', 'Total Interactions',
       'Total Interactions (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )'],
      dtype='object')

## Filter the most relevant columns

In [142]:
content = candidate[['Message','Description','Link','Group Name','Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
       'Angry', 'Care','Total Interactions']].copy()
content.shape

(4971, 14)

### Add column to concat Message and Description

In [143]:
concat_message = content['Message'].fillna('') + (' ' + content['Description']).fillna('') + (' ' + content['Link']).fillna('')
content['MessageDescr'] = concat_message

### Just keep those groups that have Trump or Republican in their name

In [144]:
groups_candidate = content['Group Name'].str.lower().reset_index()
final_groups = groups_candidate[groups_candidate['Group Name'].str.contains(terms)].copy()
final_groups = final_groups['Group Name'].unique()
final_groups = pd.DataFrame(final_groups)
final_groups.columns = ['Group Name']
final_groups.to_csv(name+"_groups.csv")
final_groups.shape

(191, 1)

In [145]:
content['Group Name'] = content['Group Name'].str.lower()
content.shape

(4971, 15)

### Keep the posts from the preselected groups

In [146]:
content = content[content['Group Name'].isin(final_groups['Group Name'])].copy()
content.shape

(640, 15)

## Tokenize words and Clean-up text

In [147]:
content['cleanMessage'] = content['MessageDescr'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
content['cleanMessage'].shape

(640,)

In [148]:
# Convert to list
data = content.cleanMessage.tolist()

In [149]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [150]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['ricky', 'martin', 'one', 'of', 'the', 'biggest', 'hispanic', 'pop', 'icon', 'openly', 'supporting', 'joe', 'biden', 'pleads', 'for', 'all', 'his', 'followers', 'to', 'support', 'joe', 'biden', 'for', 'presdident', 'el', 'artista', 'que', 'ha', 'expresado', 'en', 'mas', 'de', 'una', 'ocasion', 'su', 'contrariedad', 'la', 'politica', 'de', 'trump', 'participa', 'en', 'un', 'encuentro', 'del', 'candidato', 'democrata', 'con', 'los', 'latinos', 'luis', 'fonsi', 'eva', 'longoria', 'tambien', 'pidieron', 'su', 'voto', 'en', 'dicho', 'evento']]


#### Creating Bigram and Trigram Models

In [151]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['ricky_martin', 'one', 'of', 'the', 'biggest', 'hispanic', 'pop', 'icon', 'openly', 'supporting', 'joe', 'biden', 'pleads', 'for', 'all', 'his', 'followers', 'to', 'support', 'joe', 'biden', 'for', 'presdident', 'el', 'artista', 'que', 'ha', 'expresado', 'en', 'mas', 'de', 'una', 'ocasion', 'su', 'contrariedad', 'la', 'politica', 'de', 'trump', 'participa', 'en', 'un', 'encuentro', 'del', 'candidato', 'democrata', 'con', 'los', 'latinos', 'luis', 'fonsi', 'eva_longoria', 'tambien', 'pidieron', 'su', 'voto', 'en', 'dicho', 'evento']


In [152]:
stop_words = stopwords.words('english')
stop_words.extend(stopwords.words('spanish'))
stop_words.extend(['su','sus','al','de','en','el'])
stop_words.extend(['like','would','get','many','much'])

## Remove Stopwords, Make Bigrams and Lemmatize

In [153]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts,stop_words_lang):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words_lang] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

## Bag of Words

In [157]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words,stop_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
#!python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

[['ricky_martin', 'big', 'hispanic', 'pop', 'icon', 'openly', 'support', 'plead', 'follower', 'support', 'ocasion'], ['call', 'activism', 'need', 'sanity']]


In [158]:
def combine_words(word1,word2,dictionary):

    if word1 in dictionary and word2 in dictionary:
        concat_name = word1+" / "+word2
        dictionary[concat_name] = dictionary[word1] + dictionary[word2]
        dictionary.pop(word1, None)
        dictionary.pop(word2, None)

        print(dictionary[concat_name])
    return dictionary

In [159]:
flat_list_words = [item for sublist in data_lemmatized for item in sublist]
count_words = Counter(flat_list_words)

In [123]:
final_dict = combine_words('donald','trump',count_words)
final_dict = combine_words('joe','biden',count_words)
final_dict = combine_words('kamala','harris',count_words)
final_dict = combine_words('american','america',count_words)
final_dict = combine_words('estados','unidos',count_words)
final_dict = combine_words('voters','vote',count_words)
final_dict = combine_words('mexico','mexican',count_words)
final_dict = combine_words('casa','blanca',count_words)
final_dict = combine_words('venezuela','venezuelans',count_words)
final_dict = combine_words('latino','latinos',count_words)
final_dict = combine_words('american / america','americans',count_words)
final_dict = combine_words('voters / vote','voting',count_words)
final_dict = combine_words('puerto','rico',count_words)
final_dict = combine_words('communism','communist',count_words)
final_dict = combine_words('ee','uu',count_words)
final_dict = combine_words('united','states',count_words)

430
476
28


In [124]:
final_dict.most_common(1000)

[('trump', 824),
 ('latino / latinos', 476),
 ('vote', 475),
 ('joe / biden', 430),
 ('voter', 393),
 ('state', 272),
 ('people', 253),
 ('make', 234),
 ('election', 229),
 ('say', 224),
 ('go', 187),
 ('hispanic', 173),
 ('support', 169),
 ('campaign', 157),
 ('plan', 152),
 ('know', 147),
 ('democratic', 145),
 ('community', 129),
 ('help', 129),
 ('want', 125),
 ('win', 123),
 ('white', 116),
 ('take', 112),
 ('month', 107),
 ('need', 99),
 ('live', 99),
 ('see', 98),
 ('show', 97),
 ('time', 94),
 ('share', 94),
 ('come', 94),
 ('downplay', 94),
 ('also', 93),
 ('life', 93),
 ('group', 90),
 ('country', 88),
 ('lose', 88),
 ('year', 87),
 ('call', 85),
 ('former', 85),
 ('black', 83),
 ('debate', 83),
 ('care', 83),
 ('poll', 82),
 ('tell', 82),
 ('american', 81),
 ('virus', 80),
 ('accord', 79),
 ('well', 76),
 ('spend', 73),
 ('event', 73),
 ('family', 72),
 ('president', 72),
 ('point', 72),
 ('question', 71),
 ('first', 70),
 ('talk', 70),
 ('pandemic', 70),
 ('key', 69),
 ('ri

## TD-IDF

In [125]:
docs = ([' '.join(ele) for ele in data_words_nostops] )

In [126]:
cv=CountVectorizer() 
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(640, 6554)

In [127]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [128]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
biden,1.230581
latinos,1.246423
trump,1.736182
joe,1.782857
vote,2.092391
...,...
inconvenient,6.769882
incorporated,6.769882
incorruptible,6.769882
impunity,6.769882


In [129]:
# count matrix 
count_vector=cv.transform(docs) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [130]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df = df[(df != 0).all(1)]
df.sort_values(by=["tfidf"],ascending=False).to_csv(name+"_tfidf.csv")

In [131]:
df

Unnamed: 0,tfidf
artista,0.196292
biden,0.075908
biggest,0.141031
candidato,0.17016
contrariedad,0.196292
democrata,0.196292
dicho,0.196292
encuentro,0.196292
eva,0.17016
evento,0.174914


In [138]:
word = df.index[0]
content["lowercase"] = content['MessageDescr'].apply(lambda x: x.lower())
word_messages = content[content['lowercase'].str.contains(word)]

word_messages = word_messages['lowercase'].value_counts(ascending=False).rename_axis('unique_messages').reset_index(name='counts')
word_messages.to_csv(word +"_"+ name+"_messages.csv")
word_messages

Unnamed: 0,unique_messages,counts
0,ricky martin one of the biggest hispanic pop i...,2
1,for all latinos in miami who support biden... ...,1
2,https://www.youtube.com/watch?v=i2ynxayspew es...,1


### Repeated messages

In [93]:
content['Total Interactions']=content['Total Interactions'].astype(str).str.replace(',', '').astype(int)
results = content.groupby(['MessageDescr']).aggregate({'MessageDescr': 'count','Likes': 'sum','Comments': 'sum', 'Shares': 'sum', 'Love': 'sum', 'Wow': 'sum', 'Haha': 'sum', 'Sad': 'sum',
       'Angry': 'sum', 'Care': 'sum','Total Interactions':'sum'})
results.rename(columns={"MessageDescr": "counts"}, inplace=True)


results.to_csv("mostpopularLatinosFor"+name+".csv")

In [94]:
results.sort_values(by='Total Interactions',ascending=False)

Unnamed: 0_level_0,counts,Likes,Comments,Shares,Love,Wow,Haha,Sad,Angry,Care,Total Interactions
MessageDescr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Ricky Martin one of the biggest Hispanic Pop Icon openly supporting Joe Biden! Pleads for all his followers to support Joe Biden for Presdident! El artista, que ha expresado en más de una ocasión su contrariedad a la política de Trump, participa en un encuentro del candidato demócrata con los latinos. Luis Fonsi y Eva Longoria también pidieron su voto en dicho evento https://elpais.com/gente/2020-09-16/ricky-martin-entra-en-campana-electoral-apoyando-abiertamente-a-joe-biden.html",2,2039,172,273,732,1,2,0,1,28,3248
CALL TO ACTIVISM for much needed sanity! https://www.facebook.com/calltoactivism/photos/a.398732900513772/1321138241606562/?type=3,3,626,686,132,193,5,276,2,6,1,1927
"This is good news. Sixty percent of young Latinos between the ages of 18 and 34 plan to vote for Democratic presidential nominee Joe Biden in the upcoming general election, according to data from a national Telemundo-Buzzfeed News survey of Latino voters.The poll also found that 60 percent of young Latinos surveyed be... http://a.msn.com/01/en-us/BB18TPdF?ocid=sf",1,767,129,90,182,0,0,1,0,15,1184
"Breaking news across America!!! Trump constantly disrespects WOMAN. He keeps saying white suburban HOUSEWIVES are supporting him 😂😂 Truthfully—-White suburban woman are backing BIDEN by a whopping 70% in every swing state in America Breaking news As a result, Biden has gained significant ground among WHITE MEN & WOMAN in swing states... Biden is at 69-70% with White suburban woman Trumps at 33-35% with white suburban woman Biden is currently at 50 % with white MEN Trump is at ————48%. With White MEN The demographics that will push Biden over the edge are the demographics as follows: Trumps at 32 with Latinos Biden is at 69-70% Trumps at 28% with African Americans,, Biden is at 80% with black voters.. Women in US suburbs are rallying against Trump's reelection efforts despite his assertion that ""suburban housewives"" will vote for him. https://www.insider.com/suburban-women-housewives-voters-trump-challenging-election?amp",1,601,72,62,199,0,6,0,0,14,954
Latinos early voting for Biden/Harris. MN blue all the way! My neice Emily voting for the very first time. #proudtio https://www.facebook.com/photo.php?fbid=10157584115543513&set=gm.721678188422012&type=3,1,607,60,2,176,0,2,0,1,7,855
...,...,...,...,...,...,...,...,...,...,...,...
"How come when you look around you find :\n\n- Latinos For Trump\n- Black voices For Trump \n- Muslims For Trump\n- Conservatives For Trump\n- Democrats For Trump \n- Evangelists For Trump\n- Jewish For Trump\n- Law Enforcement Unions For Trump\n- Veterans For Trump\n- LGBQT For Trump \n- Women For Trump\n- SouthEast Asians For Trump\n- Middle Eastern For Trump \n- Immigrants For Trump\n- Students For Trump\n\nYou can search every group by name on social media . They are not paid or funded by Billionaires. They are regular American citizens .\n\nWhile when I searched for Biden I could hardly find any similar groups except the obvious:\n\n- Media \n- Billionaires like Bloomberg & George Soros \n- Hollywood \n- Some Athletes \n- Pharmaceuticals \n- Anti Trump individuals \n- Businesses with strong ties to #CHINA.\n\nWhy is that ? How come not a single person is actually for Biden , Every one voting for Biden is simply voting against Trump because We all know #Biden had 47 years in politics with ZERO accomplishments. \n\nThose who will vote for #Trump , they will because they really like his Actions .\n\nThose who will vote for #Biden , they will because they really hate Trump words .\n\nIt’s all about #Trump and Biden has NOTHING to do with it . \n\nTake The Regular People Side , those who are like you and me , Don’t Take the Institutions side who Only uses me and you for their own benefit .\n\nMake some research for your own , Turn off the TV and make your own decision because No One Will Pay Your Bills But You .\n\nThis #election2020 is not about Right or Left . Not about Democrats or Republicans. Not About Colors Or Religions . It’s about America and Our Kids Future.\n\nTrump is Fighting everyone for #America .\nBiden doesn’t even know what is he running for .\n\n#Trump2020 https://www.facebook.com/824565719/posts/10164413110190720/",1,0,0,0,0,0,0,0,0,0,0
"Yep... Biden's not a lock to win ...\nFrom the article:\nThere are at least five reasons Joe Biden’s consistent lead over Donald Trump does not guarantee him a lock on the White House.\nFirst, there are indications that Trump’s base of support — whites without college degrees — is more energized and committed to voting this year than key Democratic constituencies. And there is also evidence that polling does not reflect this.\nSecond, Latinos, who are key to the outcome in several crucial states — Arizona and Florida, for example — have shown less support for Biden than for past Democratic nominees. Many Hispanic voters seem resistant to any campaign that defines them broadly as “people of color.”\nThird, absentee voting is expected to be higher among Democrats than Republicans, subjecting their ballots to a greater risk of rejection, a fate more common to mailed-in votes than to in-person voting.\nFourth, the generic Democratic-Republican vote (“Would you be more willing to vote for a Republican or Democratic candidate for Congress?”) through early July favored Democrats by more than 10 points, but has since narrowed to 6 points.\nFifth, the debates will test Biden’s ability to withstand three 90-minute battles against an opponent known for brutal personal attacks. https://www.nytimes.com/2020/09/23/opinion/joe-biden-donald-trump-2020.html?smid=fb-share",1,0,0,0,0,0,0,0,0,0,0
Florida Lieutenant Gov. Jeanette Nunez said Joe Biden is falling behind in the polls among Florida Latinos because he does not stand for what the community values. https://www.foxnews.com/politics/florida-lt-gov-argues-biden-slacking-in-latino-votes-since-trump-has-delivered-in-community?cmpid=prn_newsstand,1,0,0,0,0,0,0,0,0,0,0
"HOW IS THIS ACCEPTABLE? DO YOU WANT TO RETURN TO THIS? WHO WANTS TO RELIVE HISTORY AND NOT BE ABLE TO REWRITE OR CORRECT IT WHERE NECESSARY? I AM SURE HER BABY WAS TAKEN AWAY FROM HER OR WAS KILLED. I didn't even add a photo of the Jewish ordeals, it was too much. FOR THE LATINOS, BLACKS, NATIVE AMERICANS, AND ALL OTHER RACES. PLEASE REGISTER AND GET OUT AND VOTE. If we do not VOTE, just prepare yourself for this. BIDEN 2020. BRING YOUR LOVE ONES AND PETS OUT OF THE HEAT. GIVE THEM SHADE AND WATER. BUT DO YOU WANT TO GO BACK TO THIS?? https://www.facebook.com/photo.php?fbid=363075651728021&set=gm.2889266281312785&type=3",1,0,0,0,0,0,0,0,0,0,0
