In [1]:
import pandas as pd
import re

import numpy as np
import pandas as pd
from pprint import pprint
from collections import Counter

#nltk stopwords
from nltk.corpus import stopwords


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

#TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Read Posts

In [2]:
# Dataset identifier: base name of the input CSV and part of every output file name.
name = "LatinosForBiden"
# Regex alternation matched against lower-cased group names to pick the relevant groups.
terms = "biden|democrats|democrat"
candidate = pd.read_csv(f"{name}.csv")
candidate.shape

(4971, 31)

In [3]:
# Inspect the available columns before choosing which ones to keep.
candidate.columns

Index(['Group Name', 'User Name', 'Facebook Id', 'Likes at Posting', 'Created',
       'Type', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
       'Angry', 'Care', 'Video Share Status', 'Post Views', 'Total Views',
       'Total Views For All Crossposts', 'Video Length', 'URL', 'Message',
       'Link', 'Final Link', 'Image Text', 'Link Text', 'Description',
       'Sponsor Id', 'Sponsor Name', 'Total Interactions',
       'Total Interactions (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )'],
      dtype='object')

## Filter the most relevant columns

In [4]:
# Columns kept for the analysis: post text, link, group name and engagement counters.
relevant_columns = [
    'Message', 'Description', 'Link', 'Group Name',
    'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care',
    'Total Interactions',
]
# Work on an independent copy so later column assignments do not touch `candidate`.
content = candidate.loc[:, relevant_columns].copy()
content.shape

(4971, 14)

### Add column to concat Message and Description

In [5]:
# Missing messages become empty strings; a missing description contributes nothing
# (the leading space is only added when a description exists).
message_text = content['Message'].fillna('')
description_text = (' ' + content['Description']).fillna('')
concat_message = message_text + description_text
content['MessageDescr'] = concat_message

### Just keep those groups that have Biden or Democrat(s) in their name

In [6]:
# Lower-case the group names so the match against `terms` is case-insensitive.
groups_candidate = content['Group Name'].str.lower().reset_index()
# `terms` is a regex alternation. na=False makes posts without a group name count as
# "no match"; without it the mask contains NaN and boolean indexing raises.
matches_terms = groups_candidate['Group Name'].str.contains(terms, na=False)
# One row per distinct matching group name.
final_groups = pd.DataFrame({'Group Name': groups_candidate.loc[matches_terms, 'Group Name'].unique()})
final_groups.to_csv(name+"_groups.csv")
final_groups.shape

(191, 1)

In [7]:
# Normalise group names to lower case so they compare equal to the names in final_groups.
lowercase_group_names = content['Group Name'].str.lower()
content['Group Name'] = lowercase_group_names
content.shape

(4971, 15)

### Keep the posts from the preselected groups

In [8]:
# Boolean mask: True for posts whose group is one of the preselected groups.
in_selected_group = content['Group Name'].isin(final_groups['Group Name'])
content = content.loc[in_selected_group].copy()
content.shape

(640, 15)

## Tokenize words and Clean-up text

In [9]:
# Keep only the text that precedes the first https link. The pattern is a raw string:
# the original '\/' was an invalid escape sequence in a normal string literal.
# Note that only "https://" links are matched, and everything after the first link is dropped.
url_tail = re.compile(r'https://.*')
content['cleanMessage'] = content['MessageDescr'].apply(lambda x: url_tail.split(str(x))[0])
content['cleanMessage'].shape

(640,)

In [10]:
# Plain Python list of cleaned messages, one string per post.
data = content['cleanMessage'].tolist()

In [11]:
# Collapse every run of whitespace (new lines included) into a single space.
# Raw string: '\s' is an invalid escape sequence in a normal string literal.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes (plain substring replacement, no regex needed).
data = [sent.replace("'", "") for sent in data]

In [12]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of tokens.

    Every item of ``sentences`` is coerced with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``, which strips accent
    marks from the tokens. Yields one token list per input sentence, in order.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
    yield from tokenized

# Materialize the generator so the token lists can be indexed and reused by later cells.
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first tokenized document.
print(data_words[:1])

[['ricky', 'martin', 'one', 'of', 'the', 'biggest', 'hispanic', 'pop', 'icon', 'openly', 'supporting', 'joe', 'biden', 'pleads', 'for', 'all', 'his', 'followers', 'to', 'support', 'joe', 'biden', 'for', 'presdident', 'el', 'artista', 'que', 'ha', 'expresado', 'en', 'mas', 'de', 'una', 'ocasion', 'su', 'contrariedad', 'la', 'politica', 'de', 'trump', 'participa', 'en', 'un', 'encuentro', 'del', 'candidato', 'democrata', 'con', 'los', 'latinos', 'luis', 'fonsi', 'eva', 'longoria', 'tambien', 'pidieron', 'su', 'voto', 'en', 'dicho', 'evento']]


#### Creating Bigram and Trigram Models

In [13]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenized documents; min_count and threshold
# control how readily two adjacent tokens are merged into one phrase.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus, so it can extend an
# already merged pair with one more token. min_count is not passed here (library default).
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod and trigram_mod are the objects used by make_bigrams / make_trigrams.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# Applies the bigram model first and the trigram model on top, on the first document only.
print(trigram_mod[bigram_mod[data_words[0]]])

['ricky_martin', 'one', 'of', 'the', 'biggest', 'hispanic', 'pop', 'icon', 'openly', 'supporting', 'joe', 'biden', 'pleads', 'for', 'all', 'his', 'followers', 'to', 'support', 'joe', 'biden', 'for', 'presdident', 'el', 'artista', 'que', 'ha', 'expresado', 'en', 'mas', 'de', 'una', 'ocasion', 'su', 'contrariedad', 'la', 'politica', 'de', 'trump', 'participa', 'en', 'un', 'encuentro', 'del', 'candidato', 'democrata', 'con', 'los', 'latinos', 'luis', 'fonsi', 'eva_longoria', 'tambien', 'pidieron', 'su', 'voto', 'en', 'dicho', 'evento']


In [14]:
# Combined stop-word list: NLTK English and Spanish lists, followed by extra Spanish
# function words and a few frequent English fillers.
stop_words = [
    *stopwords.words('english'),
    *stopwords.words('spanish'),
    'su', 'sus', 'al', 'de', 'en', 'el',
    'like', 'would', 'get', 'many', 'much',
]

## Remove Stopwords, Make Bigrams and Lemmatize

In [15]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts,stop_words_lang):
    """Drop stop words from every document.

    Args:
        texts: iterable of documents; each one is converted with ``str`` and
            re-tokenized with ``simple_preprocess`` before filtering.
        stop_words_lang: iterable of words to remove.

    Returns:
        A list with one list of remaining tokens per document, in input order.
    """
    # Build the lookup once: testing membership against a list rescans the whole
    # list for every single token.
    stop_lookup = frozenset(stop_words_lang)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_lookup] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Returns a list with one transformed document per item of ``texts``.
    """
    phrased_docs = []
    for doc in texts:
        phrased_docs.append(bigram_mod[doc])
    return phrased_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both phrase models are read from module level. Returns a list with one
    transformed document per item of ``texts``.
    """
    phrased_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Each document (a sequence of tokens) is joined with spaces and run through the
    module-level ``nlp`` pipeline, which must be loaded before this is called.
    Returns one list of lemmas per document.

    See https://spacy.io/api/annotation
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

## Bag of Words

In [16]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words,stop_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the English spaCy pipeline without the parser and NER components (for efficiency).
# Install the model first if needed:
#   python3 -m spacy download en               (spaCy 2.x, creates the 'en' shortcut)
#   python3 -m spacy download en_core_web_sm   (spaCy 3.x)
# spaCy 3 removed shortcut links such as 'en', so fall back to the full package name
# when the shortcut cannot be resolved.
# NOTE(review): this is an English-only pipeline while the posts mix English and Spanish — confirm that is intended.
try:
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

[['ricky_martin', 'big', 'hispanic', 'pop', 'icon', 'openly', 'support', 'plead', 'follower', 'support', 'ocasion'], ['call', 'activism', 'need', 'sanity']]


In [17]:
def combine_words(word1,word2,dictionary):
    """Merge the counts of two keys into a single "word1 / word2" entry.

    When both keys are present, their values are added and stored under the
    combined key, the two original keys are removed, and the new entry is
    printed. Otherwise nothing changes. The mapping is modified in place and
    also returned.
    """
    both_present = word1 in dictionary and word2 in dictionary
    if not both_present:
        return dictionary

    merged_key = " / ".join((word1, word2))
    dictionary[merged_key] = dictionary[word1] + dictionary[word2]
    for stale_key in (word1, word2):
        dictionary.pop(stale_key, None)
    print(merged_key, dictionary[merged_key])
    return dictionary

In [18]:
# Flatten the per-document lemma lists into one list, then count occurrences.
flat_list_words = []
for document_lemmas in data_lemmatized:
    flat_list_words.extend(document_lemmas)
count_words = Counter(flat_list_words)

In [19]:
# Pairs of counter keys to fold into a single "first / second" entry.
# Order matters: 'american / america' and 'voters / vote' can only exist after the
# earlier pair that creates them has been merged.
word_pairs_to_merge = [
    ('donald', 'trump'),
    ('joe', 'biden'),
    ('kamala', 'harris'),
    ('american', 'america'),
    ('estados', 'unidos'),
    ('voters', 'vote'),
    ('mexico', 'mexican'),
    ('casa', 'blanca'),
    ('venezuela', 'venezuelans'),
    ('latino', 'latinos'),
    ('american / america', 'americans'),
    ('voters / vote', 'voting'),
    ('puerto', 'rico'),
    ('communism', 'communist'),
    ('ee', 'uu'),
    ('united', 'states'),
]
for first_word, second_word in word_pairs_to_merge:
    # combine_words updates count_words in place and returns it, so final_dict
    # refers to the same object as count_words.
    final_dict = combine_words(first_word, second_word, count_words)

joe / biden 429
latino / latinos 475
communism / communist 28


In [20]:
# Show up to 1000 of the most frequent entries of the merged counter, highest count first.
final_dict.most_common(1000)

[('trump', 824),
 ('vote', 475),
 ('latino / latinos', 475),
 ('joe / biden', 429),
 ('voter', 392),
 ('state', 271),
 ('people', 253),
 ('make', 234),
 ('election', 229),
 ('say', 224),
 ('go', 187),
 ('hispanic', 173),
 ('support', 169),
 ('campaign', 157),
 ('plan', 151),
 ('know', 147),
 ('democratic', 145),
 ('community', 129),
 ('help', 129),
 ('want', 125),
 ('win', 123),
 ('white', 116),
 ('take', 112),
 ('month', 107),
 ('need', 99),
 ('see', 98),
 ('live', 98),
 ('show', 97),
 ('time', 94),
 ('life', 94),
 ('share', 94),
 ('come', 94),
 ('downplay', 94),
 ('also', 93),
 ('group', 90),
 ('country', 88),
 ('lose', 88),
 ('year', 87),
 ('call', 85),
 ('former', 85),
 ('black', 83),
 ('debate', 83),
 ('care', 83),
 ('poll', 82),
 ('tell', 82),
 ('american', 80),
 ('virus', 80),
 ('accord', 79),
 ('well', 76),
 ('spend', 73),
 ('event', 73),
 ('family', 72),
 ('president', 72),
 ('point', 72),
 ('question', 71),
 ('first', 70),
 ('talk', 70),
 ('pandemic', 70),
 ('key', 69),
 ('ri

## TF-IDF

In [21]:
# Re-join each stop-word-free token list into one space-separated string per post.
docs = list(map(' '.join, data_words_nostops))

In [22]:
# Learn the vocabulary from docs and build the document-term count matrix
# (one row per document, one column per vocabulary term).
cv=CountVectorizer() 
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(640, 6541)

In [23]:
# Fit the IDF weights on the raw term counts; smoothing and IDF weighting are enabled explicitly.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [24]:
# Table of IDF weights indexed by vocabulary term.
# CountVectorizer.get_feature_names() was removed in newer scikit-learn releases in
# favour of get_feature_names_out(); use the new method when it exists and fall back
# to the old one otherwise.
_get_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=_get_feature_names(), columns=["idf_weights"])

# sort ascending
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
biden,1.232548
latinos,1.246423
trump,1.739444
joe,1.786276
vote,2.092391
...,...
incomes,6.769882
inconvenient,6.769882
incorporated,6.769882
improve,6.769882


In [25]:
# count matrix 
# Uses the vocabulary already learned by cv; docs are the same documents cv was fitted on.
count_vector=cv.transform(docs) 
 
# tf-idf scores 
# Re-weights the counts with the IDF weights fitted earlier.
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [26]:
# CountVectorizer.get_feature_names() was removed in newer scikit-learn releases in
# favour of get_feature_names_out(); use the new method when it exists and fall back
# to the old one otherwise.
_get_feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_names = _get_feature_names()

#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]

# One row per vocabulary term holding its tf-idf score in the first document.
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix type).
df = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
# Keep only the terms that actually occur in the document (non-zero score).
df = df[df["tfidf"] != 0]
# The CSV is written sorted by score; df itself keeps the vocabulary order.
df.sort_values(by=["tfidf"],ascending=False).to_csv(name+"_tfidf.csv")

In [27]:
# Display the non-zero tf-idf terms of the first document.
df

Unnamed: 0,tfidf
artista,0.196285
biden,0.076026
biggest,0.141025
candidato,0.170153
contrariedad,0.196285
democrata,0.196285
dicho,0.196285
encuentro,0.196285
eva,0.170153
evento,0.174908


In [28]:
# NOTE(review): df is not sorted (only the CSV copy was), so index[0] is the first
# non-zero term in vocabulary order, not the highest-scoring one — confirm this is intended.
word = df.index[0]
# Vectorized lower-casing; unlike apply(lambda x: x.lower()) it does not fail on missing values.
content["lowercase"] = content['MessageDescr'].str.lower()
# Literal substring match: regex=False keeps the term from being interpreted as a
# pattern, na=False treats missing text as "no match".
contains_word = content['lowercase'].str.contains(word, regex=False, na=False)
word_messages = content[contains_word]

# Count how many times each distinct (lower-cased) message containing the word appears.
word_messages = word_messages['lowercase'].value_counts(ascending=False).rename_axis('unique_messages').reset_index(name='counts')
word_messages.to_csv(word +"_"+ name+"_messages.csv")
word_messages

Unnamed: 0,unique_messages,counts
0,ricky martin one of the biggest hispanic pop i...,2
1,for all latinos in miami who support biden... ...,1
2,https://www.youtube.com/watch?v=i2ynxayspew es...,1


### Repeated messages

In [29]:
# Strip commas from Total Interactions before casting the column to int.
content['Total Interactions']=content['Total Interactions'].astype(str).str.replace(',', '').astype(int)

# Per distinct message text: number of posts, the joined group names, and the
# summed engagement counters.
summed_columns = ['Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
                  'Angry', 'Care', 'Total Interactions']
aggregations = {'Group Name': ['count', ' |'.join], 'MessageDescr': 'count'}
for column in summed_columns:
    aggregations[column] = 'sum'
results = content.groupby(['MessageDescr']).agg(aggregations)
results = results.rename(columns={"MessageDescr": "Counts", 'Group Name': "Num.Groups"})

# NOTE(review): with name = "LatinosForBiden" this writes
# "mostpopularLatinosForLatinosForBiden.csv" — confirm the doubled prefix is intended.
results.to_csv("mostpopularLatinosFor"+name+".csv")

In [30]:
# Display the per-message aggregation. The commented-out line is an alternative view
# that sorts the table by the 'Counts' column instead.
#results.sort_values(by='Counts',ascending=False)
results

Unnamed: 0_level_0,Num.Groups,Num.Groups,Counts,Likes,Comments,Shares,Love,Wow,Haha,Sad,Angry,Care,Total Interactions
Unnamed: 0_level_1,count,join,count,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
MessageDescr,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
,3,"social democrats usa-socialist party, usa |rid...",3,10,22,4,3,0,0,0,0,0,39
"""Joe Biden is a puppet of the extreme left who is now promoting the same kinds of policies our families fled. As the daughter of immigrants, I know that is not the future we want for our families, nor for the United States of America."" - Senior Advisor Mercedes Schlapp \n\n#LatinosForTrump",1,ridin' with biden,1,0,0,0,0,0,0,0,0,0,0
"""Voter turnout among Latinos in Florida could mean the difference for a Biden-Harris win in Florida,"" Bloomberg said in a statement.",1,cubanos con biden,1,9,0,1,2,0,0,0,0,0,12
#TodosConBiden,1,cubanos con biden,1,11,0,1,2,0,0,0,0,0,14
"(CNN)Former President Barack Obama delivered an often-incredulous and blistering account of his successor's first four years in office on Wednesday in Philadelphia, making his most direct attacks on President Donald Trump to date both on substance and on a personal level.\n\nThe event is Obama's first stump speech for his former vice president, a welcome sight to Democrats who see the former president as Joe Biden's most potent character witness and a key factor in encouraging Black men, Latinos and younger voters to turn out and vote.\nObama's speech represented his most direct attacks on Trump to date, with the former Democratic leader leveling both substantive critiques -- like questioning Trump's tax policy and handling of the coronavirus pandemic -- and personal barbs, jabbing at shrinking ratings for the President's speeches and town halls. The former President wasted no time lacing into Trump, opening the remarks by mocking him for telling an audience in Erie, Pennsylvania, on Tuesday night that he wouldn't have visited the area if not for the coronavirus hurting his political fortunes.\nThe remarks drilled down on years of Democratic concerns about the President, with Obama arguing Trump's presidency has not only changed the way other countries view the United States but remade the way Americans feel about politics.\nContent by BooHoo\nBoohoo's latest collaboration is here\nBoohoo — the global online fashion store trusted by millions of the best-dressed people on the planet, including your most stylish friend\n""I never thought Donald Trump would embrace my vision or continue my polices, but I did hope for the sake of the country, that he might show some interest in taking the job seriously,"" Obama said. ""But it hasn't happened. 
He hasn't showed any interest in doing the work or helping anybody but himself and his friends.""\nThe former President directly attacked Trump's handling of the coronavirus, the issue that is dominating the 2020 campaign.\nHe noted that Trump recently said that there is ""not much"" he would change about the US response to the pandemic that has killed over 220,000 people in the United States.\n""Really?"" Obama asked. ""Not much? Nothing you can think of that could have helped some people keep their loved ones alive?""\nAs Obama spoke at the drive-in rally, standing against the backdrop of Lincoln Financial Field, people sounded air horns and car horns in approval throughout his speech. People stood at a distance, waving flags and banners against the setting sun.\n""This is not a reality show -- this is reality,"" Obama added, taking particular aim at Trump on the coronavirus crisis. ""Four years ago, you'd be tailgating here instead of watching a speech from your car.""\nObama's remarks will undoubtedly anger Trump, who has continued to attack his predecessor even after almost four years removed from his last day in office.\nAfter Obama suggested Trump uses the presidency to boost his own profile, he added, ""Even then, his TV ratings are down. So you know that upsets him.""\nAnd when he noted that Trump inherited a booming economy from him, Obama added, but ""just like everything else he inherited, he messed it up.""\nThe speech also showed Obama is closely watching the day-to-day news about Trump, including a Tuesday report that the President maintains a Chinese bank account.\n""How is that possible? A secret Chinese bank account,"" Obama asked.\nThen he reflected on the way that conservative media and Trump treated him during his time in office.\n""Listen, can you imagine if I had had a secret Chinese bank account when I was running for reelection?"" Obama asked. ""You think Fox News might have been a little concerned about that? 
They would have called me Beijing Barry.""\nObama also touted Biden and his running mate, California Sen. Kamala Harris, throughout the speech, saying that while he didn't know Biden well when they both served in the Senate, he ""came to admire Joe as a man who has learned early on to treat everyone that he meets with dignity and respect.""\nObama said Biden would ""never call the men and women of our military suckers and losers,"" citing a bombshell report on Trump from The Atlantic. He said Trump has ""emboldened"" racists. Obama also incredulously recounted instances of Trump retweeting conspiracy theories, indirectly mentioning the false QAnon claim that says there is a ""secret cabal running the world.""\n""It just won't be so exhausting,"" Obama said of a Biden administration, adding voters are ""not going to have to think about the crazy things ... and that is worth a lot.""\nBack in the game\nObama, after largely sitting out the Democratic primary, has steadily ramped up his anti-Trump rhetoric, after years of trying to hew to a longstanding tradition that former presidents avoid attacking their successors. Trump, however, changed that calculation and Obama has ramped up his critiques of the President.\n""Former presidents tend not to delve too deeply into politics and certainly (truncated)",1,why i love joe biden-kamala harris / why i hat...,1,11,1,0,3,0,0,0,0,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
👏🏻👏🏻👏🏻💙💙💙LATINOS FOR BIDEN 🇺🇸🇺🇸👊🏻👊🏻🇺🇸🇵🇷🇨🇺🇨🇴🇪🇨🇦🇷🇩🇴🇻🇪🇧🇷🇨🇱🇵🇪🇸🇻🇺🇾🇲🇽,1,cubanos con biden,1,15,0,4,9,0,0,0,0,0,28
💙 Official Webstore of Biden for President,1,joe biden for colorado,1,16,0,1,10,0,0,0,0,0,27
"📍Miami-Dade, FL #TodosConBidenCaravan 📍Miami-Dade, FL\n\n#TodosConBidenCaravan a massive show of support by Latinos who support Joe Biden and Kamala Harris! The support for Joe and Democrats on the ballot is strong and powerful! \n\n#BidenForFL",6,biden for florida |joe biden for florida |cuba...,6,332,51,54,144,0,0,0,0,2,583
😂😂😂😂😂 Kamala Harris made a surprise visit to Miami on Thursday as Joe Biden hid in his basement bunker. Joe Biden has a serious problem among Florida Hispanic voters. Latinos from South America and Cuba who have suffered under Socialist and Communist regimes have come to the US for a better life and they....,1,americans against democratic and republican party,1,1,0,0,0,0,1,0,0,0,2
