In [1]:
# Libraries
import os
from dotenv import find_dotenv, load_dotenv

import psycopg2 as pg
import pandas as pd

# Widen text columns so long tweet messages are not cut off when frames are displayed.
pd.set_option('display.max_colwidth', 200)


In [2]:
# Environment variables
# Load KEY=VALUE pairs from a discovered .env file (if any) into the process environment.
load_dotenv(find_dotenv())

# Fail here with a clear message instead of handing None to pg.connect() later.
database_url = os.environ.get('DATABASE_URL')
if not database_url:
    raise RuntimeError('DATABASE_URL is not set; define it in the environment or in a .env file')

In [3]:
# Database
# One connection for the whole notebook, plus a cursor for ad-hoc queries.
conn = pg.connect(dsn=database_url)
curr = conn.cursor()

In [4]:
# Total number of rows in the raw tweet table.
count_query = 'SELECT COUNT(*) FROM raw_tweets;'
curr.execute(count_query)
curr.fetchone()

(254406585,)

In [5]:
%timeit -n 1
sample_df = pd.read_sql('SELECT * FROM raw_tweets LIMIT 50000;', con=conn)

In [6]:
# Inspect the tweets: column dtypes first, then the first few rows.
column_types = sample_df.dtypes
print(column_types)
sample_df.head()

id                    int64
tweetID               int64
date         datetime64[ns]
message              object
username             object
userID                int64
language             object
longitude           float64
latitude            float64
retweet              object
dtype: object


Unnamed: 0,id,tweetID,date,message,username,userID,language,longitude,latitude,retweet
0,398289,848097694266609664,2017-04-01 04:00:21,سبحان الله والحمد لله والله أكبر ♻️,hebamagdy706,3192656896,en,103.11711,5.321724,
1,398290,848097695185215488,2017-04-01 04:00:22,RT : He just silenced every Muslim apologist – Israel Video Network,BuggerWorld,166581009,nl,101.59485,3.058207,RT
2,398291,848097695613014016,2017-04-01 04:00:22,اللهم اهدني فيمن هديت وعافني فيمن عافيت وتولني فيمن توليت ♻️,ftoooxoxo,849785622,ar,-84.157394,34.83752,
3,398292,848097695529132033,2017-04-01 04:00:22,Shocking and Stunning Treatment of Girls and Women that we are importing into the West,trend_auditor,419081441,en,112.7228,-7.4576,
4,398293,848097696078602241,2017-04-01 04:00:22,RT : Iranian man sentenced to death for 'insulting Islam' through messaging app,kathy101252,820049807653879808,en,101.38079,4.478324,RT


In [7]:
# Tweets per language tag in the sample (tags are counted exactly as stored).
sample_df['language'].value_counts()

ar       22252
en       21739
fr        1742
es         978
id         900
en-gb      488
de         418
nl         389
tr         351
it         137
en-GB      123
sv          79
pt          75
ru          63
pl          57
ja          46
ca          19
he          18
fi          17
ko          16
da          14
fa          11
hi           9
cs           9
no           8
th           7
en-AU        7
el           7
gl           3
zh-cn        3
hu           3
msa          3
zh-CN        2
eu           2
sr           1
hr           1
ro           1
xx-lc        1
nb           1
Name: language, dtype: int64

In [8]:
# Distinct language tags, in order of first appearance.
sample_df['language'].unique()

array(['en', 'nl', 'ar', 'id', 'fr', 'tr', 'es', 'en-gb', 'no', 'it',
       'pt', 'pl', 'ru', 'de', 'ja', 'en-GB', 'en-AU', 'he', 'ca', 'fi',
       'sv', 'da', 'el', 'th', 'eu', 'zh-CN', 'fa', 'cs', 'hi', 'ro',
       'gl', 'ko', 'zh-cn', 'xx-lc', 'hu', 'sr', 'msa', 'hr', 'nb'],
      dtype=object)

#### Count Null Columns

In [9]:
# Keep only the columns that contain at least one null, then count nulls per column.
has_nulls = sample_df.isnull().any()
null_cols = sample_df.columns[has_nulls]
sample_df[null_cols].isnull().sum()

message      135
retweet    36169
dtype: int64

#### Duplicate Users

In [13]:
# Tweets per username, most prolific accounts first.
# NOTE(review): the predicate that followed "language" was missing from the
# query (a bare `WHERE "language"` is not valid SQL). IS NOT NULL is a
# placeholder that keeps every row with a language tag -- confirm the
# intended filter.
# GROUP BY is required because username is selected next to an aggregate.
duplicated_users = pd.read_sql(
    '''
    SELECT username, count(username)
    FROM raw_tweets
    WHERE "language" IS NOT NULL
    GROUP BY username;
    ''',
    con=conn,
)
# Rebind instead of sorting in place so the cell gives the same result when re-run.
duplicated_users = duplicated_users.sort_values('count', ascending=False)
duplicated_users.head(20)

Unnamed: 0,username,count
508776,TII99,1042
547824,yammamamalek,926
237256,islamstoris,740
443281,saadalqahtani0,728
39619,alsamer4141,715
205722,haleem990079,665
11799,AAL_Tawi,634
260927,jory217,633
184283,freeforiraq,630
244707,JANA_MOHD,628


In [17]:
# Tweets in the sample that were posted by a single account.
is_target_user = sample_df['username'] == 'bho14321'
sample_df.loc[is_target_user, ['tweetID', 'message']].head(25)

Unnamed: 0,tweetID,message
62,848097722536259585,اللهم صل وسلم على نبينا محمد ♻️
4211,848460255168270336,سبحان الله وبحمده سبحان الله العظيم ♻️
6255,848641418805620736,أذكار الأذان:اللهم رب هذه الدعوة التامة والصلاة القائم ♻️
8076,848822518588854273,لا إله إلا أنت سبحانك إني كنت من الظالمين ♻️
9993,849003635669438464,أذكار الأذان:يدعو لنفسه بين الأذان والإقامة فإن الدعاء ♻️
12200,849184920350195712,الذكر عند الدخول المنزل:بسم الله ولجنا وبسم الله خرجنا ♻️
14301,849366138505113600,حسبي الله لا إله إلا هو عليه توكلت وهو رب العرش العظيم ♻️
18075,849728425661280260,اللهم أعذنا من عذاب القبر وعذاب جهنم ♻️
20061,849909684173721600,اللهم منزل الكتاب سريع الحساب اهزم الأحزاب ♻️
24159,850272009837715457,سبحان الله والحمد لله والله أكبر ♻️


In [None]:
# Filtering!
# Happens on three levels (most -> least coarse):
# 1. Messages: we need *something* (a non-null message)
# 2. Language: we need it to be an English variant
# 3. Duplicated tweets / spam users


---

### Vectors

In [18]:
from gensim.models import KeyedVectors
# Path to the word-vector file that is passed to KeyedVectors.load_word2vec_format
# (read as text, i.e. with binary=False).
word_f = '../models/word2vec/word.twitter.25d.txt'

In [19]:
# Load the vectors from the text-format (non-binary) word2vec file.
model = KeyedVectors.load_word2vec_format(fname=word_f, binary=False)

In [22]:
# Analogy probe: neighbours of 'woman' + 'king' - 'man'.
analogy_query = {'positive': ['woman', 'king'], 'negative': ['man']}
model.most_similar(**analogy_query)

[('meets', 0.8841923475265503),
 ('prince', 0.832163393497467),
 ('queen', 0.8257461190223694),
 ('’s', 0.8174097537994385),
 ('crow', 0.8134994506835938),
 ('hunter', 0.8131038546562195),
 ('father', 0.811583399772644),
 ('soldier', 0.8111359477043152),
 ('mercy', 0.8082392811775208),
 ('hero', 0.8082262873649597)]

In [23]:
# Seed terms whose embedding neighbours are explored further down.
keywords = [
    'immigrant',
    'immigration',
    'islam',
    'muslim',
    'refugee',
]

---

#### NLTK

In [33]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import SnowballStemmer

In [34]:
# Shared NLTK helpers: English stop-word list, Snowball stemmer, WordNet lemmatizer.
stops = stopwords.words('english')
stemmer = SnowballStemmer(language='english')
lemmas = WordNetLemmatizer()

In [37]:
def lang_root(s):
    """Return the ``(lemma, stem)`` pair for a word.

    The word is lower-cased and lemmatized with the module-level ``lemmas``
    object; the resulting lemma is then stemmed with the module-level
    ``stemmer``.
    """
    lemma = lemmas.lemmatize(s.lower())
    return lemma, stemmer.stem(lemma)

lang_root('carasses')

('carasses', 'carass')

In [51]:
# For every keyword, print its most-similar words for the positive and the
# negative query, once using the keyword itself and once using its stem.
for kw in keywords:
    lemma, stem = lang_root(kw)
    print(kw, lemma, stem)
    queries = (
        ('pos ', {'positive': [kw]}),
        ('pos (stem)', {'positive': [stem]}),
        ('neg ', {'negative': [kw]}),
        ('neg (stem)', {'negative': [stem]}),
    )
    for label, query in queries:
        neighbours = model.most_similar(**query)
        # Keep only the words; the similarity scores are dropped.
        print(label, [word for (word, _score) in neighbours])
    print(' ')

immigrant immigrant immigr
pos  ['pro-life', 'clergy', 'undocumented', 'migrant', 'communist', 'socialist', 'activist', 'jewish', 'circumcision', 'ugandan']
pos (stem) ['inven', 'pakist', 'attra', 'purpo', 'sağlanacak', 'envo', 'safet', 'onest', 'gover', 'di̇r']
neg  ['ينآقشك', 'ﺄخر', 'كاان', 'sonrio', 'ھذھ', 'اختلفنا', 'أسعدونا', 'بعضنا', 'ﻓﻳھا', 'مهماا']
neg (stem) ['*', 'foto', 'doraemon', 'sambil', 'nonton', 'lagu', 'gangnam', 'nyanyi', 'pengen', 'perahu']
 
immigration immigration immigr
pos  ['obamacare', 'legislation', 'policy', 'reform', 'reforms', 'laws', 'government', 'congress', 'labour', 'rights']
pos (stem) ['inven', 'pakist', 'attra', 'purpo', 'sağlanacak', 'envo', 'safet', 'onest', 'gover', 'di̇r']
neg  ['صرتي', 'ماكانت', '亡くなった方の名誉のため実名で報道させていただきます', 'ايامي', 'الصدف', 'احلامي', 'فعِش', 'تسُعدكَ', 'تبقين', 'ايامنا']
neg (stem) ['*', 'foto', 'doraemon', 'sambil', 'nonton', 'lagu', 'gangnam', 'nyanyi', 'pengen', 'perahu']
 
islam islam islam
pos  ['muslim', 'dakwah', 'jiha

***Get all English tweets***

> Because we might want to compare sentiment between different countries/dialects, e.g. England + USA vs. Australia.

In [56]:
# Peek at the first two characters of the language tag for a handful of rows.
prefix_query = 'SELECT substring(language from 1 for 2) FROM raw_tweets LIMIT 20;'
curr.execute(prefix_query)
curr.fetchall()

[('ar',),
 ('en',),
 ('en',),
 ('ar',),
 ('en',),
 ('ar',),
 ('ar',),
 ('en',),
 ('ar',),
 ('ar',),
 ('en',),
 ('ar',),
 ('en',),
 ('ar',),
 ('en',),
 ('en',),
 ('en',),
 ('en',),
 ('en',),
 ('en',)]

In [68]:
# Release database resources. Closing only the cursor would leave the
# connection (and its server-side session) open.
curr.close()
conn.close()