In [13]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# User count

In [5]:
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT count(DISTINCT([user.screen_name])) FROM tweets', conn)

conn.close()

print('Total number of distinct Twitter users is:\n'+str(df.iloc[0][0]))

Total number of distinct Twitter users is:
28199


# Clustering

In [210]:
lmtz = WordNetLemmatizer()


def lemmatize(word):
    if word.startswith('#'):
        return word
    
    lemma = lmtz.lemmatize(word, 'v')
    if lemma == word:
        lemma = lmtz.lemmatize(word, 'n')
    return lemma


def strip_punc(s):
    if s[0] == '#':
        return s
    return ''.join([c for c in s if c.isalpha()])

In [211]:
stop_words = [strip_punc(w) for w in stopwords.words('english')]
stop_words.extend([
    'i',
    'u',
    'r',
    'im',
    'cant',
    'would',
    'family',
    'domestic',
    'violence',
    'australia',
    'australian',
    'dv',
    'fv',
    'au',
    'via',
    'today',
    'thing',
    'make',
    'talk',
    'due',
    'day',
    'month',
    'find',
    'show',
    'put',
    'part',
    'time',
    'yeah',
    'deal',
    'big',
    'level',
    'focus',
    'theyre',
    'list',
    'top',
    'give',
    'situation',
    'lot',
    'hold',
    'number',
    'include',
    'form',
    'back',
    'involve',
    'link',
    'real',
    'get',
    'go',
    'have',
    'do',
    'take',
    'time','year','month','week','day','say'
])

In [212]:
def clean_text(text):
    cleaned_text = ''
    for token in text.split():
        
        # Cleaning
        if token[0] in ['@','$','%','^','&','*'] or token.startswith('http'):
            continue

        # Remove puctuations, lower case
        token = strip_punc(token.lower())
        
        # Lemmatize
        lemma = lemmatize(token)

        if lemma and lemma not in stop_words:
            cleaned_text += lemma + ' '
    
    return cleaned_text.strip()

In [273]:
docs = []
raw_docs = []
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT text, [extended_tweet.full_text] FROM tweets where created_at like "%Nov%" and created_at like "%2018"', conn)
conn.close()

for i, row in df.iterrows():
    text = ''
    if row['extended_tweet.full_text']:
        text = clean_text(row['extended_tweet.full_text'])
        raw_docs.append(row['extended_tweet.full_text'])
    else:
        text = clean_text(row['text'])
        raw_docs.append(row['text'])
    if text:
        docs.append(text)

print(len(docs), docs[0])

2181 deadly message appear


In [274]:
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

# get the first vector out (target document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]



# # place tf-idf values in a pandas data frame
# df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
# print(df.sort_values(by=["tfidf"],ascending=False))

# KMEANS

In [122]:
from sklearn.cluster import KMeans

In [279]:
km_model = KMeans(n_clusters=5)
km_model.fit(tfidf_vectorizer_vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [125]:
import collections

In [280]:
clustering = collections.defaultdict(list)
 
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

In [281]:
for label in clustering:
    scores = {}
    for idx in clustering[label]:
        scores[idx] = np.sum(tfidf_vectorizer_vectors[idx])
        
    print('-----------------------------------------\nCluster '+str(label)+'\n')
    for idx in list(reversed(sorted(scores, key=scores.get)))[:10]:
        print(raw_docs[idx], '\n')
    print('-----------------------------------------\n\n')

-----------------------------------------
Cluster 4

After a deadly month for domestic violence, the message doesn't appear to be getting through https://t.co/uo1woxut9R via @ConversationEDU 

@BarbyWT @GenderANU @sanambna @wpscoalition Indeed @BarbyWT, I advocate on domestic violence too. That doesn't mean women elsewhere don't deserve justice too though. 

After a deadly month for domestic violence, the message doesn't appear to be getting through https://t.co/FAZ9qtqiEd via @ConversationEDU 

.@RichardWilkins #askcharlie why him saying his speaking tour would be full of laughs causes grief to victims of domestic violence #vaw @CollectiveShout 

@ScottMorrisonMP Mr Morrison, STOP Charlie Sheen’s entry to Australia. We have an epidemic of lethal family violence with men killing women in high numbers during October. Sheen is a serial perpetrator &amp; the message your Government is providing is “he’s okay to speak publicly”! 

After a deadly month for domestic #violence, the message do

# LDA

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
np.random.seed(2018)

In [None]:
processed_docs = pd.Series([doc.split() for doc in docs])
processed_docs.head()

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)


In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]


In [None]:
num_topics = 3

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
    print()
    
    scores = {}
    for i in range(len(bow_corpus)):
        for index, score in sorted(lda_model_tfidf[bow_corpus[i]], key=lambda tup: -1*tup[1]):
            if index == idx:
                scores[i] = score
            break

    for i in list(reversed(sorted(scores, key=scores.get)))[:10]:
        print(raw_docs[i], '\n')
    
    print('---------------------------------------------------------------------------------------------------------')