In [13]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# User count

In [5]:
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT count(DISTINCT([user.screen_name])) FROM tweets', conn)

conn.close()

print('Total number of distinct Twitter users is:\n'+str(df.iloc[0][0]))

Total number of distinct Twitter users is:
28199


# Clustering

In [210]:
lmtz = WordNetLemmatizer()


def lemmatize(word):
    if word.startswith('#'):
        return word
    
    lemma = lmtz.lemmatize(word, 'v')
    if lemma == word:
        lemma = lmtz.lemmatize(word, 'n')
    return lemma


def strip_punc(s):
    if s[0] == '#':
        return s
    return ''.join([c for c in s if c.isalpha()])

In [317]:
stop_words = [strip_punc(w) for w in stopwords.words('english')]
stop_words.extend([
    'i',
    'u',
    'r',
    'im',
    'cant',
    'would',
    'via',
    'today',
    'thing',
    'make',
    'talk',
    'due',
    'day',
    'month',
    'find',
    'show',
    'put',
    'part',
    'time',
    'yeah',
    'deal',
    'big',
    'level',
    'focus',
    'theyre',
    'list',
    'top',
    'give',
    'situation',
    'lot',
    'hold',
    'number',
    'include',
    'form',
    'back',
    'involve',
    'link',
    'real',
    'get',
    'go',
    'have',
    'do',
    'take',
    'time','year','month','week','day','say'
])

In [212]:
def clean_text(text):
    cleaned_text = ''
    for token in text.split():
        
        # Cleaning
        if token[0] in ['@','$','%','^','&','*'] or token.startswith('http'):
            continue

        # Remove puctuations, lower case
        token = strip_punc(token.lower())
        
        # Lemmatize
        lemma = lemmatize(token)

        if lemma and lemma not in stop_words:
            cleaned_text += lemma + ' '
    
    return cleaned_text.strip()

In [389]:
docs = []
raw_docs = []
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT text, [extended_tweet.full_text] FROM tweets where created_at like "%Dec%" and created_at like "%2014"', conn)
conn.close()

for i, row in df.iterrows():
    text = ''
    if row['extended_tweet.full_text']:
        text = clean_text(row['extended_tweet.full_text'])
        raw_docs.append(row['extended_tweet.full_text'])
    else:
        text = clean_text(row['text'])
        raw_docs.append(row['text'])
    if text:
        docs.append(text)

print(len(docs), docs[0])

332 white ribbon police chief oath swearer stop domestic violence worry fix


In [390]:
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

# get the first vector out (target document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]



# # place tf-idf values in a pandas data frame
# df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
# print(df.sort_values(by=["tfidf"],ascending=False))

# KMEANS

In [122]:
from sklearn.cluster import KMeans

In [391]:
km_model = KMeans(n_clusters=10)
km_model.fit(tfidf_vectorizer_vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [125]:
import collections

In [392]:
clustering = collections.defaultdict(list)
 
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

In [393]:
for label in clustering:
    scores = {}
    for idx in clustering[label]:
        scores[idx] = np.sum(tfidf_vectorizer_vectors[idx])
        
    print('-----------------------------------------\nCluster '+str(label)+'\n')
    for idx in list(reversed(sorted(scores, key=scores.get)))[:10]:
        print(raw_docs[idx], '\n')
    print('-----------------------------------------\n\n')

-----------------------------------------
Cluster 7

And as long as we continue to cast family violence, sexual assault and mental ill health as victim weakness, @fehowarth, the belief persists 

TBC I don't for a moment think those who survive (or sadly succumb) to assault, family violence or mental ill health, are weak @Amelia_Draws 

Climate change, Ramp Up, family violence and refugees all get a gong in an imagined Australia ruled by Stella in the future. #valestella 

Humbled to join #Caulfield locals packing Impact for Women Christmas hampers, assisting women fleeing family violence http://t.co/QJCViG9zC9 

Alcohol, money fuel Christmas family violence: CASES of family violence in Victoria spike dramatica... http://t.co/gdySIXuZpP #Vic #News 

Governments all levels need to do something to curd Domestic violence police go home today with the sight of a 18mth to 15 yr olds Butchered 

Christmas Day saw a huge number of family disputes and domestic violence incidents requiring poli

# Geo

In [394]:
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT [user.derived.locations] FROM tweets', conn)

conn.close()

In [411]:
import json
import ast

In [425]:
data = {}

for location_ in df['user.derived.locations']:
    if location_:
        try:
            location = json.loads(location_.replace("'", '"').replace('Decimal', '').replace('(','').replace(')',''))[0]
            if not (location['full_name'] == 'Australia' or location['country'] != 'Australia'):
                coord = location['geo']['coordinates']
                lat = float(coord[1])
                long = float(coord[0])
                data[(lat, long)] = data.get((lat, long), 0) + 1
        except:
            pass

In [428]:
with open('output/geo/geo.csv', 'w') as f:
    f.write('Latitude,Longitude,Count\n')
    for (lat, long) in data:
        f.write('{},{},{}\n'.format(lat, long, data[(lat, long)]))