In [1]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# User count

In [None]:
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT count(DISTINCT([user.screen_name])) FROM tweets', conn)

conn.close()

print('Total number of distinct Twitter users is:\n'+str(df.iloc[0][0]))

# Clustering

In [27]:
lmtz = WordNetLemmatizer()


def lemmatize(word):
    if word.startswith('#'):
        return word
    
    lemma = lmtz.lemmatize(word, 'v')
    if lemma == word:
        lemma = lmtz.lemmatize(word, 'n')
    return lemma


def strip_punc(s):
    return ''.join([c for c in s if c.isalpha()])

In [None]:
stop_words = [strip_punc(w) for w in stopwords.words('english')]
stop_words.extend([
    'i',
    'u',
    'r',
    'im',
    'cant',
    'would',
    'via',
    'today',
    'thing',
    'make',
    'talk',
    'due',
    'day',
    'month',
    'find',
    'show',
    'put',
    'part',
    'time',
    'yeah',
    'deal',
    'big',
    'level',
    'focus',
    'theyre',
    'list',
    'top',
    'give',
    'situation',
    'lot',
    'hold',
    'number',
    'include',
    'form',
    'back',
    'involve',
    'link',
    'real',
    'get',
    'go',
    'have',
    'do',
    'take',
    'time','year','month','week','day','say'
])

In [None]:
def clean_text(text):
    cleaned_text = ''
    for token in text.split():
        
        # Cleaning
        if token[0] in ['@','$','%','^','&','*'] or token.startswith('http'):
            continue

        # Remove puctuations, lower case
        token = strip_punc(token.lower())
        
        # Lemmatize
        lemma = lemmatize(token)

        if lemma and lemma not in stop_words:
            cleaned_text += lemma + ' '
    
    return cleaned_text.strip()

In [None]:
docs = []
raw_docs = []
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT text, [extended_tweet.full_text] FROM tweets where created_at like "%Dec%" and created_at like "%2014"', conn)
conn.close()

for i, row in df.iterrows():
    text = ''
    if row['extended_tweet.full_text']:
        text = clean_text(row['extended_tweet.full_text'])
        raw_docs.append(row['extended_tweet.full_text'])
    else:
        text = clean_text(row['text'])
        raw_docs.append(row['text'])
    if text:
        docs.append(text)

print(len(docs), docs[0])

In [None]:
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

# get the first vector out (target document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]



# # place tf-idf values in a pandas data frame
# df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
# print(df.sort_values(by=["tfidf"],ascending=False))

# KMEANS

In [None]:
from sklearn.cluster import KMeans

In [None]:
km_model = KMeans(n_clusters=10)
km_model.fit(tfidf_vectorizer_vectors)

In [None]:
import collections

In [None]:
clustering = collections.defaultdict(list)
 
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

In [None]:
for label in clustering:
    scores = {}
    for idx in clustering[label]:
        scores[idx] = np.sum(tfidf_vectorizer_vectors[idx])
        
    print('-----------------------------------------\nCluster '+str(label)+'\n')
    for idx in list(reversed(sorted(scores, key=scores.get)))[:10]:
        print(raw_docs[idx], '\n')
    print('-----------------------------------------\n\n')

# Event Sample Tweets

In [32]:
events_df = pd.read_csv('middle-data/events.csv')

In [4]:
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [33]:
def get_query(dt):
    [month, day, year] = dt.strip().split('/')
    if len(day) < 2:
        day = '0' + day
    month_label = month_labels[int(month)-1]
    return 'SELECT text, [extended_tweet.full_text] FROM tweets WHERE created_at like "%' + year + '" AND created_at like "%' + month_label + ' ' + day + '%" ORDER BY retweet_count+reply_count DESC LIMIT 5'

In [34]:
tweets_df = pd.DataFrame(columns=['date','sample_tweet'])

i = 0
conn = sqlite3.connect('dpc.db')

for _, row in events_df.iterrows():
    if row['size'] > 0:
        query = get_query(row['date'])
        tmp_df = pd.read_sql(query, conn)
        for _, tmp_row in tmp_df.iterrows():
            text = ''
            if tmp_row['extended_tweet.full_text']:
                text = tmp_row['extended_tweet.full_text']
            else:
                text = tmp_row['text']
            if text:
                tweets_df.loc[i] = [row['date'], text]
                i += 1
                
conn.close()

In [35]:
tweets_df.to_csv('output/event_tweet/sample_tweets.csv')

# Dashboard Data

In [96]:
import json
import ast
from math import isnan

# Geo

In [97]:
conn = sqlite3.connect('dpc.db')

df = pd.read_sql('SELECT created_at, [user.derived.locations] FROM tweets', conn)

conn.close()
df['created_at'] = list(map(lambda x: x.split()[-1], df['created_at']))
df.head()

Unnamed: 0,created_at,user.derived.locations
0,2014,"[{'country': 'Australia', 'country_code': 'AU'..."
1,2014,"[{'country': 'Australia', 'country_code': 'AU'..."
2,2014,"[{'country': 'Australia', 'country_code': 'AU'..."
3,2014,"[{'country': 'Australia', 'country_code': 'AU'..."
4,2014,"[{'country': 'Australia', 'country_code': 'AU'..."


In [98]:
df.columns = ['Year','Latitude']
df['Longitude'] = None
df.head()

Unnamed: 0,Year,Latitude,Longitude
0,2014,"[{'country': 'Australia', 'country_code': 'AU'...",
1,2014,"[{'country': 'Australia', 'country_code': 'AU'...",
2,2014,"[{'country': 'Australia', 'country_code': 'AU'...",
3,2014,"[{'country': 'Australia', 'country_code': 'AU'...",
4,2014,"[{'country': 'Australia', 'country_code': 'AU'...",


In [99]:
def is_aus_loc(s):
    if 'Australia' not in s:
        return False
    
    i = s.find('full_name')
    if i == -1:
        return False
    
    if s[i+13:].startswith('Australia'):
        return False
    
    return True

In [100]:
lats = []
longs = []
for i, row in df.iterrows():
    if not row['Latitude']:
        lats.append(None)
        longs.append(None)
        continue
        
    string = row['Latitude']
    start_idx = string.find("'geo'")
    
    if start_idx != -1 and is_aus_loc(string):
        subs = string[start_idx+7:]
        end_idx = subs.find('}')
        subs = subs[:end_idx+1]
        
        obj = json.loads(subs.replace("'", '"').replace('Decimal', '').replace('(','').replace(')',''))
        
        lats.append(obj['coordinates'][0])
        longs.append(obj['coordinates'][1])
    else:
        lats.append(None)
        longs.append(None)

In [101]:
df['Latitude'] = lats
df['Longitude'] = longs
df.head()

Unnamed: 0,Year,Latitude,Longitude
0,2014,145.76625,-16.92304
1,2014,151.20732,-33.86785
2,2014,144.96667,-37.83333
3,2014,138.59863,-34.92866
4,2014,138.59863,-34.92866


In [106]:
counts = {}
for _,row in df.iterrows():
    y = str(row['Year'])
    lat = str(row['Latitude'])
    long = str(row['Longitude'])

    if lat != 'None' and long != 'None':
        counts[(y,lat,long)] = counts.get((y,lat,long),0) + 1
    
len(counts)

1275

In [108]:
df1 = pd.DataFrame(columns=['Year','Lat','Long','Count'])

i = 0
for (y,lat,long) in counts:
    df1.loc[i] = [y,lat,long,counts[(y,lat,long)]]
    i += 1
    
df1.to_csv('output/dashboard/year_geo.csv')

### Hashtag cloud

In [79]:
conn = sqlite3.connect('dpc.db')
df = pd.read_sql('SELECT created_at, text, [extended_tweet.full_text] FROM tweets', conn)

df['created_at'] = list(map(lambda x: x.split()[-1], df['created_at']))
conn.close()
df.head()

Unnamed: 0,created_at,text,extended_tweet.full_text
0,2014,Domestic violence up during Christmas.1/4 wome...,
1,2014,Dr. Phil and His Viewers Don&amp;#039;t See Ey...,
2,2014,Loved all @charliepick's @Mamamia articles so ...,
3,2014,@charliepick fantastic article. Well said. But...,
4,2014,Well said... I feel a blog post coming on htt...,


In [80]:
tag_counts = {}
for _, row in df.iterrows():
    text = ''
    if row['extended_tweet.full_text']:
        text = row['extended_tweet.full_text']
    else:
        text = row['text']
    if text:
        for token in text.split():
            if token[0] == '#':
                tag = strip_punc(token).upper()
                y = row['created_at']
                tag_counts[(y,tag)] = tag_counts.get((y,tag), 0) + 1

In [81]:
with open('output/dashboard/year_tag_count.csv', 'w') as f:
    f.write('year,tag,count\n')
    
    for (y,tag) in list(sorted(tag_counts, key=tag_counts.get, reverse=True)):
        f.write('{},{},{}\n'.format(y, tag, tag_counts[(y,tag)]))

# 10 t per Day

In [114]:
def get_query(y,m,d):
    year = str(y)
    month_label = month_labels[int(m)-1]
    day = str(d)
    if len(day) < 2:
        day = '0'+day
    return 'SELECT created_at, text, [extended_tweet.full_text], [user.name], [user.screen_name], [user.profile_image_url] FROM tweets WHERE created_at like "%' + year + '" AND created_at like "%' + month_label + ' ' + day + '%" ORDER BY retweet_count+reply_count DESC LIMIT 10'

In [169]:
conn = sqlite3.connect('dpc.db')
# i = 0

# df = pd.DataFrame()

# for y in range(2014,2019):
#     for m in range(1,13):
#         print(m, end = ' ')
#         for d in range(1, 32):
#             query = get_query(y,m,d)
#             dft = pd.read_sql(query, conn)
#             df = df.append(dft)

# Topic Tweets
df = pd.DataFrame(columns=['Key','Date','Text','Username','Screen Name','Image'])

i = 0
for year in range(2014,2019):
    df1 = pd.read_csv('output/topic_tweets/{}_topic_tweets.csv'.format(year))
    df2 = pd.read_sql('SELECT created_at, text, [extended_tweet.full_text], [user.name], [user.screen_name], [user.profile_image_url] FROM tweets WHERE created_at like "%{}"'.format(year), conn)
    for _, row in df1.iterrows():
        key = row['Key']
        doc_id = row['Doc_Id']
        data = df2.loc[doc_id]
        date = transform_date(data['created_at'])
        if data['extended_tweet.full_text']:
            text = data['extended_tweet.full_text']
        else:
            text = data['text']
        
        df.loc[i] = [key,date,text,data['user.name'],data['user.screen_name'],data['user.profile_image_url']]
        i += 1
    
conn.close()

df.shape

(400, 6)

In [170]:
df.to_csv('output/dashboard/topic_tweets.csv')

In [142]:
df.columns = ['Date','Text','Ftext','Username','Screen Name','Profile Image']
df.reset_index(inplace=True)
df.head()

ValueError: Length mismatch: Expected axis has 7 elements, new values have 6 elements

In [139]:
texts = []
for i, row in df.iterrows():
    if row['Ftext']:
        texts.append(row['Ftext'])
    else:
        texts.append(row['Text'])
    if i % 1000 == 0:
        print(i, end=' ')

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

In [140]:
dft = df.copy()
dft['Text'] = texts

dft.drop(['Ftext'], axis=1, inplace=True)
dft.head()

Unnamed: 0,index,Date,Text,Username,Screen Name,Profile Image
0,0,Wed Jan 01 11:27:40 +0000 2014,Domestic violence up during Christmas.1/4 wome...,Lynne Reid,generalpatience,http://pbs.twimg.com/profile_images/6020358491...
1,0,Sun Jan 05 02:09:50 +0000 2014,Dr. Phil and His Viewers Don&amp;#039;t See Ey...,Supporting Men,SupportingMen,http://pbs.twimg.com/profile_images/1093659755...
2,0,Wed Jan 08 23:27:19 +0000 2014,Loved all @charliepick's @Mamamia articles so ...,hailseb,hailsbester,http://pbs.twimg.com/profile_images/8539419109...
3,0,Thu Jan 09 04:39:49 +0000 2014,@charliepick fantastic article. Well said. But...,Deanna Napier,DeannaN_MTA,http://pbs.twimg.com/profile_images/2901194542...
4,1,Thu Jan 09 13:35:54 +0000 2014,Well said... I feel a blog post coming on htt...,Jarrod Lamshed,jlamshed,http://pbs.twimg.com/profile_images/9676344747...


In [143]:
dft.drop(['index'], axis=1, inplace=True)

In [144]:
dft.head()

Unnamed: 0,Date,Text,Username,Screen Name,Profile Image
0,Wed Jan 01 11:27:40 +0000 2014,Domestic violence up during Christmas.1/4 wome...,Lynne Reid,generalpatience,http://pbs.twimg.com/profile_images/6020358491...
1,Sun Jan 05 02:09:50 +0000 2014,Dr. Phil and His Viewers Don&amp;#039;t See Ey...,Supporting Men,SupportingMen,http://pbs.twimg.com/profile_images/1093659755...
2,Wed Jan 08 23:27:19 +0000 2014,Loved all @charliepick's @Mamamia articles so ...,hailseb,hailsbester,http://pbs.twimg.com/profile_images/8539419109...
3,Thu Jan 09 04:39:49 +0000 2014,@charliepick fantastic article. Well said. But...,Deanna Napier,DeannaN_MTA,http://pbs.twimg.com/profile_images/2901194542...
4,Thu Jan 09 13:35:54 +0000 2014,Well said... I feel a blog post coming on htt...,Jarrod Lamshed,jlamshed,http://pbs.twimg.com/profile_images/9676344747...


In [148]:
def transform_date(s):
    splits = s.split()
    m = splits[1]
    d = splits[2]
    y = splits[-1]
    m = str(month_labels.index(m)+1)
    if d[0] == '0':
        d = d[1]
    return '/'.join([m,d,y])

In [151]:
dft['Date'] = list(map(lambda x: transform_date(x), dft['Date']))
dft.head()

Unnamed: 0,Date,Text,Username,Screen Name,Profile Image
0,1/1/2014,Domestic violence up during Christmas.1/4 wome...,Lynne Reid,generalpatience,http://pbs.twimg.com/profile_images/6020358491...
1,1/5/2014,Dr. Phil and His Viewers Don&amp;#039;t See Ey...,Supporting Men,SupportingMen,http://pbs.twimg.com/profile_images/1093659755...
2,1/8/2014,Loved all @charliepick's @Mamamia articles so ...,hailseb,hailsbester,http://pbs.twimg.com/profile_images/8539419109...
3,1/9/2014,@charliepick fantastic article. Well said. But...,Deanna Napier,DeannaN_MTA,http://pbs.twimg.com/profile_images/2901194542...
4,1/9/2014,Well said... I feel a blog post coming on htt...,Jarrod Lamshed,jlamshed,http://pbs.twimg.com/profile_images/9676344747...


In [152]:
dft.to_csv('output/dashboard/tweets_per_day.csv')

# Events only

In [153]:
df = pd.read_csv('middle-data/events.csv')
df.head()

Unnamed: 0,date,count,size,description
0,1/1/2014,1,0,
1,1/2/2014,0,0,
2,1/3/2014,0,0,
3,1/4/2014,0,0,
4,1/5/2014,1,0,


In [160]:
ids = []
dates = []
des = []
images = []

i = 1
for _, row in df.iterrows():
    if row['size'] > 0:
        ids.append(i)
        i += 1
        dates.append(row['date'])
        des.append(row['description'])
        images.append('placeholder')

dft = pd.DataFrame.from_dict(dict(Index=ids,Date=dates,Event=des,Image_URL=images))
dft.head()

Unnamed: 0,Index,Date,Event,Image_URL
0,1,1/10/2014,"On January 8th, Charlie Pickering published an...",placeholder
1,2,2/11/2014,Simon Glittany murder case.,placeholder
2,3,2/27/2014,Not many common threads. Some posts were about...,placeholder
3,4,3/10/2014,Fairfax reported violence against women incide...,placeholder
4,5,3/25/2014,Project TB interviewed a victim 'Jane' on fami...,placeholder


In [161]:
dft.set_index('Index', inplace=True)
dft.head()

Unnamed: 0_level_0,Date,Event,Image_URL
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1/10/2014,"On January 8th, Charlie Pickering published an...",placeholder
2,2/11/2014,Simon Glittany murder case.,placeholder
3,2/27/2014,Not many common threads. Some posts were about...,placeholder
4,3/10/2014,Fairfax reported violence against women incide...,placeholder
5,3/25/2014,Project TB interviewed a victim 'Jane' on fami...,placeholder


In [162]:
dft.to_csv('output/event_image/event_images.csv')

# Card Browser

In [163]:
df = pd.read_csv('middle-data/year_top_image.csv')
df.head()

Unnamed: 0,Year,Ent1,Ent2,Ent3,Ent4,Ent5
0,2014,https://pbs.twimg.com/profile_images/108235881...,https://pbs.twimg.com/profile_images/750529809...,https://pbs.twimg.com/profile_images/560633309...,https://pbs.twimg.com/profile_images/108308108...,https://static-s.aa-cdn.net/img/ios/1411569220...
1,2015,https://pbs.twimg.com/profile_images/108235881...,https://pbs.twimg.com/profile_images/874429610...,https://static-s.aa-cdn.net/img/ios/1411569220...,https://pbs.twimg.com/profile_images/278676236...,https://pbs.twimg.com/profile_images/967839415...
2,2016,https://pbs.twimg.com/profile_images/108235881...,https://pbs.twimg.com/profile_images/967839415...,https://static-s.aa-cdn.net/img/ios/1411569220...,https://pbs.twimg.com/profile_images/113321698...,https://pbs.twimg.com/profile_images/278676236...
3,2017,https://pbs.twimg.com/profile_images/108235881...,https://pbs.twimg.com/profile_images/967839415...,https://pbs.twimg.com/profile_images/766095411...,https://pbs.twimg.com/profile_images/948089203...,https://pbs.twimg.com/profile_images/278676236...
4,2018,https://pbs.twimg.com/profile_images/108235881...,https://pbs.twimg.com/profile_images/111608152...,https://pbs.twimg.com/profile_images/114878229...,https://static-s.aa-cdn.net/img/ios/1411569220...,https://www.parliament.vic.gov.au/images/membe...


In [164]:
years = []
ranks = []
names = []
images = []
bios = []

for _, row in df.iterrows():
    for i in range(1,6):
        years.append(row['Year'])
        ranks.append(i)
        names.append('?')
        images.append(row['Ent'+str(i)])
        bios.append('?')
        
len(images)

25

In [165]:
df1 = pd.DataFrame.from_dict(dict(Year=years,Rank=ranks,Name=names,Image=images,Bio=bios))
df1

Unnamed: 0,Year,Rank,Name,Image,Bio
0,2014,1,?,https://pbs.twimg.com/profile_images/108235881...,?
1,2014,2,?,https://pbs.twimg.com/profile_images/750529809...,?
2,2014,3,?,https://pbs.twimg.com/profile_images/560633309...,?
3,2014,4,?,https://pbs.twimg.com/profile_images/108308108...,?
4,2014,5,?,https://static-s.aa-cdn.net/img/ios/1411569220...,?
5,2015,1,?,https://pbs.twimg.com/profile_images/108235881...,?
6,2015,2,?,https://pbs.twimg.com/profile_images/874429610...,?
7,2015,3,?,https://static-s.aa-cdn.net/img/ios/1411569220...,?
8,2015,4,?,https://pbs.twimg.com/profile_images/278676236...,?
9,2015,5,?,https://pbs.twimg.com/profile_images/967839415...,?


In [166]:
df1.to_csv('output/dashboard/top_ent_cards.csv')