### Importing data gathered from twitter

In [49]:
import os
import sys
import glob

data_directory = os.path.join(os.path.dirname(os.getcwd()),'data/twitter')

files = glob.glob(os.path.join(data_directory,'*.csv'))
files

['/Users/sami/Projects/vote2017/data/twitter/fillon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/hamon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/lepen.csv',
 '/Users/sami/Projects/vote2017/data/twitter/macron.csv',
 '/Users/sami/Projects/vote2017/data/twitter/melenchon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/valls.csv']

In [50]:
import pandas as pd
import csv
from gensim import corpora
from gensim.models.word2vec import LineSentence
sys.path.append(os.path.dirname(os.getcwd()))
from utils.preprocessing import preprocess_sentence

corpus = {}

for file in files:
    print('reading ',file)
    df = pd.read_csv(file, sep=';', quoting=csv.QUOTE_ALL)
    text = df.text.values
    tweets = []
    for t in text:
        tweets.append(preprocess_sentence(t))
    name = file.split('/')[-1]
    corpus[name] = tweets

('reading ', '/Users/sami/Projects/vote2017/data/twitter/fillon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/hamon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/lepen.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/macron.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/melenchon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/valls.csv')


In [51]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Building Dictionaries

Removing rare words (appearing in less than 20 tweets or in more than 50% of the tweets).

In [52]:
from gensim.corpora import Dictionary

dictionaries = {}
for k,v in corpus.items():
    d = Dictionary(v)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    dictionaries[k] = d
                                 

2017-02-06 09:56:09,147 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-06 09:56:09,246 : INFO : built Dictionary(5667 unique tokens: [u'd\xe9sunion', u'leanature', u'travaux', u'accueilli', u'gxnjqlgy4m']...) from 1425 documents (total 21113 corpus positions)
2017-02-06 09:56:09,247 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-06 09:56:09,409 : INFO : built Dictionary(6552 unique tokens: [u'accueille', u'jihad', u'y3oxfjhms3', u'6440691', u's\xe9curit\xe9']...) from 1552 documents (total 24867 corpus positions)
2017-02-06 09:56:09,410 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-06 09:56:09,729 : INFO : adding document #10000 to Dictionary(17817 unique tokens: [u'suivrons', u'ol\xe9oducs', u'grossier', u'ryw787x4mrw', u'barrages']...)
2017-02-06 09:56:09,877 : INFO : built Dictionary(22692 unique tokens: [u'h0erhp', u'hprl9o', u'ryw787x4mrw', u'encouragerait', u'accueille']...) from 14513 documents (total 18304

LDA Fillon:

In [54]:
from gensim.models import LdaModel

name = 'fillon.csv'
#Bag of words representation of the tweets
c = [dictionaries[name].doc2bow(doc) for doc in corpus[name]]

print('Number of unique tokens: %d' % len(dictionaries[name]))
print('Number of documents: %d' % len(corpus[name]))

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionaries[name][0]  # This is only to "load" the dictionary.
id2word = dictionaries[name].id2token

%time model = LdaModel(corpus=c, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

2017-02-06 09:57:13,703 : INFO : using autotuned alpha, starting with [0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001]
2017-02-06 09:57:13,705 : INFO : using autotuned eta, starting with [0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001]
2017-02-06 09:57:13,706 : INFO : using serial LDA version on this node


Number of unique tokens: 13617
Number of documents: 6876


2017-02-06 09:57:14,496 : INFO : running online LDA training, 10 topics, 20 passes over the supplied corpus of 6876 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2017-02-06 09:57:14,498 : INFO : PROGRESS: pass 0, at document #2000/6876
2017-02-06 09:57:18,045 : INFO : optimized alpha [0.074537517701808761, 0.071414194167133183, 0.082401820735239606, 0.088294396622764212, 0.07939354897921512, 0.079880371616848106, 0.068680158678281741, 0.076514707867852902, 0.070857406154000713, 0.06779145151401432]
2017-02-06 09:57:18,048 : INFO : merging changes from 2000 documents into a model of 6876 documents
2017-02-06 09:57:18,071 : INFO : topic #9 (0.068): 0.026*des + 0.018*faut + 0.016*les + 0.016*une + 0.016*nous + 0.011*pas + 0.011*sur + 0.009*pour + 0.008*qui + 0.007*que
2017-02-06 09:57:18,074 : INFO : topic #6 (0.069): 0.021*est + 0.020*com + 0.019*twitter + 0.017*france + 0.015*pic + 0.

CPU times: user 1min 57s, sys: 1.62 s, total: 1min 59s
Wall time: 2min 3s
