### Importing data gathered from twitter

In [1]:
import os
import sys
import glob

data_directory = os.path.join(os.path.dirname(os.getcwd()),'data/twitter')

files = glob.glob(os.path.join(data_directory,'*.csv'))
files

['/Users/sami/Projects/vote2017/data/twitter/fillon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/hamon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/lepen.csv',
 '/Users/sami/Projects/vote2017/data/twitter/macron.csv',
 '/Users/sami/Projects/vote2017/data/twitter/melenchon.csv',
 '/Users/sami/Projects/vote2017/data/twitter/valls.csv']

In [2]:
import pandas as pd
import csv
from gensim import corpora
from gensim.models.word2vec import LineSentence
sys.path.append(os.path.dirname(os.getcwd()))
from utils.preprocessing import preprocess_sentence

corpus = {}

for file in files:
    print('reading ',file)
    df = pd.read_csv(file, sep=';', quoting=csv.QUOTE_ALL)
    text = df.text.values
    tweets = []
    for t in text:
        tweets.append(preprocess_sentence(t))
    name = file.split('/')[-1]
    corpus[name] = tweets

('reading ', '/Users/sami/Projects/vote2017/data/twitter/fillon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/hamon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/lepen.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/macron.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/melenchon.csv')
('reading ', '/Users/sami/Projects/vote2017/data/twitter/valls.csv')


In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Building Dictionaries

Removing rare words (appearing in less than 20 tweets or in more than 50% of the tweets).

In [5]:
from gensim.corpora import Dictionary

dictionaries = {}
for k,v in corpus.items():
    d = Dictionary(v)
    d.filter_extremes(no_below=10, no_above=0.4)
    dictionaries[k] = d
                                 

2017-02-08 11:36:00,747 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-08 11:36:00,826 : INFO : built Dictionary(4905 unique tokens: [u'venez', u'no0lxzm7yy', u'transitions', u'sajdvsqhyf', u'serons']...) from 1425 documents (total 12411 corpus positions)
2017-02-08 11:36:00,847 : INFO : discarding 4735 tokens: [(u'sdentrepreneurs', 1), (u'entrepreneuriat', 3), (u'succ\xe8s', 4), (u'pdbydrwhhq', 1), (u'revenir', 4), (u'twitter', 838), (u'vitalit\xe9', 2), (u'sde2017', 5), (u'vecteur', 1), (u'\u2019\xe9mancipation', 5)]...
2017-02-08 11:36:00,851 : INFO : keeping 170 tokens which were in no less than 10 and no more than 570 (=40.0%) documents
2017-02-08 11:36:00,855 : INFO : resulting dictionary: Dictionary(170 unique tokens: [u'rencontre', u'fran\xe7aise', u'lancement', u'bercy', u'emmanuel']...)
2017-02-08 11:36:00,859 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-02-08 11:36:01,016 : INFO : built Dictionary(5729 unique tokens: [u'accueill

## Building LDA Model

In [6]:
from gensim.models import LdaModel

name = 'fillon.csv'
#Bag of words representation of the tweets
c = [dictionaries[name].doc2bow(doc) for doc in corpus[name]]

print('Number of unique tokens: %d' % len(dictionaries[name]))
print('Number of documents: %d' % len(corpus[name]))

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionaries[name][0]  # This is only to "load" the dictionary.
id2word = dictionaries[name].id2token

%time model = LdaModel(corpus=c, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)


2017-02-08 11:36:04,355 : INFO : using autotuned alpha, starting with [0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001]
2017-02-08 11:36:04,356 : INFO : using serial LDA version on this node
2017-02-08 11:36:04,443 : INFO : running online LDA training, 10 topics, 20 passes over the supplied corpus of 6876 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2017-02-08 11:36:04,445 : INFO : PROGRESS: pass 0, at document #2000/6876


Number of unique tokens: 1123
Number of documents: 6876


2017-02-08 11:36:06,300 : INFO : optimized alpha [0.082515667196944892, 0.088264478432870969, 0.089145711910870798, 0.088626814370193541, 0.086167358469874847, 0.086879056937394991, 0.082804958112576224, 0.08593280085537576, 0.089363666565756034, 0.087942042649295324]
2017-02-08 11:36:06,301 : INFO : merging changes from 2000 documents into a model of 6876 documents
2017-02-08 11:36:06,306 : INFO : topic #0 (0.083): 0.049*"twitter" + 0.029*"l" + 0.029*"’est" + 0.020*"politique" + 0.020*"c" + 0.015*"français" + 0.015*"cards" + 0.012*"fillon" + 0.012*"projet" + 0.011*"fillon2017"
2017-02-08 11:36:06,308 : INFO : topic #6 (0.083): 0.034*"twitter" + 0.025*"faire" + 0.018*"france" + 0.017*"français" + 0.017*"’est" + 0.015*"fillon2017" + 0.014*"c" + 0.013*"europe" + 0.011*"d" + 0.011*"fillonemploi"
2017-02-08 11:36:06,311 : INFO : topic #3 (0.089): 0.062*"twitter" + 0.036*"france" + 0.031*"fillon" + 0.021*"fillon2017" + 0.016*"sommes" + 0.011*"français" + 0.011*"medium" + 0.011*"campaign" + 

CPU times: user 1min 10s, sys: 1.3 s, total: 1min 11s
Wall time: 1min 16s


In [7]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus[name], dictionaries[name])

ValueError: too many values to unpack

## Word2vec model

In [8]:
from gensim.models import Word2Vec

In [10]:
word2vec_macron = Word2Vec(corpus['macron.csv'], min_count=5, iter=50)
word2vec_lepen = Word2Vec(corpus['lepen.csv'], min_count=5, iter=50)
word2vec_melenchon = Word2Vec(corpus['melenchon.csv'], min_count=5, iter=50)
word2vec_fillon = Word2Vec(corpus['fillon.csv'], min_count=5, iter=50)
word2vec_hamon = Word2Vec(corpus['hamon.csv'], min_count=5, iter=50)

2017-02-08 11:55:42,333 : INFO : collecting all words and their counts
2017-02-08 11:55:42,354 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-02-08 11:55:42,377 : INFO : collected 4905 word types from a corpus of 12411 raw words and 1425 sentences
2017-02-08 11:55:42,402 : INFO : Loading a fresh vocabulary
2017-02-08 11:55:42,417 : INFO : min_count=5 retains 455 unique words (9% of original 4905, drops 4450)
2017-02-08 11:55:42,419 : INFO : min_count=5 leaves 6318 word corpus (50% of original 12411, drops 6093)
2017-02-08 11:55:42,423 : INFO : deleting the raw counts dictionary of 4905 items
2017-02-08 11:55:42,426 : INFO : sample=0.001 downsamples 70 most-common words
2017-02-08 11:55:42,429 : INFO : downsampling leaves estimated 4574 word corpus (72.4% of prior 6318)
2017-02-08 11:55:42,431 : INFO : estimated required memory for 455 words and 100 dimensions: 591500 bytes
2017-02-08 11:55:42,440 : INFO : resetting layer weights
2017-02-08 11:55:42,454 

### Similarities

In [15]:
for mot in ['religion', 'entreprise', 'france', 'europe']:
    print('\n')
    print "Mots les plus similaires au mot %s suivant les tweets de Fillon:\n" %mot
    for i,j in enumerate(word2vec_fillon.most_similar(mot)):
        print(str(i+1) + '. ' + j[0])



Mots les plus similaires au mot religion suivant les tweets de Fillon:

1. catholique
2. musulmane
3. exigé
4. ’exiger
5. ghettos
6. menton
7. administratif
8. attaché
9. communautés
10. héros


Mots les plus similaires au mot entreprise suivant les tweets de Fillon:

1. échange
2. production
3. écoles
4. locaux
5. passage
6. industriels
7. chefs
8. établissement
9. court
10. commune


Mots les plus similaires au mot france suivant les tweets de Fillon:

1. osons
2. destin
3. puissance
4. réagir
5. résoudre
6. magnifique
7. réserve
8. patriote
9. hisser
10. menacée


Mots les plus similaires au mot europe suivant les tweets de Fillon:

1. protège
2. naïveté
3. intérêts
4. européenne
5. monnaie
6. franco
7. continent
8. allemande
9. puissance
10. allemagne


In [16]:
for mot in ['religion', 'entreprise', 'france', 'europe']:
    print('\n')
    print "Mots les plus similaires au mot %s suivant les tweets de Lepen:\n" %mot
    for i,j in enumerate(word2vec_lepen.most_similar(mot)):
        print(str(i+1) + '. ' + j[0])

2017-02-08 12:00:04,625 : INFO : precomputing L2-norms of word weight vectors




Mots les plus similaires au mot religion suivant les tweets de Lepen:

1. minorité
2. combattons
3. religieuses
4. terreau
5. imams
6. égard
7. artificielle
8. égypte
9. combats
10. capables


Mots les plus similaires au mot entreprise suivant les tweets de Lepen:

1. protections
2. airfrance
3. industrie
4. livrée
5. abandon
6. intelligent
7. renouer
8. produire
9. entrepreneurs
10. salondelagriculture


Mots les plus similaires au mot france suivant les tweets de Lepen:

1. éternelle
2. fière
3. signifie
4. jamais
5. souveraine
6. horreur
7. grandeur
8. indépendante
9. transmission
10. générations


Mots les plus similaires au mot europe suivant les tweets de Lepen:

1. nations
2. ariane
3. coopération
4. airbus
5. vague
6. saine
7. souhaitons
8. peuples
9. souveraines
10. concert
