In [5]:
import sys
import csv
import urllib
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import seaborn
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDiA

### Putting together the three candidates' speeches.

In [187]:
meade_df = pd.read_csv("meade.csv",encoding='UTF-16')
#Outcome = 1 if amlo, 0 ioc
meade_df["outcome"] = 0
del meade_df['date']


anaya_df = pd.read_csv("anaya_bis.csv",encoding='UTF-16')
anaya_df.columns = ['speech']
anaya_df["outcome"] = 0

amlo_df = pd.read_csv("amlo_bis.csv",encoding='UTF-16')
del amlo_df["date"]
amlo_df.columns = ['speech']
amlo_df["outcome"] = 1

frames = [meade_df, anaya_df, amlo_df]

#Merging
speeches = pd.concat(frames)
#speeches.to_csv('speeches.csv',  encoding='UTF-16', index=False)
speeches.head()

Unnamed: 0,speech,outcome
0,buenas tardes querétaro buenas tardes parte di...,0
1,buenas tardes buenas tardes carmelitas buenas ...,0
2,muchas gracias muchas gracias recibirme campec...,0
3,buenos días chalco cómo echan ganas días hace ...,0
4,buenas tardes fresnillo cómo entusiasmo fresni...,0


In [188]:
#sketchy code to calculate summary stats. 
texts = amlo_df.speech
round(sum([len(t.split()) for t in texts]) * 1. / len(texts))

603

In [189]:
len(anaya_df)

127

## PCA

In [190]:
#PCA analysis. Slightly adapting the code from the class' distributed notebooks.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=speeches.speech).toarray()
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape

(469, 22508)

In [191]:
speeches.outcome.sum()

162

In [192]:
pca = PCA(n_components=10)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, 
                                 columns=columns)
pca_topic_vectors.round(3).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,0.225,-0.056,0.114,-0.007,-0.023,-0.078,-0.025,-0.027,0.008,-0.024
1,0.201,-0.111,0.16,0.037,-0.016,-0.071,0.071,-0.121,-0.063,-0.019
2,0.225,-0.033,0.15,0.033,-0.031,-0.069,-0.0,-0.072,-0.062,-0.021
3,0.306,-0.085,0.301,-0.057,0.063,-0.172,-0.033,-0.079,-0.101,-0.024
4,0.176,-0.109,0.152,0.02,-0.073,0.012,0.048,-0.106,0.05,-0.008


In [193]:
svd = TruncatedSVD(n_components=10, n_iter=300)  
svd_topic_vectors = svd.fit_transform(tfidf_docs)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, 
                                 columns=columns)
svd_topic_vectors.round(2).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,0.23,-0.06,0.11,-0.0,-0.02,-0.08,-0.03,-0.03,0.02,0.02
1,0.2,-0.11,0.16,0.04,-0.02,-0.07,0.07,-0.13,-0.05,0.02
2,0.22,-0.03,0.15,0.03,-0.03,-0.07,0.01,-0.07,-0.06,-0.02
3,0.31,-0.09,0.3,-0.06,0.06,-0.18,-0.04,-0.08,-0.1,-0.01
4,0.18,-0.11,0.15,0.02,-0.08,0.01,0.06,-0.11,0.04,-0.05
5,0.32,0.12,0.06,-0.06,0.1,0.04,-0.09,0.11,0.02,0.06


In [194]:
svd_topic_vectors = (svd_topic_vectors.T / 
                     np.linalg.norm(svd_topic_vectors, axis=1)).T
#Meade
svd_topic_vectors.iloc[:6].dot(svd_topic_vectors.iloc[:6].T).round(3)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.842,0.918,0.894,0.791,0.605
1,0.842,1.0,0.92,0.852,0.882,0.242
2,0.918,0.92,1.0,0.915,0.826,0.476
3,0.894,0.852,0.915,1.0,0.693,0.504
4,0.791,0.882,0.826,0.693,1.0,0.203
5,0.605,0.242,0.476,0.504,0.203,1.0


In [195]:
#AMLO
svd_topic_vectors.iloc[-6:].dot(svd_topic_vectors.iloc[-6:].T).round(3)

Unnamed: 0,463,464,465,466,467,468
463,1.0,0.7,0.873,0.855,0.824,0.943
464,0.7,1.0,0.807,0.867,0.733,0.755
465,0.873,0.807,1.0,0.823,0.798,0.874
466,0.855,0.867,0.823,1.0,0.945,0.92
467,0.824,0.733,0.798,0.945,1.0,0.941
468,0.943,0.755,0.874,0.92,0.941,1.0


In [196]:
#Anaya
svd_topic_vectors.iloc[200:206].dot(svd_topic_vectors.iloc[200:206].T).round(3)

Unnamed: 0,200,201,202,203,204,205
200,1.0,0.615,0.415,0.414,0.229,0.688
201,0.615,1.0,0.233,0.543,0.153,0.747
202,0.415,0.233,1.0,0.859,0.909,0.323
203,0.414,0.543,0.859,1.0,0.878,0.472
204,0.229,0.153,0.909,0.878,1.0,0.137
205,0.688,0.747,0.323,0.472,0.137,1.0


## LDA

In [197]:
pca10_topic_vectors = pca.fit_transform(tfidf_docs)


X_train, X_test, y_train, y_test = train_test_split(pca10_topic_vectors,
                                                    speeches.outcome.astype(int),
                                                    test_size=0.5, 
                                                    random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
speeches['pca10_outcome'] = lda.predict(pca10_topic_vectors)
round(float(lda.score(X_test, y_test)), 3)

0.987

## LDiA

In [198]:
# Average number of words per speech
texts = speeches.speech
sum([len(t.split()) for t in texts]) * 1. / len(texts)

687.9914712153518

In [199]:
counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(counter.fit_transform(raw_documents=speeches.speech).toarray())
column_nums, terms = zip(*sorted(zip(counter.vocabulary_.values(),
                                     counter.vocabulary_.keys())))
bow_docs.columns = terms

In [203]:
ldia = LDiA(n_components=10, learning_method='batch')
ldia = ldia.fit(bow_docs)

components = pd.DataFrame(ldia.components_.T, index=terms, columns=columns)
components.round(2).head(3)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,0.1,0.1,0.1,1.1,0.1,0.1,0.1,0.1,0.1,0.1
0,2.53,0.1,0.1,0.1,0.1,0.1,0.1,0.67,1.1,0.1
2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,1.1,0.1,0.1


In [228]:
components.topic8.sort_values(ascending=False)[:8]

vamos       2643.004944
ganar       1243.010529
méxico       820.460790
aquí         810.582361
mujeres      799.938056
queremos     663.260153
viva         550.392652
hoy          500.096054
Name: topic8, dtype: float64

In [221]:
ldia10_topic_vectors = ldia.transform(bow_docs)
ldia10_topic_vectors = pd.DataFrame(ldia10_topic_vectors, columns=columns)
#ldia10_topic_vectors.round(2).head()
#which candidate. Currently: anaya
ldia10_topic_vectors.round(2).iloc[-5:]

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
464,0.0,0.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12
465,0.0,0.76,0.0,0.0,0.0,0.05,0.0,0.05,0.03,0.12
466,0.84,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
467,0.35,0.48,0.11,0.0,0.0,0.0,0.0,0.06,0.0,0.0
468,0.65,0.12,0.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [229]:
X_train, X_test, y_train, y_test = train_test_split(ldia10_topic_vectors, speeches.outcome,
    test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
speeches['ldia10_outcome'] = lda.predict(ldia10_topic_vectors)
round(float(lda.score(X_test, y_test)), 3)



0.919

In [230]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_docs, speeches.outcome, test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
round(float(lda.score(X_train, y_train)), 3)



0.376

In [231]:
round(float(lda.score(X_test, y_test)), 3)

0.506

In [232]:
ldia32 = LDiA(n_components=32, learning_method='batch')
ldia32 = ldia32.fit(bow_docs)
ldia32.components_.shape

(32, 22508)

In [233]:
ldia32_topic_vectors = ldia32.transform(bow_docs)
columns32 = ['topic{}'.format(i) for i in range(ldia32.n_components)]
ldia32_topic_vectors = pd.DataFrame(ldia32_topic_vectors, columns=columns32)
ldia32_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic22,topic23,topic24,topic25,topic26,topic27,topic28,topic29,topic30,topic31
0,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.78,0.0,0.0,0.0,0.0,...,0.0,0.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.67,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [234]:
X_train, X_test, y_train, y_test = train_test_split(ldia32_topic_vectors, speeches.outcome,
    test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
speeches['ldia32_outcome'] = lda.predict(ldia32_topic_vectors)
X_train.shape



(234, 32)

In [235]:
round(float(lda.score(X_train, y_train)), 3)

0.991

In [236]:
round(float(lda.score(X_test, y_test)), 3)

0.953