## Imports & Stuff

In [None]:
import pandas as pd
## pandas 1.1.1 necessary
import numpy as np
import string 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from gensim.models import Word2Vec
from plotly.graph_objects import Scatter3d

In [None]:
! pwd

In [None]:
data = pd.read_pickle('../../delphes/data/extended_tweet_df')
data

In [None]:
test_data = data[data['country'] == 'Ireland']

## Preprocessing

In [None]:
# Remove the undesirable elements in the entire dataframe
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

In [None]:
# Lowercase the tweet's column
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [None]:
# Remove the numbers in the tweet's column
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [None]:
# Remove the undesirable punctuations in the tweet's column
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [None]:
# Remove the undesirable emojis in the entire dataframe
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function : 
    it also removes cyrillic alphabet
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [None]:
test_data = rmurl_df(test_data, 'content')
test_data = lower_df(test_data, 'content')
test_data = rmnumbers_df(test_data, 'content')
test_data = rmpunct_df(test_data, 'content')
test_data = rmemojis_df(test_data)

In [None]:
test_data = test_data.reset_index()

In [None]:
test_data["index"][0]

## Word2Vec 

In [None]:
sentences = list(test_data['content'])

In [None]:
sentences_train = []
for sentence in sentences:
    sentences_train.append(sentence.split())

## TF-IDF

In [None]:
def word2vec_embedding(sentences,size,min_count):
    ### build word embedding for each reviews
    word2vec_model = Word2Vec(sentences, size=size, min_count=min_count)
    return word2vec_model

In [None]:
def weight_of_words(text):
    #takes text and returns a dictionary with the word and its weight
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(text)
    weight_of_words = dict(zip(X.get_feature_names(),X.idf_))
    return weight_of_words

In [None]:
weight_of_words(sentences)

In [None]:
tweet_vecs = []
def weighted_words(text, weight_of_words, word2vec):
    #returns a list of n_tweets dimensions containing a (1,100) vector for each word
    check = 0
    for sentence in text:
        divided_sentence = sentence.split(' ')
        new_vecs = []
        for word in divided_sentence: 
            if word in weight_of_words.keys():
                pondered = word2vec.wv.get_vector(word).reshape(1, 100)
                check += 1
                print(check)
                pondered_word = pondered*weight_of_words[word]
                new_vecs.append(pondered_word)
            else:
                continue
        tweet_vecs.append(new_vecs)
    return tweet_vecs

In [None]:
tryout = weighted_words(sentences, weight_of_words(sentences),word2vec_embedding(sentences=sentences_train, size=100, min_count=1))

In [None]:
tryout

In [None]:
tweets = []
def weight_tweets(tweet_weighted_vecs):
    #returns a single (1,100) vector per tweet
    for i in range (0,3000):
        result = 0
        if len(tryout[i]) != 0:
            result = sum(tryout[i])/len(tryout[i])
            tweets.append(result.tolist())
        else: 
            tweets.append(0)
    return tweets

In [None]:
tweets = weight_tweets(tryout)

In [None]:
tweets

In [None]:
dict_test = {}

def df_making(tweets):
    #returns a DataFrame of n_tweets rows and 100 columns
    count = 0
    for i in range (0,len(tweets)):
        if tweets[i] != 0:
            dict_test[test_data["index"][i]] = tweets[i][0]
    return pd.DataFrame.from_dict(dict_test).transpose()

In [None]:
dict_test = df_making(tweets)

## PCA & T-NSE

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly

In [164]:
pca = PCA(n_components=50)
pca_result = pca.fit_transform(dict_test.values)
pca_result.shape

(2967, 50)

In [165]:
len(pca_result)

2967

In [166]:
count

0

In [169]:
testing_tsv = []
for i in pca_result:
    count = 0
    testing_tsv.append(str(i[0])+"\t"+str(i[1])+"\t"+str(i[2]))

In [175]:
import csv

with open('testing', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(testing_tsv)

In [176]:
! pwd

/Users/simonpastor/code/simonjpastor/delphes/notebooks/simonjpastor_notebooks


In [112]:
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=10000)
tsne_results = tsne.fit_transform(pca_result)
tsne_results

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 2967 samples in 0.004s...
[t-SNE] Computed neighbors for 2967 samples in 0.090s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2967
[t-SNE] Computed conditional probabilities for sample 2000 / 2967
[t-SNE] Computed conditional probabilities for sample 2967 / 2967
[t-SNE] Mean sigma: 0.034864
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.007820
[t-SNE] KL divergence after 7500 iterations: 0.172223


array([[ 16.653126  ,  -8.127023  ,   0.70474756],
       [  1.116931  ,   0.2472599 , -19.94199   ],
       [ 16.72766   ,  -8.195616  ,   0.8061663 ],
       ...,
       [ 15.569933  ,   4.562419  , -10.534881  ],
       [-20.679482  ,   3.9627528 ,  -7.376375  ],
       [ -8.296525  ,  21.443062  ,   4.93316   ]], dtype=float32)

In [113]:
pca_result = pd.DataFrame(pca_result)

In [114]:
pca_result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.433197,-0.020252,-0.007582,-0.003797,0.003648,0.004501,0.001970,-0.003147,-0.002557,-0.000254,...,0.000820,0.002183,0.000768,0.001187,-0.000899,-0.000227,0.001048,-0.002182,-0.004430,0.003094
1,-1.120731,-0.011187,-0.008614,0.004107,0.001833,-0.004029,-0.003398,0.000100,-0.002017,0.000124,...,-0.002645,0.004449,0.001063,-0.000921,0.001468,-0.000813,-0.001393,0.000724,-0.003644,0.006771
2,0.433007,-0.013216,-0.009399,-0.002638,-0.004493,0.002491,-0.000203,0.001121,-0.004414,0.002154,...,-0.002635,0.004943,-0.002056,-0.002047,0.001063,0.001806,-0.000215,0.005504,0.001341,0.003657
3,-1.111715,0.001572,-0.004743,-0.006394,0.002231,-0.003705,0.001309,0.004343,-0.006659,0.005946,...,0.002466,-0.004919,-0.001927,-0.002786,0.000422,0.001153,0.002278,0.000367,0.002087,-0.003890
4,0.074074,-0.013011,-0.001015,-0.004771,-0.004664,0.000792,-0.007211,0.000953,-0.000624,0.003813,...,-0.002098,0.000286,-0.002836,-0.004078,0.002359,-0.007742,0.000774,0.002539,-0.004414,-0.002087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,1.317712,-0.007067,-0.003612,0.014972,0.001972,0.002922,0.003981,-0.000394,-0.002698,-0.016034,...,-0.009819,0.000134,0.000151,0.001656,-0.010225,-0.008848,0.002101,-0.004574,0.003963,-0.013033
2963,1.041101,0.007117,0.018033,-0.000762,-0.000689,0.000563,0.011010,-0.003132,-0.003426,-0.000373,...,0.002851,0.000542,-0.000438,-0.001420,-0.002342,-0.006471,-0.001170,0.000697,0.000669,-0.000194
2964,-1.544167,0.026296,-0.010322,0.002218,-0.001398,0.003074,-0.000924,-0.001580,0.005345,-0.002951,...,-0.003227,0.000992,-0.000890,0.001128,-0.002065,-0.005606,-0.000823,0.002335,0.002715,0.001184
2965,1.725479,0.004506,-0.003133,-0.006579,-0.002712,0.001222,0.003180,-0.006764,-0.008394,0.003654,...,0.002758,0.000945,0.001836,0.004506,-0.004094,0.000415,0.001170,0.001152,0.000574,0.001250


In [115]:
pca.explained_variance_ratio_

array([9.99401063e-01, 1.28178360e-04, 2.81078891e-05, 2.10577240e-05,
       1.51165426e-05, 1.23243909e-05, 1.19815348e-05, 1.11553450e-05,
       1.06719326e-05, 9.94023913e-06, 9.20145214e-06, 8.90941849e-06,
       8.79755510e-06, 8.20473545e-06, 7.99494581e-06, 7.72762178e-06,
       7.47027837e-06, 7.24272677e-06, 6.85539961e-06, 6.61684309e-06,
       6.53115805e-06, 6.35013616e-06, 6.18804387e-06, 6.08701233e-06,
       5.98978896e-06, 5.76057882e-06, 5.61028791e-06, 5.47597936e-06,
       5.41181402e-06, 5.34499710e-06, 5.24470587e-06, 5.04983207e-06,
       4.95229556e-06, 4.77977832e-06, 4.71729427e-06, 4.70539215e-06,
       4.56790264e-06, 4.51091149e-06, 4.39027249e-06, 4.24139850e-06,
       4.13811215e-06, 4.02638332e-06, 3.98183708e-06, 3.84489775e-06,
       3.83390058e-06, 3.69008667e-06, 3.63984817e-06, 3.62321893e-06,
       3.56545924e-06, 3.48277697e-06])

In [116]:
dict_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
24722,0.343130,1.821136,0.729091,-0.780753,1.117681,-0.642696,0.883891,0.063969,-2.401691,0.182722,...,-0.629596,2.630774,1.327676,1.567115,0.275329,1.240702,-1.350635,-0.645492,-0.328513,0.887045
24723,0.386416,2.059793,0.823250,-0.882285,1.266376,-0.727843,1.000658,0.067937,-2.715657,0.206082,...,-0.713558,2.974562,1.495788,1.776626,0.312295,1.406074,-1.522638,-0.735219,-0.369524,1.004168
24724,0.344552,1.820474,0.728089,-0.772024,1.119067,-0.644256,0.884932,0.063954,-2.407375,0.182321,...,-0.630183,2.628993,1.324182,1.567837,0.270476,1.245109,-1.346104,-0.643299,-0.331628,0.886999
24725,0.383577,2.060384,0.829464,-0.877885,1.268648,-0.724785,0.994885,0.070632,-2.715005,0.211946,...,-0.722032,2.982672,1.506734,1.768861,0.307374,1.412243,-1.515686,-0.731875,-0.366875,0.999074
24726,0.351088,1.881338,0.751106,-0.800052,1.155948,-0.668317,0.909337,0.064088,-2.470943,0.183856,...,-0.651388,2.710820,1.367898,1.617690,0.278425,1.277830,-1.390281,-0.668665,-0.339098,0.908579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130490,0.321501,1.679850,0.668677,-0.722021,1.035140,-0.586261,0.817115,0.056374,-2.212994,0.174938,...,-0.595370,2.432443,1.221510,1.456535,0.249966,1.153004,-1.251858,-0.605326,-0.293158,0.817627
130491,0.327473,1.726129,0.691692,-0.735359,1.060710,-0.607831,0.830652,0.054806,-2.276692,0.177671,...,-0.602083,2.502090,1.253732,1.492340,0.260687,1.184348,-1.277026,-0.623399,-0.301848,0.836355
130492,0.403128,2.125588,0.853538,-0.909636,1.306041,-0.743381,1.025209,0.068211,-2.799548,0.214072,...,-0.740218,3.081211,1.551738,1.825408,0.321757,1.453886,-1.564517,-0.767486,-0.375927,1.034215
130493,0.301160,1.621303,0.652417,-0.693043,1.001455,-0.572017,0.788758,0.056645,-2.138397,0.162255,...,-0.565378,2.345572,1.184679,1.393649,0.242649,1.110049,-1.201148,-0.582138,-0.289782,0.789958


In [117]:
dict_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
24722,0.343130,1.821136,0.729091,-0.780753,1.117681,-0.642696,0.883891,0.063969,-2.401691,0.182722,...,-0.629596,2.630774,1.327676,1.567115,0.275329,1.240702,-1.350635,-0.645492,-0.328513,0.887045
24723,0.386416,2.059793,0.823250,-0.882285,1.266376,-0.727843,1.000658,0.067937,-2.715657,0.206082,...,-0.713558,2.974562,1.495788,1.776626,0.312295,1.406074,-1.522638,-0.735219,-0.369524,1.004168
24724,0.344552,1.820474,0.728089,-0.772024,1.119067,-0.644256,0.884932,0.063954,-2.407375,0.182321,...,-0.630183,2.628993,1.324182,1.567837,0.270476,1.245109,-1.346104,-0.643299,-0.331628,0.886999
24725,0.383577,2.060384,0.829464,-0.877885,1.268648,-0.724785,0.994885,0.070632,-2.715005,0.211946,...,-0.722032,2.982672,1.506734,1.768861,0.307374,1.412243,-1.515686,-0.731875,-0.366875,0.999074
24726,0.351088,1.881338,0.751106,-0.800052,1.155948,-0.668317,0.909337,0.064088,-2.470943,0.183856,...,-0.651388,2.710820,1.367898,1.617690,0.278425,1.277830,-1.390281,-0.668665,-0.339098,0.908579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130490,0.321501,1.679850,0.668677,-0.722021,1.035140,-0.586261,0.817115,0.056374,-2.212994,0.174938,...,-0.595370,2.432443,1.221510,1.456535,0.249966,1.153004,-1.251858,-0.605326,-0.293158,0.817627
130491,0.327473,1.726129,0.691692,-0.735359,1.060710,-0.607831,0.830652,0.054806,-2.276692,0.177671,...,-0.602083,2.502090,1.253732,1.492340,0.260687,1.184348,-1.277026,-0.623399,-0.301848,0.836355
130492,0.403128,2.125588,0.853538,-0.909636,1.306041,-0.743381,1.025209,0.068211,-2.799548,0.214072,...,-0.740218,3.081211,1.551738,1.825408,0.321757,1.453886,-1.564517,-0.767486,-0.375927,1.034215
130493,0.301160,1.621303,0.652417,-0.693043,1.001455,-0.572017,0.788758,0.056645,-2.138397,0.162255,...,-0.565378,2.345572,1.184679,1.393649,0.242649,1.110049,-1.201148,-0.582138,-0.289782,0.789958


In [118]:
dict_test = dict_test.reset_index()

In [119]:
new_data = test_data.merge(dict_test, on="index")

In [120]:
! pwd

/Users/simonpastor/code/simonjpastor/delphes/notebooks/simonjpastor_notebooks


In [121]:
new_data.to_csv("visu.tsv",sep='\t')

In [122]:
x_coord = []
y_coord = []
z_coord = []
for i in tsne_results:
    x_coord.append(i[0])
    y_coord.append(i[1])
    z_coord.append(i[2])
new_data["x"] = x_coord
new_data["y"] = y_coord
new_data["z"] = z_coord

In [123]:
def color(values):
    if values == 'Fine Gael Party':
        return 0
    elif values == "Green Party":
        return 1
    elif values == "Independent":
        return 2
    elif values == "Sinn Fin":
        return 3
    elif values == "Independents for change":
        return 4
    else:
        return 5

In [124]:
new_data["nat_group"] = new_data["nat_group"].apply(color)

In [125]:
new_data["nat_group"].unique()

array([0, 1, 2, 3, 4])

## Plotting

In [126]:
fig = px.scatter_3d(new_data, x="x", y="y", z="z",color="nat_group")
fig.show()

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})

fig = plt.figure(figsize=(6,6))

ax = Axes3D(fig)

x = x_coord
y = y_coord
z = z_coord


g = ax.scatter(x, y, z, marker='o', depthshade=False, cmap='Paired')

plt.show()

In [None]:
pca_result

## Color