## Imports & Stuff

In [1]:
import pandas as pd
## pandas 1.1.1 necessary
import numpy as np
import string 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from gensim.models import Word2Vec
# from plotly.graph_objects import Scatter3d

In [2]:
df = pd.read_pickle('../../delphes/data/cleaned_tweet_df')

In [3]:
df

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content,age,sex
0,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,thank much free media independent authorities ...,47,1.0
1,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,the commission adopted major pilot projects fu...,47,1.0
2,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,the commission adopted authored pilot projects...,47,1.0
3,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,day point hour asking one empty slogans unders...,47,1.0
4,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,god need defended anyone want name used terror...,47,1.0
...,...,...,...,...,...,...,...,...,...
134719,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,sterreichische Volkspartei,AngelikaWinzig,italy budget policy endangers italy also europ...,63,1.0
134720,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,sterreichische Volkspartei,AngelikaWinzig,thank martin organizing summer talk attersee e...,63,1.0
134721,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,sterreichische Volkspartei,AngelikaWinzig,tradition amp modern top companies network mee...,63,1.0
134722,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,sterreichische Volkspartei,AngelikaWinzig,here statement budget speech finance minister ...,63,1.0


## Preprocessing

In [4]:
# Remove the undesirable elements in the entire dataframe
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

In [5]:
# Lowercase the tweet's column
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [6]:
# Remove the numbers in the tweet's column
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [7]:
# Remove the undesirable punctuations in the tweet's column
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [8]:
# Remove the undesirable emojis in the entire dataframe
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function : 
    it also removes cyrillic alphabet
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [9]:
nw_df = rmurl_df(df, 'content')
nw_df = lower_df(nw_df, 'content')
nw_df = rmnumbers_df(nw_df, 'content')
nw_df = rmpunct_df(nw_df, 'content')
nw_df = rmemojis_df(nw_df)

In [10]:
nw_df_test = nw_df[:3000]

## Word2Vec 

In [11]:
sentences = list(nw_df_test['content'])

In [12]:
sentences_train = []
for sentence in sentences:
    sentences_train.append(sentence.split())

## TF-IDF

In [13]:
def word2vec_embedding(sentences,size,min_count):
    ### build word embedding for each reviews
    word2vec_model = Word2Vec(sentences, size=size, min_count=min_count)
    return word2vec_model

In [14]:
def weight_of_words(text):
    #takes text and returns a dictionary with the word and its weight
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(text)
    weight_of_words = dict(zip(X.get_feature_names(),X.idf_))
    return weight_of_words

In [15]:
weight_of_words(sentences)

{'abandoned': 8.313553664880422,
 'abbreviation': 8.313553664880422,
 'abhorred': 8.313553664880422,
 'abiding': 8.313553664880422,
 'abilities': 8.313553664880422,
 'ability': 7.908088556772258,
 'abka': 8.313553664880422,
 'able': 5.540964942640641,
 'abnormal': 8.313553664880422,
 'abolished': 7.908088556772258,
 'abolition': 8.313553664880422,
 'abortion': 7.908088556772258,
 'about': 6.367643515825108,
 'abroad': 6.704115752446321,
 'absent': 8.313553664880422,
 'absenteeism': 8.313553664880422,
 'absolute': 7.620406484320476,
 'absolutely': 8.313553664880422,
 'abstained': 8.313553664880422,
 'absurd': 7.2149413762123125,
 'absurdities': 8.313553664880422,
 'absurdity': 8.313553664880422,
 'abuse': 7.2149413762123125,
 'ac': 8.313553664880422,
 'academy': 8.313553664880422,
 'accelerating': 7.908088556772258,
 'accent': 8.313553664880422,
 'accents': 8.313553664880422,
 'accept': 6.809476268104148,
 'acceptance': 8.313553664880422,
 'accepted': 7.620406484320476,
 'accepting': 7.

In [16]:
tweet_vecs = []
def weighted_words(text, weight_of_words, word2vec):
    #returns a list of n_tweets dimensions containing a (1,100) vector for each word
    check = 0
    for sentence in text:
        divided_sentence = sentence.split(' ')
        new_vecs = []
        for word in divided_sentence: 
            if word in weight_of_words.keys():
                pondered = word2vec.wv.get_vector(word).reshape(1, 100)
                check += 1
                pondered_word = pondered*weight_of_words[word]
                new_vecs.append(pondered_word)
            else:
                continue
        tweet_vecs.append(new_vecs)
    return tweet_vecs

In [17]:
tryout = weighted_words(sentences, weight_of_words(sentences),word2vec_embedding(sentences=sentences_train, size=100, min_count=1))

In [None]:
tryout

In [22]:
print("posadfgnaposdf")

posadfgnaposdf


In [18]:
tweets = []
def weight_tweets(tweet_weighted_vecs):
    #returns a single (1,100) vector per tweet
    for i in range (0,3000):
        result = 0
        if len(tryout[i]) != 0:
            result = sum(tryout[i])/len(tryout[i])
            tweets.append(result.tolist())
        else: 
            tweets.append(0)
    return tweets

In [19]:
tweets = weight_tweets(tryout)

In [None]:
tweets

In [20]:
dict_test = {}

def df_making(tweets):
    #returns a DataFrame of n_tweets rows and 100 columns
    count = 0
    for i in range (0,len(tweets)):
        if tweets[i] != 0:
            dict_test[nw_df_test["index"][i]] = tweets[i][0]
    return pd.DataFrame.from_dict(dict_test).transpose()

In [21]:
dict_test = df_making(tweets)

KeyError: 'index'

## PCA & T-NSE

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly

In [None]:
pca = PCA(n_components=50)
pca_result = pca.fit_transform(dict_test.values)
pca_result.shape

In [None]:
len(pca_result)

In [None]:
count

In [None]:
testing_tsv = []
for i in pca_result:
    count = 0
    testing_tsv.append(str(i[0])+"\t"+str(i[1])+"\t"+str(i[2]))

In [None]:
import csv

with open('testing', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(testing_tsv)

In [None]:
! pwd

In [None]:
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=10000)
tsne_results = tsne.fit_transform(pca_result)
tsne_results

In [None]:
pca_result = pd.DataFrame(pca_result)

In [None]:
pca_result

In [None]:
pca.explained_variance_ratio_

In [None]:
dict_test

In [None]:
dict_test

In [None]:
dict_test = dict_test.reset_index()

In [None]:
new_data = test_data.merge(dict_test, on="index")

In [None]:
! pwd

In [None]:
new_data.to_csv("visu.tsv",sep='\t')

In [None]:
x_coord = []
y_coord = []
z_coord = []
for i in tsne_results:
    x_coord.append(i[0])
    y_coord.append(i[1])
    z_coord.append(i[2])
new_data["x"] = x_coord
new_data["y"] = y_coord
new_data["z"] = z_coord

In [None]:
def color(values):
    if values == 'Fine Gael Party':
        return 0
    elif values == "Green Party":
        return 1
    elif values == "Independent":
        return 2
    elif values == "Sinn Fin":
        return 3
    elif values == "Independents for change":
        return 4
    else:
        return 5

In [None]:
new_data["nat_group"] = new_data["nat_group"].apply(color)

In [None]:
new_data["nat_group"].unique()

## Plotting

In [None]:
fig = px.scatter_3d(new_data, x="x", y="y", z="z",color="nat_group")
fig.show()

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})

fig = plt.figure(figsize=(6,6))

ax = Axes3D(fig)

x = x_coord
y = y_coord
z = z_coord


g = ax.scatter(x, y, z, marker='o', depthshade=False, cmap='Paired')

plt.show()

In [None]:
pca_result

## Color