# Data Engineering II Final Project

### 3.1 The Similarity Search
The students will use word embedding models to facilitate similarity searches. I.e: the word embeddings of the search string are compared with those of the available tweets (using which ever distance algorithm the students choose, like eucledian distance for example) and the top 20 similar tweets are chosen.
The students are free to choose whichever word embedding model they choose, like Fasttext, Doc2Vec, Word2Vec…
**note: a reminder to handle all the cleaning and pre-processing of the text.


In [27]:
import pandas as pd

In [28]:
df = pd.read_csv("../data/tweets.csv")
df

Unnamed: 0.1,Unnamed: 0,date,id,link,retweet,text,author
0,0,Oct 7,784609194234306560,/realDonaldTrump/status/784609194234306560,False,Here is my statement.pic.twitter.com/WAZiGoQqMQ,DonaldTrump
1,1,Oct 10,785608815962099712,/realDonaldTrump/status/785608815962099712,False,Is this really America? Terrible!pic.twitter.c...,DonaldTrump
2,2,Oct 8,784840992734064640,/realDonaldTrump/status/784840992734064641,False,The media and establishment want me out of the...,DonaldTrump
3,3,Oct 8,784767399442653184,/realDonaldTrump/status/784767399442653184,False,Certainly has been an interesting 24 hours!,DonaldTrump
4,4,Oct 10,785561269571026944,/realDonaldTrump/status/785561269571026946,False,Debate polls look great - thank you!\n#MAGA #A...,DonaldTrump
...,...,...,...,...,...,...,...
17211,17211,12 May 2009,1773561338,/realDonaldTrump/status/1773561338,False,"""My persona will never be that of a wallflower...",DonaldTrump
17212,17212,8 May 2009,1741160716,/realDonaldTrump/status/1741160716,False,New Blog Post: Celebrity Apprentice Finale and...,DonaldTrump
17213,17213,8 May 2009,1737479987,/realDonaldTrump/status/1737479987,False,Donald Trump reads Top Ten Financial Tips on L...,DonaldTrump
17214,17214,4 May 2009,1701461182,/realDonaldTrump/status/1701461182,False,Donald Trump will be appearing on The View tom...,DonaldTrump


In [29]:
df_clear = df.drop(columns=["Unnamed: 0","date","id","retweet","author"])
df_clear

Unnamed: 0,link,text
0,/realDonaldTrump/status/784609194234306560,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,/realDonaldTrump/status/785608815962099712,Is this really America? Terrible!pic.twitter.c...
2,/realDonaldTrump/status/784840992734064641,The media and establishment want me out of the...
3,/realDonaldTrump/status/784767399442653184,Certainly has been an interesting 24 hours!
4,/realDonaldTrump/status/785561269571026946,Debate polls look great - thank you!\n#MAGA #A...
...,...,...
17211,/realDonaldTrump/status/1773561338,"""My persona will never be that of a wallflower..."
17212,/realDonaldTrump/status/1741160716,New Blog Post: Celebrity Apprentice Finale and...
17213,/realDonaldTrump/status/1737479987,Donald Trump reads Top Ten Financial Tips on L...
17214,/realDonaldTrump/status/1701461182,Donald Trump will be appearing on The View tom...


In [30]:
df_clear = df_clear.dropna()
df_clear

Unnamed: 0,link,text
0,/realDonaldTrump/status/784609194234306560,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,/realDonaldTrump/status/785608815962099712,Is this really America? Terrible!pic.twitter.c...
2,/realDonaldTrump/status/784840992734064641,The media and establishment want me out of the...
3,/realDonaldTrump/status/784767399442653184,Certainly has been an interesting 24 hours!
4,/realDonaldTrump/status/785561269571026946,Debate polls look great - thank you!\n#MAGA #A...
...,...,...
17211,/realDonaldTrump/status/1773561338,"""My persona will never be that of a wallflower..."
17212,/realDonaldTrump/status/1741160716,New Blog Post: Celebrity Apprentice Finale and...
17213,/realDonaldTrump/status/1737479987,Donald Trump reads Top Ten Financial Tips on L...
17214,/realDonaldTrump/status/1701461182,Donald Trump will be appearing on The View tom...


In [31]:
import string
import re 

def clean(data):
    data_clean = re.sub(r"\d+", "", data)
    data_clean = re.sub('pictwitter',  ' ', data_clean)
    data_clean = re.sub('\n',  ' ', data_clean)
    data_clean = data_clean.lower()
    data_clean = data_clean.translate(str.maketrans(' ', ' ', string.punctuation))
    data_clean = data_clean.strip()
    data_clean = re.sub('pictwitter',  ' ', data_clean)
    data_clean = re.sub('\xa0',  ' ', data_clean)
    return data_clean



In [32]:
df_clear2 = [clean(x) for x in df_clear['text']]

In [33]:
df_clear2

['here is my statement comwazigoqqmq',
 'is this really america terrible comwiwcpifu',
 'the media and establishment want me out of the race so badly   i will never drop out of the race will never let my supporters down maga',
 'certainly has been an interesting  hours',
 'debate polls look great  thank you maga americafirst compeqsswdz',
 'what they are saying about the clinton campaign’s anticatholic bigotry  httpbitlydcbtvkcrooked',
 'thank you maga americafirst comfgwjlkm',
 'i will be in cincinnati ohio tomorrow night at pm join me ohiovotesearly votetrumppence  tickets httpswwwdonaldjtrumpcomscheduleregistercincinnatioh … comxufugcfg',
 'very little pickup by the dishonest media of incredible information provided by wikileaks so dishonest rigged system',
 'thank you florida a movement that has never been seen before and will never be seen again lets get out  votetrumppence on  maga comeaalvo',
 'the very foul mouthed sen john mccain begged for my support during his  primary i gav

In [34]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize


def tokenize(data):
    tokens = word_tokenize(data)
    result = [i for i in tokens if not i in stop_words]
    return result

df_clear3 = [tokenize(x) for x in df_clear2]
df_clear3

[['statement', 'comwazigoqqmq'],
 ['really', 'america', 'terrible', 'comwiwcpifu'],
 ['media',
  'establishment',
  'want',
  'race',
  'badly',
  'never',
  'drop',
  'race',
  'never',
  'let',
  'supporters',
  'maga'],
 ['certainly', 'interesting', 'hours'],
 ['debate',
  'polls',
  'look',
  'great',
  'thank',
  'maga',
  'americafirst',
  'compeqsswdz'],
 ['saying',
  'clinton',
  'campaign',
  '’',
  'anticatholic',
  'bigotry',
  'httpbitlydcbtvkcrooked'],
 ['thank', 'maga', 'americafirst', 'comfgwjlkm'],
 ['cincinnati',
  'ohio',
  'tomorrow',
  'night',
  'pm',
  'join',
  'ohiovotesearly',
  'votetrumppence',
  'tickets',
  'httpswwwdonaldjtrumpcomscheduleregistercincinnatioh',
  '…',
  'comxufugcfg'],
 ['little',
  'pickup',
  'dishonest',
  'media',
  'incredible',
  'information',
  'provided',
  'wikileaks',
  'dishonest',
  'rigged',
  'system'],
 ['thank',
  'florida',
  'movement',
  'never',
  'seen',
  'never',
  'seen',
  'lets',
  'get',
  'votetrumppence',
  'ma

In [None]:
def ToString(data):
    s = data
 
    listToStr = ' '.join(map(str, s)) 
  
    print(listToStr) 
    return df


df_clear4 = [ToString(x) for x in df_clear3]
print(df_clear4)
df_clear4

In [26]:
import numpy as np
import sklearn.cluster
import distance
from array import array


def distance_calcul(data):
    lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in data] for w2 in data])
    pred = sklearn.cluster.AffinityPropagation(affinity="euclidean", damping=0.5,random_state=None)
    pred.fit(lev_similarity)
    return pred

df_clear5 = distance_calcul(df_clear4[3])
#df_clear5 = [distance_calcul(x) for x in df_clear4]
#df_clear5.fit_predict(df_clear4[0])
print(df_clear4[0])

       Unnamed: 0         date                  id  \
0               0        Oct 7  784609194234306560   
1               1       Oct 10  785608815962099712   
2               2        Oct 8  784840992734064640   
3               3        Oct 8  784767399442653184   
4               4       Oct 10  785561269571026944   
...           ...          ...                 ...   
17211       17211  12 May 2009          1773561338   
17212       17212   8 May 2009          1741160716   
17213       17213   8 May 2009          1737479987   
17214       17214   4 May 2009          1701461182   
17215       17215   4 May 2009          1698308935   

                                             link  retweet  \
0      /realDonaldTrump/status/784609194234306560    False   
1      /realDonaldTrump/status/785608815962099712    False   
2      /realDonaldTrump/status/784840992734064641    False   
3      /realDonaldTrump/status/784767399442653184    False   
4      /realDonaldTrump/status/7855612695

In [69]:
df_clear3.info()

AttributeError: 'list' object has no attribute 'info'

In [18]:
pip install array

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement array
ERROR: No matching distribution found for array
