In [2]:
'''Importing our modules'''
import re
import math

In [3]:
'''to open and read our tweets to begin the preprocessing step'''
raw_text = open('tweets.csv').read()
tab_seperated = [item.split('\t') for item in raw_text.split('\n') if len(item.split('\t')) >= 4]

In [4]:
data = {}
for i in range(len(tab_seperated)):
    '''extracting the filtered tweets and IDs'''
    data[tab_seperated[i][1]] = tab_seperated[i][4]

'''Preprocessing and cleaning'''
data_index = data.copy()    
for tweet in data_index.keys():
    data_index[tweet] = data_index[tweet].lower()
    data_index[tweet] = re.sub('\[newline\]', ' ' , data_index[tweet])
    data_index[tweet] = re.sub('https?[^\s]+', ' ' , data_index[tweet])
    data_index[tweet] = re.sub('[@#][^\s]+', ' ' , data_index[tweet])
    data_index[tweet] = re.sub(r'[0-9][^\s]+', ' ' , data_index[tweet])
    data_index[tweet] = re.sub(r'\w+\.[^\s]+', ' ' , data_index[tweet])
    data_index[tweet] = re.sub(r'[^a-zäöüß\s]', ' ', data_index[tweet])
    data_index[tweet] = re.sub(r'[^\w\s]', ' ' , data_index[tweet])
'''creating a term- frequency dictionary for the collection'''
terms = {}
for num,tweet in data_index.items():
    for word in tweet.split():
        if word in terms:
            terms[word] += 1
        else:
            terms[word] = 1

In [5]:
'''create the weighted idf values of each term'''
idfDict = {}
N = len(data_index)
idfDict = terms.copy()
for word, val in idfDict.items():
        idfDict[word]= 1 +math.log10(N/float(val))

In [6]:
'''This function takes two strings as input and computes their cosine similary score'''
def cosine(text1, text2):
    import math
    import re
    '''tokenizing the tweets and building a local word dictionary'''
    def wd(t1, t2):
        wd = {}
        t1 = t1.split()
        t2 = t2.split()
        for i in t1:
            wd[i] = 0.0
        for i in t2:
            wd[i] = 0.0
        return t1, t2, wd
    '''Building our weigthed TF vector '''
    def TF(wordDict, txt):
        tfDict = {}
        for word, count in wordDict.items():
            for i in txt:
                if word == i:
                    count +=1
            if count > 0:
                tfDict[word]=1+math.log10(count)
            else:
                tfDict[word] = 0.0
        return tfDict
    '''Build our TF-IDF vectors using our IDF dictionary'''
    def TF_IDF(tfs, idfs):
        tfidf = {}
        for word, val in tfs.items():
            tfidf[word] = val*idfs[word]
        return tfidf
    '''calculate the noramizing lenght for a given vector'''
    def length(tf):
        return math.sqrt(sum([(i**2) for i in list(tf.values())]))
    '''dot product of two vectors'''
    def dot(v1,v2):
        new_v = []
        for i in range(len(v1)):
            new_v.append(v1[i]*v2[i])
        return sum(new_v)
    
    '''tokenize and build local word dictionary'''
    txt1, txt2, wd = wd(text1, text2)
    
    '''return a 0 for empty vectors'''
    if len(txt1) == 0 or len(txt2) == 0:
        return 0
    
    '''compute and populate the tf vectors'''
    tf1 = TF(wd, txt1)
    tf2 = TF(wd, txt2)
    
    '''compute and populate the tf-idf vectors'''
    tf_idf1 = TF_IDF(tf1, idfDict)
    tf_idf2 = TF_IDF(tf2, idfDict)
    
    '''compute the lenghts of each vector'''
    l1 = length(tf_idf1)
    l2 = length(tf_idf2)
    
    ''' return 0 for vectors with length of 0'''
    if l1 == 0 or l2 == 0:
        return 0

    '''return rounded cosine similarity score'''
    return round(dot(list(tf_idf1.values()), list(tf_idf2.values()))/(l1*l2),4)

In [7]:
'''creates a list of tuples containing cosine score, id, and raw tweet
sorted by score in descending order and restricted to the top 100 '''
def top_100(tweet):
    top = []
    for id, doc in data_index.items():
        if id != tweet:
            cos = cosine(data_index[tweet], doc)
            top.append((cos,id,data[id]))
    return sorted(top)[::-1][:100]

In [8]:
'''Tweet selection from dictionary'''
for num, key in enumerate(data_index.keys()):
    if num == 0:
        print(num, data[key],'\n',key, data_index[key])

0 @knakatani @ChikonJugular @joofford @SteveBlogs11 https://t.co/WHtaRYGNSY says lifetime risk of cervical cancer in Japan is 1 in 100.  That means HPV is endemic in Japan, and screening is not working well. 
 965734992633565184           says lifetime risk of cervical cancer in japan is   in    that means hpv is endemic in japan  and screening is not working well 


In [22]:
'''Tweet selection from dictionary'''
print("Query:", data['965734992633565184'],'\n')
top = top_100('965734992633565184')
for i in top:
    print(i)

Query: @knakatani @ChikonJugular @joofford @SteveBlogs11 https://t.co/WHtaRYGNSY says lifetime risk of cervical cancer in Japan is 1 in 100.  That means HPV is endemic in Japan, and screening is not working well. 

(0.3896, '982642444943605760', "@beautyloveknow1 @VABVOX Yeah, cervical cancer is so rare that the lifetime risk of getting it is 1 in 161... and cervical isn't the only kind of cancer HPV causes.[NEWLINE][NEWLINE]Please vaccinate your kids against HPV-caused cancers.  Getting the vaccine is far safer than not getting it.")
(0.3413, '988887135699648512', "@vikki_r To prevent the cancers HPV causes?[NEWLINE]Lifetime risk of cervical cancer in US is 1 in 161 without vaccination.[NEWLINE]The vaccine's about 80-90% effective against cervical and other HPV-caused cancers.  Getting the vaccine can prevent some awful problems down the road.")
(0.3389, '968982710222970880', "@RustyPee4 @williambolivar7 HPV 16 and 18 account for ~70% of cervical cancers, that's true.[NEWLINE]HPV 31, 

In [10]:
'''Tweet selection from dictionary'''
for num, key in enumerate(data_index.keys()):
    if num == 8:
        print(num, data[key],'\n',key, data_index[key])

8 @rip_tear Tumors?[NEWLINE]Dat leg is straight mccain 
 965481507266727936   tumors  dat leg is straight mccain


In [23]:
print("Query:", data['965481507266727936'],'\n')
top = top_100('965481507266727936')
for i in top:
    print(i)

Query: @rip_tear Tumors?[NEWLINE]Dat leg is straight mccain 

(0.412, '974741740807737344', "Leg' auf[NEWLINE]#KayasGroßeKinoshow")
(0.299, '960300385738031105', '@Lenqrad du und straight als ob')
(0.2476, '960119588309688320', "Did McCain's Brain Tumor Come From Cellphone Use? https://t.co/0u4HkeKHFZ via @mercola")
(0.2446, '997875345767444480', 'HE HAS TUMORS???')
(0.2438, '988902240260886528', 'Trump is obese, not large.[NEWLINE][NEWLINE]Get it straight. https://t.co/pj8omIV710')
(0.2432, '976800329839898625', 'Show a little more leg, honey 💕[NEWLINE].[NEWLINE].[NEWLINE].[NEWLINE]#leg #woman #sculpture #clay #ceramic #body #stoneware #love #figure #female #wip #inthestudio #newwork #exhibition #black #painting #clairedelune #art #artiststudio #keramik https://t.co/YVMiCXsvP6 https://t.co/SGdsX9elW5')
(0.2394, '962780166966726656', '@the_necrosis @KieferGabriele @ChristianNbg Was soll dat heißen? 🙊')
(0.2272, '995409237324034049', '@mounira_ber تقريبا كل ال Solid tumors. المركز في أل

In [16]:
for num, key in enumerate(data_index.keys()):
    if num == 85:
        print(num, data[key],'\n',key, data_index[key])

85 I'm currently looking for a psychologist bc I need help with my depression/ anxiety and I'd like to,,, talk to someone about it bc i feel like it keeps getting worse and my head is a fucked up place tbh, I'm so lost. I don't want my parents to know bc I'm a burden already. help 
 965731052525367296 i m currently looking for a psychologist bc i need help with my depression  anxiety and i d like to    talk to someone about it bc i feel like it keeps getting worse and my head is a fucked up place tbh  i m so lost  i don t want my parents to know bc i m a burden already  help


In [24]:
'''Tweet selection from dictionary'''
print("Query:", data['965731052525367296'],'\n')
top = top_100('965731052525367296')
for i in top:
    print(i)

Query: I'm currently looking for a psychologist bc I need help with my depression/ anxiety and I'd like to,,, talk to someone about it bc i feel like it keeps getting worse and my head is a fucked up place tbh, I'm so lost. I don't want my parents to know bc I'm a burden already. help 

(0.315, '1005987366186684418', 'tomorrow i have my first appointment at the psychologist, i’m kinda scared but at the same time hopeful that it’ll help me with my depression and anxiety 🙏🏻')
(0.2972, '1007392417920806912', "I'm so unhappy with my life currently, I don't know why but I am")
(0.2918, '1020700466408296448', "I'm so depressed I don't feel like food...")
(0.2779, '1000912376835837952', 'Ol so it’s currently 3:30am in Hamburg Germany and this storm isn’t that chill anymore like i just woke up bc of it and I’m kinda scared. Someone send help not kidding')
(0.2774, '1007047898956271616', "@12Charlotte43 because I don't know anyone of the class I'm currently in and nobody was talking to me last 

Overall we are quite pleasantly suprised by the quality of the results. In most cases there is a strong thematic link between the query text and the most highly ranked tweets.

In the first two queries it is easy to distinguish the importance of a few relatively rare keywords in the results. The terms cervical, cancer and HPV appear to play a very important role in the ranking of results for query one. Japan did as well, but the first occurence of this word in the ranking is much lower. The second tweet was very short, and each word seems to have played an important role in ranking. Tumor,leg, mccain and straight are all extremely common in the results. Also noteworthy is the fact that we have many short tweets in the results, showing that shorter documents receive higher ranks.The final query was relatively longer and we can see a greater diversity of important terms in the results. Keywords include depression, anxiety, psychologist, currently, and I'm.  