In [1]:
from tweepy import Stream
import pandas as pd
import numpy as np
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import argparse
import string
from twython import Twython  
import json
from sklearn import preprocessing
import gensim 
import re, string
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk import re
import os
import sys

In [24]:
def _extract_tweet_text(text, extended_tweet, retweeted_status):
    """Pick the longest available text for one streamed status.

    Fields that are absent from the JSON arrive as NaN/None, so a field is
    only treated as present when it is a dict.
    """
    if isinstance(retweeted_status, dict):
        # Retweet: prefer the original status' untruncated text.
        original_extended = retweeted_status.get('extended_tweet')
        if isinstance(original_extended, dict) and 'full_text' in original_extended:
            return original_extended['full_text']
        # Fall back to the original's short text, then to the retweet's own text.
        return retweeted_status.get('text', text)
    if isinstance(extended_tweet, dict) and 'full_text' in extended_tweet:
        return extended_tweet['full_text']
    return text


def get_latest_tweets(data_dir, auth, time_limit, topic):
    """Stream tweets matching ``topic`` and return them as a DataFrame.

    Arguments:
        data_dir -- directory prefix (string-concatenated, so it must end with
                    a path separator) where ``stream.json`` is written
        auth -- authentication handler passed to ``Stream``
        time_limit -- value forwarded to ``MyListener`` as its time limit
        topic -- list of terms passed to ``Stream.filter(track=...)``
    Return:
        (tweets, no_data) -- ``tweets`` has columns ``time`` and ``tweet`` for
        the English statuses, newest first, without duplicates or rows that
        have no text; ``no_data`` is True when no usable tweet was collected.
    """
    file_name = "stream"
    print("streaming tweets...")
    tic = time.time()
    listener = MyListener(data_dir, time_limit)
    try:
        twitter_stream = Stream(auth, listener, tweet_mode='extended')
        twitter_stream.filter(track=topic)  # list of queries to track
    finally:
        # Make sure everything is flushed to disk before the file is read
        # back; closing an already closed file is a no-op.
        listener.saveFile.close()
    print('tweets with '+str(topic)+' obtained')
    data = pd.read_json(data_dir+file_name+".json", lines=True)
    print('tweets are saved!')

    if data.empty or 'lang' not in data.columns:
        tweets = pd.DataFrame(columns=['time', 'tweet'], index=data.index)
    else:
        english = data[data['lang'] == 'en']
        # reindex() supplies NaN columns for fields that never occurred in
        # this sample (e.g. no retweets), instead of raising AttributeError.
        fields = english.reindex(
            columns=['created_at', 'text', 'extended_tweet', 'retweeted_status'])
        tweets = pd.DataFrame(
            {
                'time': fields['created_at'],
                'tweet': [
                    _extract_tweet_text(text, extended, retweeted)
                    for text, extended, retweeted in zip(
                        fields['text'],
                        fields['extended_tweet'],
                        fields['retweeted_status'])
                ],
            },
            index=fields.index,
        )
        tweets = (
            tweets
            .dropna(subset=['tweet'])
            .sort_values('time', ascending=False)
            .drop_duplicates()
        )
    # Also covers the case where the language filter removed every row.
    no_data = tweets.empty
    toc = time.time()
    print(str(tweets.shape[0])+'tweets obtained in '+str((toc-tic)/60)+' minutes', tweets)
    return tweets, no_data

    

In [3]:
#MyListener() saves the data into a .json file with name stream
class MyListener(StreamListener):
    """Stream listener that appends raw stream payloads to ``stream.json``.

    Arguments:
        data_dir -- directory prefix (string-concatenated with the file name)
        time_limit -- number of seconds, measured with ``time.time()`` from
                      construction, during which payloads are recorded

    The limit is only checked when a payload arrives, so the stream is
    stopped by the first payload received after the limit has passed.
    """

    def __init__(self, data_dir, time_limit=60):
        self.start_time = time.time()
        self.limit = time_limit
        # Explicit encoding so the output does not depend on the platform default.
        self.saveFile = open(data_dir+"stream.json", 'a', encoding='utf-8')
        super(MyListener, self).__init__()

    def on_data(self, data):
        """Record ``data`` while within the limit; afterwards close the file
        and return False to stop the stream."""
        if (time.time() - self.start_time) < self.limit:
            self.saveFile.write(data)
            return True
        self.saveFile.close()
        return False

    def on_error(self, status):
        """Print the error status; disconnect on 420, otherwise keep going."""
        print(status)
        if status == 420:
            # 420 signals rate limiting: reconnecting immediately only
            # lengthens the enforced wait, so stop the stream instead.
            self.saveFile.close()
            return False
        return True


In [4]:
def format_filename(fname):
    """Build a file-system-safe version of a name.

    Arguments:
        fname -- the name to sanitize; iterated element by element
    Return:
        String -- every element passed through convert_valid, concatenated
    """
    safe_chars = [convert_valid(symbol) for symbol in fname]
    return ''.join(safe_chars)


def convert_valid(one_char):
    """Return the character unchanged if it is allowed, otherwise '_'.

    Allowed characters are '-', '_', '.', ASCII letters and digits.

    Arguments:
        one_char -- the character to check
    Return:
        Character -- ``one_char`` itself or the replacement '_'
    """
    allowed = "-_." + string.ascii_letters + string.digits
    return one_char if one_char in allowed else '_'

In [5]:
def tokenize_tweet(tweet):
    """Lower-case the string form of ``tweet`` and split it on whitespace."""
    lowered = str(tweet).lower()
    return list(map(str, lowered.split()))

def remove_punctuation(tokens):
    """Delete every ``string.punctuation`` character from each token.

    Tokens that consist only of punctuation become empty strings; they are
    kept in the returned list.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned = []
    for token in tokens:
        cleaned.append(token.translate(strip_table))
    return cleaned

def clean_text(model, clean_words):
    """Drop English stop words and words the embedding model does not know.

    Arguments:
        model -- word-vector model supporting ``word in model``
        clean_words -- list of tokens
    Return:
        List -- the tokens that are not NLTK English stop words and are
        contained in ``model``, in their original order
    """
    stoplist = set(stopwords.words('english'))
    # Membership is tested with ``in model`` rather than ``model.vocab``:
    # the ``vocab`` attribute no longer exists in gensim 4, while the
    # containment test works on both old and new KeyedVectors.
    return [
        word for word in clean_words
        if word not in stoplist and word in model
    ]

In [6]:
def tweet_preprocess(row_tweet, model):
    """Run the full cleaning pipeline on one raw tweet.

    Steps, in order: tokenize_tweet, remove_punctuation, clean_text.
    """
    tokens = tokenize_tweet(row_tweet)
    stripped = remove_punctuation(tokens)
    return clean_text(model, stripped)

In [7]:
# def tweet_preprocess(row_tweet):
#     return [(TwitterPreprocessor(row_tweet).fully_preprocess().text).split()]

In [8]:
def vectorize_tweet(normalized_tweet, model):
    """Sum the word vectors of a tweet and normalize the result.

    Arguments:
        normalized_tweet -- iterable of words, each of which can be looked up
                            with ``model[word]``
        model -- word-vector model
    Return:
        Array of shape (1, dim) -- the normalized sum of the word vectors
        (the output of ``preprocessing.normalize``); an empty tweet yields
        the normalized all-zero vector.
    """
    # Use the model's own dimensionality when it exposes one; 300 is kept as
    # the fallback so models without ``vector_size`` behave as before.
    dim = getattr(model, 'vector_size', 300)
    vec = np.zeros(dim)
    for word in normalized_tweet:
        vec += model[word]
    return preprocessing.normalize(vec.reshape(1, -1))

In [23]:
def vectorize_latest_tweets(tweets, model):
    """Add ``normalized`` and ``vector`` columns to ``tweets``.

    ``normalized`` holds tweet_preprocess applied to the ``tweet`` column and
    ``vector`` holds vectorize_tweet applied to ``normalized``. The frame is
    modified in place and also returned; the elapsed time is printed.
    """
    started = time.time()

    def _normalize(raw_tweet):
        return tweet_preprocess(raw_tweet, model)

    def _embed(token_list):
        return vectorize_tweet(token_list, model)

    tweets.loc[:, 'normalized'] = tweets.loc[:, 'tweet'].apply(_normalize)
    tweets.loc[:, 'vector'] = tweets.normalized.apply(_embed)
    elapsed_minutes = (time.time() - started) / 60
    print('tweets vectorizd in ' + str(elapsed_minutes) + ' minutes')
    return tweets

In [22]:
def vectorize_user_input(user_input, model):
    """Preprocess and vectorize the user's text.

    Return:
        Dict -- keys ``raw_input`` (the text as given), ``normalized`` (output
        of tweet_preprocess) and ``vector`` (output of vectorize_tweet)
    """
    tokens = tweet_preprocess(user_input, model)
    return dict(
        raw_input=user_input,
        normalized=tokens,
        vector=vectorize_tweet(tokens, model),
    )
                    
        

In [11]:
def find_most_similar_tweets(vectorized_input, vectorized_tweets, topn):
    """Rank tweets by cosine similarity to the user input.

    Arguments:
        vectorized_input -- dict whose ``'vector'`` entry is the query vector
        vectorized_tweets -- DataFrame with a ``vector`` column holding one
                             array per tweet
        topn -- number of rows to return
    Return:
        DataFrame -- the ``topn`` rows with the highest ``similarity_score``,
        best first. Rows whose vector (or the query) has zero norm get a NaN
        score and sort last.

    The ``similarity_score`` column is also written to the frame passed in.
    The similarity is computed with numpy so the function does not depend on
    a ``model`` variable existing at module level.
    """
    if len(vectorized_tweets) == 0:
        # Nothing to rank; np.vstack would raise on an empty sequence.
        return vectorized_tweets.assign(similarity_score=np.array([], dtype=float))

    tweet_matrix = np.vstack(
        [np.asarray(vec, dtype=float).reshape(1, -1)
         for vec in vectorized_tweets['vector']]
    )
    query = np.asarray(vectorized_input['vector'], dtype=float).reshape(-1)

    dot_products = tweet_matrix @ query
    norm_product = np.linalg.norm(query) * np.linalg.norm(tweet_matrix, axis=1)
    # Zero-norm vectors produce 0/0; keep the NaN result but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        cos = dot_products / norm_product

    vectorized_tweets.loc[:, 'similarity_score'] = cos
    ranked = vectorized_tweets.sort_values(by='similarity_score', ascending=False)
    return ranked[0:topn]

In [12]:
def process_user_input(user_input, time_limit, topic):
    """Stream fresh tweets on ``topic`` and recommend those closest to the input.

    Arguments:
        user_input -- text to compare the tweets against
        time_limit -- forwarded to get_latest_tweets
        topic -- comma-separated string of terms to track
    Return:
        The ten most similar tweets (output of find_most_similar_tweets), or
        a message string when get_latest_tweets reports no data.

    Reads the module-level names ``data_dir``, ``auth`` and ``model``; they
    must be defined before this function is called.
    """
    track_list = topic.split(',')

    # Start from a clean file: the listener appends to it.
    stream_path = data_dir + "stream" + '.json'
    if os.path.exists(stream_path):
        os.remove(stream_path)

    tweets, no_data = get_latest_tweets(data_dir, auth, time_limit, track_list)
    if no_data:
        return f"There is no data with topic: {topic} in  {time_limit} seconds"

    vectorized_tweets = vectorize_latest_tweets(tweets, model)
    vectorized_user_input = vectorize_user_input(user_input, model)
    # Keep the 10 tweets most similar to the input.
    return find_most_similar_tweets(vectorized_user_input, vectorized_tweets, topn=10)

In [13]:
# Pretrained word2vec vectors, downloaded beforehand and stored in binary
# word2vec format next to the notebook.
WORD2VEC_BINARY = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_BINARY, binary=True)

In [14]:
# Run configuration.
data_dir = '/Users/shahla/Dropbox/SharpestMinds/stream/data/'
topic = 'politics'  # may hold several terms separated by commas; a comma means 'or'
time_limit = 10  # seconds
user_input = 'Johnson & Johnson, one of the world s largest drug manufacturers, has gone on trial in a multi-billion dollar lawsuit by the US state of Oklahoma.'

# Twitter API credentials are read from a JSON file kept outside the notebook.
with open("twitter_credentials.json", "r") as credentials_file:
    credentials = json.load(credentials_file)

# Twython client
python_tweets = Twython(credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])

# Tweepy OAuth handler used for streaming
auth = tweepy.OAuthHandler(credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])
auth.set_access_token(credentials['ACCESS_TOKEN'], credentials['ACCESS_SECRET'])



In [25]:
# Stream tweets for the configured topic and rank them against the input text.
recommendations = process_user_input(
    user_input=user_input,
    time_limit=time_limit,
    topic=topic,
)

streaming tweets...
tweets with ['politics'] obtained
tweets are saved!
82tweets obtained in 0.17141808271408082 minutes                    time                                              tweet
89  2019-05-29 19:21:25  Patricia de Lille is the Brooke Logan of SA po...
88  2019-05-29 19:21:25  This is not what exoneration looks like:\nhttp...
87  2019-05-29 19:21:25  It's a sensitive &amp; an emotive issue of a r...
86  2019-05-29 19:21:25  @LindseyGrahamSC IMO you should go to Therapy,...
85  2019-05-29 19:21:25  R’s drew up articles of impeachment for @RJRos...
77  2019-05-29 19:21:24  @RepJerryNadler after today I think it's time ...
71  2019-05-29 19:21:24  “I’m who I said I was. I’m a principled, const...
73  2019-05-29 19:21:24  Tories risk being captured by poisonous politi...
74  2019-05-29 19:21:24  For all the talk of the damage Trump has done ...
75  2019-05-29 19:21:24  "F**k off Uefa, is this what you want?" sing #...
76  2019-05-29 19:21:24  To make #Brexit work Bercow h

In [26]:
# Full text of the second-ranked recommendation.
recommendations['tweet'].iloc[1]

"Appears the roaches are starting to crawl out into the open!  That’s what they do when you bomb them! They know whats coming and they are scrambling!\nSpecial Counsel Robert Mueller closes Russia probe, says charging Trump with a crime was ‘not an option'\n\nhttps://t.co/2mbR5eyxQD"

In [27]:
# Display the value returned by process_user_input.
recommendations

Unnamed: 0,time,tweet,normalized,vector,similarity_score
8,2019-05-29 19:21:16,The written order just came in from the judge ...,"[written, order, came, judge, ordering, roger,...","[[-0.017372000301740496, 0.04532681765325117, ...",0.57816
50,2019-05-29 19:21:20,Appears the roaches are starting to crawl out ...,"[appears, roaches, starting, crawl, open, bomb...","[[-0.016036032500215316, 0.054101738298484754,...",0.576396
68,2019-05-29 19:21:23,My #UniversalChildCare plan quadruples the fed...,"[plan, quadruples, federal, investment, child,...","[[0.015091095878780943, -0.0029922349905775634...",0.55505
63,2019-05-29 19:21:22,@gerardjasper Tony “Hard Right” Benn: My view ...,"[tony, benn, view, eu, always, hostile, foreig...","[[0.027441587239689258, 0.03274367615949744, 0...",0.54379
46,2019-05-29 19:21:20,When history asks who broke American democracy...,"[history, asks, broke, american, democracy, tr...","[[0.03616581618506063, 0.005561904866498922, 0...",0.532711
23,2019-05-29 19:21:17,When someone starts talking about politics or ...,"[someone, starts, talking, politics, president...","[[0.014963407912619015, 0.047920886935539045, ...",0.530359
67,2019-05-29 19:21:23,Another GOP nutjob: #GOP '#deportationbus' can...,"[another, gop, nutjob, gop, candidate, georgia...","[[0.03275861291002944, -0.06458924305274043, 0...",0.517694
51,2019-05-29 19:21:21,".@georgegalloway blocked me years ago, but I h...","[blocked, years, ago, hope, people, sharing, s...","[[-0.020676730556627654, 0.05327156006399546, ...",0.513239
73,2019-05-29 19:21:24,Tories risk being captured by poisonous politi...,"[tories, risk, captured, poisonous, politics, ...","[[0.010161631901644376, 0.03340464415696811, 0...",0.507587
40,2019-05-29 19:21:19,"Chair of North Carolina GOP, others indicted o...","[chair, north, carolina, gop, others, indicted...","[[0.04559710350798578, -0.021107567110293827, ...",0.507347


In [28]:
# Temporarily raise the column width limit so longer tweet texts are printed.
with pd.option_context("display.max_colwidth", 300):
    print(recommendations)

                   time  \
8   2019-05-29 19:21:16   
50  2019-05-29 19:21:20   
68  2019-05-29 19:21:23   
63  2019-05-29 19:21:22   
46  2019-05-29 19:21:20   
23  2019-05-29 19:21:17   
67  2019-05-29 19:21:23   
51  2019-05-29 19:21:21   
73  2019-05-29 19:21:24   
40  2019-05-29 19:21:19   

                                                                                                                                                                                                                                                                                           tweet  \
8   The written order just came in from the judge ordering Roger Stone associate Andrew Miller to testify to a grand jury Friday at 9:30, after refusing to for more than a year as he challenged Mueller.\n\nHis attorney says he’s planning to show up.\n\nFull story: https://t.co/9Xf72ticnr   
50    Appears the roaches are starting to crawl out into the open!  That’s what they do when you bomb them! They know 