In [1]:
from tweepy import Stream
import pandas as pd
import numpy as np
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import argparse
import string
from twython import Twython  
import json
from sklearn import preprocessing
import gensim 
import re, string
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk import re
import os
import sys

The following function streams the latest tweets that match the given topics for `time_limit` seconds and returns them as a DataFrame.

In [2]:
def get_latest_tweets(data_dir, auth, time_limit, topic):
    """Stream tweets for a limited time and return the English ones as a frame.

    The stream is written to ``data_dir + "stream.json"`` by ``MyListener``
    and then read back with ``pd.read_json(..., lines=True)``.

    Parameters
    ----------
    data_dir : str
        Directory prefix for ``stream.json``. It is concatenated as-is, so it
        must end with a path separator.
    auth : object
        Authentication handler passed straight to ``Stream``.
    time_limit : float
        Number of seconds ``MyListener`` keeps writing incoming data.
    topic : list of str
        Phrases passed to ``Stream.filter(track=...)``.

    Returns
    -------
    (pandas.DataFrame, bool)
        A frame with columns ``time`` and ``tweet`` (newest first, duplicates
        and rows without text removed) and a ``no_data`` flag. The flag is
        True whenever the frame has no rows, so callers never receive an empty
        frame marked as usable.
    """
    print("streaming tweets...")
    tic = time.time()
    twitter_stream = Stream(auth, MyListener(data_dir, time_limit), tweet_mode='extended')
    twitter_stream.filter(track=topic)  # list of queries to track
    t = time.time()
    print('stream data are saved in ' + str((t - tic) / 60) + ' minutes')

    print('reading stream file...')
    data = pd.read_json(data_dir + "stream.json", lines=True)
    ll = time.time()
    print('reading the stream file takes ' + str((ll - t) / 60) + ' minutes')

    print('getting full_text tweets...')
    # The `lang` column only exists when at least one streamed record carried
    # it, so guard the filter instead of assuming the column is present.
    if not data.empty and 'lang' in data.columns:
        data = data[data['lang'] == 'en']
    else:
        data = data.iloc[0:0]

    tweets = pd.DataFrame(columns=['time', 'tweet'], index=data.index)
    if not data.empty:
        tweets['tweet'] = data.apply(get_full_text_tweet, axis=1)
        tweets['time'] = data['created_at']
        tweets = (
            tweets.sort_values('time', ascending=False)
            .drop_duplicates()
            .dropna(subset=['tweet'])
        )

    # An empty result (nothing streamed, or no English tweet with text) is
    # reported as "no data" so downstream vectorization is skipped.
    no_data = tweets.empty
    toc = time.time()
    print(str(tweets.shape[0]) + ' full_text tweets obtained in ' + str((toc - tic) / 60) + ' minutes')
    return tweets, no_data
   

The following function gets the full text of a tweet.

In [3]:
def get_full_text_tweet(tweet):
    """Return the longest available text for one streamed tweet record.

    Lookup order:

    * retweet: ``retweeted_status['extended_tweet']['full_text']``, falling
      back to ``retweeted_status['text']``;
    * otherwise: ``extended_tweet['full_text']``, falling back to ``text``.

    Parameters
    ----------
    tweet : pandas.Series or attribute-style object
        One record with optional fields ``retweeted_status``,
        ``extended_tweet`` and ``text``. Missing fields and null values
        (``None`` / ``NaN``) are treated as absent.

    Returns
    -------
    str or None
        The selected text, or ``None`` when no text field is available.
    """
    # getattr with a default tolerates batches where the optional column was
    # never created (e.g. no retweets were captured in this stream window).
    retweet = getattr(tweet, 'retweeted_status', None)
    if isinstance(retweet, dict):
        extended = retweet.get('extended_tweet')
        if isinstance(extended, dict) and 'full_text' in extended:
            return extended['full_text']
        # Covers both "no extended_tweet" and "extended_tweet without
        # full_text"; the latter used to leave the result unassigned.
        return retweet.get('text')

    extended = getattr(tweet, 'extended_tweet', None)
    if isinstance(extended, dict) and 'full_text' in extended:
        return extended['full_text']
    return getattr(tweet, 'text', None)


In [4]:
# MyListener() appends the raw stream data to the file <data_dir>stream.json
class MyListener(StreamListener):
    """Custom StreamListener that saves raw stream data for a limited time.

    Parameters
    ----------
    data_dir : str
        Directory prefix for ``stream.json``; concatenated as-is, so it must
        end with a path separator.
    time_limit : float
        Number of seconds, counted from construction, during which incoming
        data is written. After that the file is closed and the stream is told
        to stop.
    """

    def __init__(self, data_dir, time_limit):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open(data_dir + "stream.json", 'a')
        super(MyListener, self).__init__()

    def _time_is_up(self):
        """Return True once the configured time limit has elapsed."""
        return (time.time() - self.start_time) >= self.limit

    def _close_file(self):
        """Close the output file; safe to call more than once."""
        if not self.saveFile.closed:
            self.saveFile.close()

    def on_data(self, data):
        """Write one raw message; return False to stop once time is up."""
        if not self._time_is_up():
            self.saveFile.write(data)
            return True
        self._close_file()
        return False

    def on_error(self, status):
        """Report an error status and decide whether to keep the stream alive.

        Returns False (disconnect) on status 420 (rate limiting), because
        reconnecting would only extend the penalty, and also once the time
        limit has elapsed, so repeated errors cannot keep the stream running
        forever. Other errors return True so the stream may retry.
        """
        print(status)
        if status == 420 or self._time_is_up():
            # Flush what was collected so the caller can read the file.
            self._close_file()
            return False
        return True


The following functions help clean the tweets for modeling.

In [5]:
def tokenize_tweet(tweet):
    """Lower-case ``tweet`` (coerced to ``str``) and split it on whitespace.

    Returns a list of string tokens; punctuation is left attached.
    """
    lowered = str(tweet).lower()
    return list(map(str, lowered.split()))

def remove_punctuation(tokens):
    """Strip every ``string.punctuation`` character from each token.

    The list keeps its length: a token made only of punctuation becomes an
    empty string rather than being dropped.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return [token.translate(strip_table) for token in tokens]

def clean_text(model, clean_words):
    """Drop English stop words and words the embedding model does not know.

    Parameters
    ----------
    model : object
        Word-vector model. Its vocabulary is read from ``key_to_index`` when
        that attribute exists (gensim 4 naming) and from ``vocab`` otherwise
        (gensim 3 naming); either must support ``word in ...``.
    clean_words : iterable of str
        Tokens, typically already lower-cased and stripped of punctuation.

    Returns
    -------
    list of str
        The tokens that are not NLTK English stop words and are present in
        the model vocabulary, in their original order.
    """
    stoplist = set(stopwords.words('english'))
    # Prefer the newer attribute; on gensim 4 reading `.vocab` raises.
    vocabulary = getattr(model, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.vocab
    return [
        word for word in clean_words
        if word not in stoplist and word in vocabulary
    ]

In [6]:
def tweet_preprocess(row_tweet,model):
    """Run the cleaning pipeline on one tweet.

    Steps, in order: ``tokenize_tweet`` -> ``remove_punctuation`` ->
    ``clean_text``. Returns the resulting list of tokens.
    """
    tokens = tokenize_tweet(row_tweet)
    stripped = remove_punctuation(tokens)
    return clean_text(model, stripped)

The following function uses a word-embedding model to turn a normalized tweet into a vector.

In [7]:
def vectorize_tweet(normalized_tweet,model):
    """Sum the word vectors of a token list and normalize the result.

    Parameters
    ----------
    normalized_tweet : iterable of str
        Tokens that can all be looked up with ``model[word]``.
    model : object
        Word-vector model supporting ``model[word]``. The vector length is
        taken from ``model.vector_size`` when available and defaults to 300
        otherwise.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)`` produced by
        ``sklearn.preprocessing.normalize`` with its default settings. An
        empty token list starts from (and is normalized from) the zero vector.
    """
    # Avoid hard-coding the embedding size so other models can be used.
    dim = getattr(model, 'vector_size', 300)
    vec = np.zeros(dim)
    for word in normalized_tweet:
        vec += model[word]
    return preprocessing.normalize(vec.reshape(1, -1))

In [8]:
def vectorize_latest_tweets(tweets, model):
    """Add ``normalized`` and ``vector`` columns to ``tweets`` and return it.

    ``normalized`` holds the output of ``tweet_preprocess`` for each tweet
    text and ``vector`` the output of ``vectorize_tweet`` for those tokens.
    The frame passed in is modified in place; elapsed time is printed.
    """
    print('vectorizing tweets...')
    started = time.time()
    tweets.loc[:, 'normalized'] = tweets.tweet.apply(tweet_preprocess, args=(model,))
    tweets.loc[:, 'vector'] = tweets.normalized.apply(vectorize_tweet, args=(model,))
    finished = time.time()
    print(f'tweets vectorizd in {(finished - started) / 60} minutes')
    return tweets

In [9]:
def vectorize_user_input(user_input, model):
    """Preprocess and vectorize the user's query text.

    Returns a dict with keys ``raw_input`` (the text as given),
    ``normalized`` (tokens from ``tweet_preprocess``) and ``vector`` (the
    result of ``vectorize_tweet`` on those tokens).
    """
    print('vectorizing user_input...')
    tokens = tweet_preprocess(user_input, model)
    result = {
        'raw_input': user_input,
        'normalized': tokens,
        'vector': vectorize_tweet(tokens, model),
    }
    print('user_input is vectorized!')
    return result

In [10]:
def find_most_similar_tweets(vectorized_input, vectorized_tweets, topn):
    """Rank tweets by cosine similarity to the query and return the best ones.

    The similarity is computed directly with NumPy, so the function does not
    rely on a global ``model`` object being defined elsewhere.

    Parameters
    ----------
    vectorized_input : dict
        Must contain ``'vector'``: an array-like holding the query vector
        (any shape; it is flattened).
    vectorized_tweets : pandas.DataFrame
        Must contain a ``vector`` column whose cells are array-likes with the
        same number of elements as the query vector. The frame is not
        modified.
    topn : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        A copy of the input rows with an added ``similarity_score`` column,
        sorted by descending score and cut to the first ``topn`` rows. A
        zero-length vector yields a NaN score, which sorts last. An empty
        input frame yields an empty frame with the extra column.
    """
    if len(vectorized_tweets) == 0:
        # np.vstack cannot stack zero arrays; return an empty, well-formed result.
        return vectorized_tweets.assign(similarity_score=np.array([], dtype=float))

    query = np.asarray(vectorized_input['vector'], dtype=float).reshape(-1)
    tweet_matrix = np.vstack(
        [np.asarray(vec, dtype=float).reshape(1, -1) for vec in vectorized_tweets['vector']]
    )

    dot_products = tweet_matrix @ query
    norms = np.linalg.norm(tweet_matrix, axis=1) * np.linalg.norm(query)
    # 0/0 for all-zero vectors is left as NaN; only the warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = dot_products / norms

    ranked = vectorized_tweets.assign(similarity_score=scores).sort_values(
        by='similarity_score', ascending=False
    )
    return ranked.iloc[0:topn]

In [11]:
def process_user_input(user_input, time_limit, topic, topn):
    """Stream tweets about ``topic`` and rank them against ``user_input``.

    Reads the module-level names ``data_dir``, ``auth`` and ``model``, which
    must be defined before this function is called.

    Parameters
    ----------
    user_input : str
        Text the streamed tweets are compared with.
    time_limit : number
        Streaming duration in seconds, forwarded to ``get_latest_tweets``.
    topic : str
        Comma-separated phrases to track.
    topn : int
        Number of most similar tweets to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``topn`` most similar tweets, or a message string when the stream
        produced no data.
    """
    track_list = topic.split(',')

    # Start from a clean stream file, since the listener appends to it.
    stream_path = data_dir + "stream" + '.json'
    if os.path.exists(stream_path):
        os.remove(stream_path)

    tweets, no_data = get_latest_tweets(data_dir, auth, time_limit, track_list)
    if no_data:
        return f'There is no data with topic: {topic} in  {time_limit!s} seconds'

    vectorized_tweets = vectorize_latest_tweets(tweets, model)
    vectorized_user_input = vectorize_user_input(user_input, model)
    # Keep only the `topn` tweets closest to the user's text.
    recommendations = find_most_similar_tweets(vectorized_user_input, vectorized_tweets, topn)
    print(f'top {topn!s} similar tweets are obtained!')
    return recommendations

In [12]:
# Load the downloaded pretrained word2vec vectors (binary format) from the
# working directory. The resulting `model` is read as a global by
# process_user_input and find_most_similar_tweets, so this cell must run
# before they are called.
# NOTE(review): the file name suggests 300-dimensional vectors — confirm.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

In [13]:
# List the regular files (not sub-directories) currently in the stream data
# directory, as a quick inspection of previously saved streams.
# NOTE(review): `data_dir` is reassigned to './' in a later cell.
data_dir = './stream/data/'
from os import listdir
from os.path import isfile, join
onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
onlyfiles

[' stream_topic.json',
 '.DS_Store',
 'data.json',
 'fetched_tweets.txt',
 'file_name',
 'stream.json',
 'stream__Asian_food.json',
 'stream_Asian_food.json',
 'stream_food.json',
 'stream_politics.json',
 't.json']

In [16]:
# Run configuration and Twitter authentication.
# `data_dir` must end with a path separator: it is concatenated directly with
# file names such as "stream.json" and "twitter_credentials.json".
data_dir = './'
topic = 'Canada' # may be a comma-separated list of topics; a comma means 'or'
topn = 10         # number of most similar tweets to return
time_limit = 20   # streaming duration in seconds
user_input = 'Tarif'
# Load the API credentials from a local JSON file instead of hard-coding them.
with open(data_dir + "twitter_credentials.json", "r") as file:  
     credentials = json.load(file)
# Instantiate an object
# NOTE(review): `python_tweets` is not used by any cell visible here — confirm it is needed.
python_tweets = Twython( credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])
# `auth` is read as a global by process_user_input.
auth = tweepy.OAuthHandler(credentials['CONSUMER_KEY'], credentials['CONSUMER_SECRET'])
auth.set_access_token(credentials['ACCESS_TOKEN'], credentials['ACCESS_SECRET'])

In [None]:
# Stream tweets for `time_limit` seconds and rank them against `user_input`.
# The result is a DataFrame, or a message string when no data was collected.
recommendations = process_user_input(user_input, time_limit, topic, topn)

In [29]:
# Full text of the first (highest-ranked) recommendation.
recommendations.iloc[0]['tweet']

'Global marijuana trade is still five to seven years off\n\nUnder the Cannabis Act, Canadian producers are currently only allowed to export weed for medical use, and then only to countries that allow cannabis to be imported.\n\nhttps://t.co/n036yiuaPy'

In [30]:
# Display the ranked recommendations table.
recommendations

Unnamed: 0,time,tweet,normalized,vector,similarity_score
20,2019-06-19 14:32:23,Global marijuana trade is still five to seven ...,"[global, marijuana, trade, still, five, seven,...","[[0.029329505649927836, 0.013802065266448206, ...",0.278243
26,2019-06-19 14:32:25,American Insulin Scandal: Vial of Humalog from...,"[american, insulin, scandal, vial, eli, lilly,...","[[-0.04532776933211481, 0.06511309352842061, 0...",0.274862
35,2019-06-19 14:32:29,Privatization &amp; ever more privatization. T...,"[privatization, amp, ever, privatization, buy,...","[[-0.05269165186022342, 0.003248564575928917, ...",0.269835
19,2019-06-19 14:32:23,FINALLY! Canada has banned shark fin imports a...,"[finally, canada, banned, shark, fin, imports,...","[[-0.009868834595657796, 0.04313267093561433, ...",0.254936
41,2019-06-19 14:32:32,AMAZING! Canada passed legislation to put an e...,"[amazing, canada, passed, legislation, put, en...","[[-0.03511252114909569, 0.018028544242165667, ...",0.227339
23,2019-06-19 14:32:25,"141,900 square feet of artificial turf importe...","[square, feet, artificial, turf, imported, fra...","[[-0.08134927342802921, 0.0635687164262338, 0....",0.216241
47,2019-06-19 14:32:33,In our last #FBCountryIndex Canada ranked 8th ...,"[last, canada, ranked, 8th, environmental, fri...","[[-0.03796684711200454, 0.11958902120645264, -...",0.205407
11,2019-06-19 14:32:20,Tiny House Warriors respond to Canada’s announ...,"[tiny, house, warriors, respond, announcement,...","[[0.051278696066418, 0.047228611905171664, 0.0...",0.190744
40,2019-06-19 14:32:31,Webinar this week: Join Food Allergy Canada's ...,"[webinar, week, join, food, allergy, webinar, ...","[[-0.05409327902854551, -0.01027698925766548, ...",0.18494
17,2019-06-19 14:32:22,I’m going to try my hand at translating. \n\nT...,"[going, try, hand, translating, puppet, puppet...","[[0.07208209450574447, -0.02368210062479829, 0...",0.179053


In [47]:
# Temporarily widen the column display to 300 characters so tweet text is
# not truncated when the frame is printed.
with pd.option_context('display.max_colwidth', 300):
    print (recommendations)

                  time  \
5  2019-06-01 14:19:49   
15 2019-06-01 14:19:51   
12 2019-06-01 14:19:51   
11 2019-06-01 14:19:51   
40 2019-06-01 14:20:00   
37 2019-06-01 14:19:58   
20 2019-06-01 14:19:53   
46 2019-06-01 14:20:01   
10 2019-06-01 14:19:50   
53 2019-06-01 14:20:03   
7  2019-06-01 14:19:49   
33 2019-06-01 14:19:57   
41 2019-06-01 14:20:00   
25 2019-06-01 14:19:54   
38 2019-06-01 14:19:58   
32 2019-06-01 14:19:57   
13 2019-06-01 14:19:51   
22 2019-06-01 14:19:54   
48 2019-06-01 14:20:02   
1  2019-06-01 14:19:47   
45 2019-06-01 14:20:01   
27 2019-06-01 14:19:54   
6  2019-06-01 14:19:49   
34 2019-06-01 14:19:57   
30 2019-06-01 14:19:55   
14 2019-06-01 14:19:51   
49 2019-06-01 14:20:03   
0  2019-06-01 14:19:46   
28 2019-06-01 14:19:54   
2  2019-06-01 14:19:47   
23 2019-06-01 14:19:53   
54 2019-06-01 14:20:04   
18 2019-06-01 14:19:52   
55 2019-06-01 14:20:03   
51 2019-06-01 14:20:02   
16 2019-06-01 14:19:52   
9  2019-06-01 14:19:50   
29 2019-06-0