In [1]:
from pymongo import MongoClient
# Connection handle for the MongoDB server on this machine (port 27017).
client = MongoClient(host='localhost', port=27017)

In [2]:
import tweepy
import json
import TweetCleanser
import TweetStemToken
import settings

In [3]:
# Tweets are written to the database named "local" on the connected server.
db = client['local']

In [4]:
class MyListener(tweepy.StreamListener):
    """Stream listener that stores original, place-tagged tweets in MongoDB.

    Every raw stream message is decoded from JSON, filtered, run through the
    TweetCleanser / TweetStemToken preprocessing pipeline and inserted into
    the ``testTweets`` collection of the module-level ``db`` handle.
    """

    # Tweets authored by this account are never stored.
    EXCLUDED_SCREEN_NAME = 'SouthwestAir'

    @classmethod
    def _is_wanted(cls, tweet):
        """Return True when a decoded stream message should be stored.

        A message is kept only if it is a status with text, is not a retweet,
        was not written by ``EXCLUDED_SCREEN_NAME`` and carries a place.
        """
        # The stream also delivers non-status messages (e.g. delete or limit
        # notices). They have no 'text', so skip them quietly instead of
        # letting a KeyError surface as an "error: ..." line.
        if not isinstance(tweet, dict) or 'text' not in tweet:
            return False

        # 'retweeted_status' is present on native retweets; the 'RT @' check
        # additionally drops manually written retweets.
        if (
            tweet.get('retweeted')
            or 'retweeted_status' in tweet
            or 'RT @' in tweet['text']
        ):
            return False

        author = tweet.get('user') or {}
        if author.get('screen_name') == cls.EXCLUDED_SCREEN_NAME:
            return False

        # Tweets without place information cannot be located; drop them.
        return bool(tweet.get('place'))

    @staticmethod
    def _full_text(tweet):
        """Return the untruncated text of a tweet payload.

        For long tweets the streaming payload keeps a truncated 'text' and
        puts the complete body in 'extended_tweet' -> 'full_text'; prefer that
        when it is available.
        """
        extended = tweet.get('extended_tweet') or {}
        return extended.get('full_text') or tweet['text']

    @staticmethod
    def _preprocess(text):
        """Run the cleansing, stemming and tokenizing pipeline over ``text``."""
        cleanser = TweetCleanser.TweetCleanser()
        stemmer = TweetStemToken.TweetStemToken()

        # The order of the steps is preserved from the original pipeline.
        processed_words = cleanser.to_ascii(text)
        processed_words = cleanser.remove_urls(processed_words)
        processed_words = cleanser.remove_special_characters(processed_words)
        processed_words = cleanser.remove_usernames(processed_words)
        processed_words = cleanser.remove_numbers(processed_words)
        processed_words = stemmer.stem(processed_words)
        processed_words = stemmer.tokenize(processed_words)
        processed_words = cleanser.remove_stopwords(processed_words)
        return processed_words

    @staticmethod
    def _to_document(tweet, text, processed_words):
        """Build the MongoDB document stored for one tweet."""
        place = tweet['place']
        return {
            'id_str': tweet['id_str'],
            'text': text,
            'place_type': place['place_type'],
            'name': place['name'],
            'full_name': place['full_name'],
            'country_code': place['country_code'],
            'country': place['country'],
            'coordinates': place['bounding_box']['coordinates'],
            'created_at': tweet['created_at'],
            'processed_words': processed_words,
        }

    def on_data(self, data):
        """Handle one raw message from the stream.

        Args:
            data: JSON-encoded message as delivered by tweepy.

        Returns:
            None. Nothing is returned in any path, so the stream is never
            asked to stop from here.

        Any exception raised while decoding, preprocessing or inserting is
        caught and printed so that a single bad message does not end the
        stream.
        """
        try:
            tweet = json.loads(data)
            if not self._is_wanted(tweet):
                return

            text = self._full_text(tweet)
            processed_words = self._preprocess(text)
            db.testTweets.insert_one(
                self._to_document(tweet, text, processed_words)
            )
        except Exception as e:
            # Deliberate best-effort: report and keep streaming.
            print("error: " + str(e))

    def on_error(self, error):
        """Handle an error reported by the stream.

        Args:
            error: Value passed by tweepy; compared against 420 below.
                Presumably the HTTP status code of the failed request.

        Returns:
            False for 420, which by tweepy's convention asks the stream to
            disconnect rather than retry; otherwise None after printing.
        """
        if error == 420:
            return False
        print(error)

In [5]:
# Twitter API credentials are read from the local settings module so that
# no secret appears in the notebook itself.
C_KEY, C_SECRET, A_TOKEN_KEY, A_TOKEN_SECRET = (
    settings.C_KEY,
    settings.C_SECRET,
    settings.A_TOKEN_KEY,
    settings.A_TOKEN_SECRET,
)

In [None]:
# Build the OAuth handler from the consumer credentials, attach the access
# token, and create an API client that uses this handler.
auth = tweepy.OAuthHandler(consumer_key=C_KEY, consumer_secret=C_SECRET)
auth.set_access_token(A_TOKEN_KEY, A_TOKEN_SECRET)
api = tweepy.API(auth)

In [None]:
# Phrases to follow on the stream.
TRACK_TERMS = [
    'southwest airlines',
    'SouthwestAir',
    'southwestairlines',
    'flying southwest',
    'fly southwest',
]

listener = MyListener(api=tweepy.API(wait_on_rate_limit=True))
myStream = tweepy.Stream(auth=auth, listener=listener)
# NOTE(review): with tweepy's default (synchronous) mode this call is expected
# to keep the cell running until the stream stops - confirm for the installed
# tweepy version.
myStream.filter(track=TRACK_TERMS)