In [None]:
from pymongo import MongoClient
# Client for the MongoDB server at localhost:27017; later cells derive `db` from it.
client = MongoClient('localhost', 27017)

In [None]:
import nltk
import re
import tweepy
import csv
import json
import settings

In [None]:
# Handle to the database named `local`; the stream listener inserts into its
# `testTweets` collection.
db = client.local

In [None]:
# English stopword list taken from NLTK's stopwords corpus.
stopwords = nltk.corpus.stopwords.words("english")
# Tokens that TweetCleanser.remove_stopwords keeps even if they appear in `stopwords`.
whitelist = ["n't", "not"]

In [None]:
class TweetCleanser:
    """Stateless text-cleaning helpers for raw tweet text.

    The ``text`` methods each take a string and return a new string;
    ``remove_stopwords`` operates on a list of tokens.
    """

    def to_ascii(self, text):
        """Return ``text`` with every non-ASCII character dropped."""
        return text.encode('ascii', 'ignore').decode('ascii')

    def remove_urls(self, text):
        """Strip ``http://`` / ``https://`` URLs and one trailing whitespace character."""
        # `https?` instead of the earlier `http.?`: the unescaped dot accepted
        # any character after "http" (e.g. "httpx://"), not just an optional "s".
        return re.sub(r"https?://[^\s]+[\s]?", '', text)

    def remove_special_characters(self, text):
        """Replace every character that is not a digit, ASCII letter or apostrophe with a space."""
        return re.sub("[^0-9A-Za-z']", ' ', text)

    def remove_usernames(self, text):
        """Strip ``@handle`` mentions and one trailing whitespace character.

        Relies on the literal '@', so it must run before
        ``remove_special_characters`` blanks that character out.
        """
        return re.sub(r"@[^\s]+[\s]?", '', text)

    def remove_numbers(self, text):
        """Strip digit runs (with optional decimal part) and one leading whitespace character."""
        return re.sub(r"\s?[0-9]+\.?[0-9]*", '', text)

    def remove_stopwords(self, tokens):
        """Remove stopwords from ``tokens``, keeping anything listed in ``whitelist``.

        Args:
            tokens: list of string tokens; it is filtered in place.

        Returns:
            The same ``tokens`` list object, after filtering.
        """
        blocked = set(stopwords).difference(whitelist)
        # Build the filtered result first and assign it back in one step.
        # Calling ``tokens.remove`` while iterating ``tokens`` shifts the
        # remaining items left, so the element after each removed stopword was
        # skipped and consecutive stopwords survived.
        tokens[:] = [token for token in tokens if token not in blocked]
        return tokens

In [None]:
class TweetStemToken:
    """Thin wrappers around NLTK's Porter stemmer and word tokenizer."""

    def stem(self, text):
        """Lower-case ``text`` and return what a fresh ``PorterStemmer`` yields for it.

        The argument is passed to the stemmer as a single unit, so callers
        that want per-word stems should call this once per token.
        """
        return nltk.PorterStemmer().stem(text.lower())

    def tokenize(self, text):
        """Tokenize ``text`` with ``nltk.word_tokenize`` and return the tokens as a new list."""
        return list(nltk.word_tokenize(text))

In [None]:
class MyListener(tweepy.StreamListener):
    """Stream listener that stores original, place-tagged tweets in MongoDB.

    Each accepted tweet is cleaned, tokenized and stemmed, then inserted into
    ``db.testTweets`` together with its place metadata.
    """

    def _preprocess(self, text):
        """Clean ``text`` and return the list of stemmed tokens."""
        cleanser = TweetCleanser()
        stemmer = TweetStemToken()
        cleaned = cleanser.to_ascii(text)
        cleaned = cleanser.remove_urls(cleaned)
        # Usernames have to go before special characters are blanked out:
        # once '@' has been replaced by a space the mention pattern can no
        # longer match and the handle text would stay in the tweet.
        cleaned = cleanser.remove_usernames(cleaned)
        cleaned = cleanser.remove_special_characters(cleaned)
        cleaned = cleanser.remove_numbers(cleaned)
        # Tokenize first, then stem token by token. Stemming the full string
        # hands the entire tweet to the stemmer as if it were one word.
        # NOTE(review): TweetCleanser.remove_stopwords is never applied in
        # this pipeline -- confirm whether stopword filtering was intended.
        return [stemmer.stem(token) for token in stemmer.tokenize(cleaned)]

    def on_data(self, data):
        """Handle one raw stream message.

        Args:
            data: JSON string delivered by the stream.

        Messages without text or place data, retweets, and tweets authored by
        ``SouthwestAir`` are skipped. Any exception raised while handling a
        message is printed and swallowed so one bad message does not escape
        the listener.
        """
        try:
            tweet = json.loads(data)

            # Read optional fields with ``.get`` so messages that lack them
            # are skipped quietly instead of being reported as errors.
            text = tweet.get('text')
            place = tweet.get('place')
            # Filter out tweets without geo-tags, retweets and tweets from SouthwestAir
            if not text or not place:
                return
            if tweet.get('retweeted') or 'RT @' in text:
                return
            if (tweet.get('user') or {}).get('screen_name') == 'SouthwestAir':
                return

            processed_words = self._preprocess(text)

            db.testTweets.insert_one(
                {
                    'id_str': tweet['id_str'],
                    'text': text,
                    'place_type': place['place_type'],
                    'name': place['name'],
                    'full_name': place['full_name'],
                    'country_code': place['country_code'],
                    'country': place['country'],
                    'coordinates': place['bounding_box']['coordinates'],
                    'processed_words': processed_words,
                }
            )

        except Exception as e:
            # Best effort: report and carry on.
            print("error: " + str(e))

    def on_error(self, error):
        """Handle a stream error code.

        Returns ``False`` for status 420 (per the tweepy streaming guide this
        is the rate-limit status and ``False`` disconnects the stream -- verify
        against the installed tweepy version); any other code is printed.
        """
        if error == 420:
            return False
        print(error)

In [None]:
# Credentials are read from the local `settings` module rather than hard-coded
# here; the next cell hands them to tweepy's OAuth handler.
# presumably C_* is the consumer key/secret pair and A_TOKEN_* the access token pair -- TODO confirm
C_KEY = settings.C_KEY
C_SECRET = settings.C_SECRET
A_TOKEN_KEY = settings.A_TOKEN_KEY
A_TOKEN_SECRET = settings.A_TOKEN_SECRET

In [None]:
# Build the OAuth handler from the C_* pair, attach the access token, and
# create an API client bound to that handler.
# NOTE(review): `api` is not referenced by the streaming cell below, which
# constructs its own `tweepy.API` -- confirm whether this one is still needed.
auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN_KEY, A_TOKEN_SECRET)
api = tweepy.API(auth)

In [None]:
# Start streaming: MyListener handles each message, and `filter` tracks the
# Southwest-related terms listed below.
# NOTE(review): the API object passed to the listener is created without
# `auth`, unlike `api` defined in the previous cell -- confirm that is intended.
# presumably `filter` blocks this cell until the stream disconnects -- verify against tweepy docs
listener = MyListener(api=tweepy.API(wait_on_rate_limit=True))
myStream = tweepy.Stream(auth=auth, listener=listener)
myStream.filter(track=['southwest airlines', 'SouthwestAir', 'southwestairlines', 'flying southwest', 'fly southwest'])