### The purpose of this file is to collect 150 tweets for our training data set. After collecting the tweets, we export them to a CSV file so that each one can be manually labeled as positive, negative, or neutral. We then load the labeled data back into MongoDB so that it can be run through our classifier.

In [1]:
from pymongo import MongoClient
# Connect to the MongoDB server running on this machine, port 27017.
client = MongoClient(host='localhost', port=27017)

In [5]:
import tweepy
import json
import csv
import pandas as pd
import TweetCleanser
import TweetStemToken
import settings
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Work in the database named "local" on the connected server.
db = client['local']

In [22]:
class MyListener(tweepy.StreamListener):
    """Stream listener that saves original tweets to ``db.trainTweets``.

    Retweets and tweets posted by the SouthwestAir account are skipped.
    Once ``MAX_TWEETS`` tweets are stored, ``on_data`` returns False so that
    the stream is stopped.
    """

    # Number of tweets wanted for the training set.
    MAX_TWEETS = 150

    def on_data(self, data):
        """Handle one raw stream message.

        Args:
            data: JSON text of a single message received from the stream.

        Returns:
            False when the collection already holds ``MAX_TWEETS`` tweets
            (signals the stream to stop); None otherwise.
        """
        try:
            tweet = json.loads(data)

            # Filter out retweets and tweets from SouthwestAir
            is_retweet = tweet['retweeted'] or 'RT @' in tweet['text']
            if is_retweet or tweet['user']['screen_name'] == 'SouthwestAir':
                return None

            # Stop as soon as the cap is reached. The comparison is ">=" so
            # that exactly MAX_TWEETS tweets are stored, not MAX_TWEETS + 1.
            # NOTE(review): Cursor.count() is deprecated in newer PyMongo
            # releases in favour of count_documents({}) -- confirm the
            # installed version before switching.
            if db.trainTweets.find().count() >= self.MAX_TWEETS:
                return False

            db.trainTweets.insert_one(
                {
                    'id_str': tweet['id_str'],
                    'text': tweet['text'],
                }
            )
        except Exception as e:
            # Best effort: messages that are not tweets (missing keys) or
            # database hiccups are reported and the stream keeps running.
            print("error: " + str(e))

    def on_error(self, error):
        """Handle an error status code reported by the stream.

        Returns False for status 420 so the stream is stopped instead of
        reconnecting; any other status is printed and None is returned.
        """
        if error == 420:
            return False
        print(error)

In [23]:
# Twitter API credentials are read from the project's settings module:
# first the consumer (application) pair, then the access-token pair.
C_KEY, C_SECRET = settings.C_KEY, settings.C_SECRET
A_TOKEN_KEY, A_TOKEN_SECRET = settings.A_TOKEN_KEY, settings.A_TOKEN_SECRET

In [24]:
# Build the OAuth handler from the consumer credentials, attach the access
# token to it, and create an API client that uses that handler.
auth = tweepy.OAuthHandler(consumer_key=C_KEY, consumer_secret=C_SECRET)
auth.set_access_token(key=A_TOKEN_KEY, secret=A_TOKEN_SECRET)
api = tweepy.API(auth)

In [25]:
# Phrases passed to the stream filter as the ``track`` list.
track_terms = [
    'southwest airlines',
    'SouthwestAir',
    'southwestairlines',
    'flying southwest',
    'fly southwest',
]

# Start streaming; MyListener decides what to store and when to stop.
listener = MyListener(api=tweepy.API(wait_on_rate_limit=True))
myStream = tweepy.Stream(auth, listener)
myStream.filter(track=track_terms)

In [28]:
import os
import sys


def _open_csv(path, mode):
    """Open ``path`` the way the csv module expects on this Python version.

    Python 3 wants a text file opened with newline=''; Python 2 wants a
    binary file.
    """
    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, mode + 'b')


csv_path = 'trainTweets.csv'

# Ids already in the CSV (first column of every row). A missing file simply
# means nothing has been exported yet.
exported_ids = set()
if os.path.exists(csv_path):
    with _open_csv(csv_path, 'r') as inFile:
        exported_ids = {row[0] for row in csv.reader(inFile) if row}

# Append rather than overwrite: write mode would truncate the file and erase
# the rows (and any manual labels) that the duplicate check above relies on.
with _open_csv(csv_path, 'a') as outFile:
    writer = csv.writer(outFile)
    for tweet in db.trainTweets.find():
        if tweet['id_str'] in exported_ids:
            continue
        # Non-ASCII characters are dropped from the text before writing.
        writer.writerow([tweet['id_str'], tweet['text'].encode('ascii', 'ignore').decode('ascii')])
        # Remember the id so a tweet stored twice is only exported once.
        exported_ids.add(tweet['id_str'])

In [4]:
# Load the manually labeled tweets. id_str is read as text so it is stored in
# the same form as in db.trainTweets; left to type inference, pandas would
# parse the all-digit ids as numbers.
train = pd.read_csv('trainTweets.csv', header='infer', dtype={'id_str': str})

# The preprocessing helpers are created once and reused for every row.
cleanser = TweetCleanser.TweetCleanser()
stemmer = TweetStemToken.TweetStemToken()

for index, tweet in train.iterrows():

    # Preprocess tweet: each step consumes the output of the previous one.
    text = tweet['text']
    processed_words = cleanser.to_ascii(text)
    processed_words = cleanser.remove_urls(processed_words)
    processed_words = cleanser.remove_special_characters(processed_words)
    processed_words = cleanser.remove_usernames(processed_words)
    processed_words = cleanser.remove_numbers(processed_words)
    processed_words = stemmer.stem(processed_words)
    processed_words = stemmer.tokenize(processed_words)
    processed_words = cleanser.remove_stopwords(processed_words)

    # Store the original text next to its processed form and manual label.
    db.trainData.insert_one(
        {
            'id_str': tweet['id_str'],
            'text': text,
            'processed_words': processed_words,
            'category': tweet['category'],
        }
    )