In [4]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tweepy
import spacy
import csv
import json
import pandas as pd, numpy as np
import os

In [5]:
# Twitter API credentials
# The JSON file is expected to provide the four keys read below:
# CONSUMER_KEY, CONSUMER_SECRET, ACCESS_KEY and ACCESS_SECRET.
# The directory is assembled from its components so that the relative path
# resolves on every OS (a literal backslash path only works on Windows).
credentialsPath = os.path.join('..', '0_data', 'credentials')
with open(os.path.join(credentialsPath, 'twitter_credentials.json')) as cred_data:
    info = json.load(cred_data)

consumer_key = info['CONSUMER_KEY']
consumer_secret = info['CONSUMER_SECRET']
access_key = info['ACCESS_KEY']
access_secret = info['ACCESS_SECRET']

# Create the api endpoint
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Attach the access token pair as well; previously these two values were
# loaded from the credentials file but never handed to the auth handler.
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [6]:
# Convert to data frame and capture each column feature in an array
def toDataFrame(tweets):
    """Flatten an iterable of tweet objects into a DataFrame, one row per tweet.

    Each tweet is read through these attributes: ``full_text``, ``user``
    (``id``, ``name``, ``location``), ``entities``, ``coordinates``,
    ``created_at``, ``id`` and, when present, ``retweeted_status.full_text``.

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing the attributes listed above. The iterable is
        traversed exactly once.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order:

        - ``Text``: UTF-8 encoded bytes of the tweet text; for a retweet the
          text of the retweeted status is used instead.
        - ``User``: UTF-8 encoded bytes of the author's display name.
        - ``UserID``: the author's id.
        - ``Image Urls``: the ``media_url`` of the attached media. Several
          media items are joined with a single space so that every tweet
          still occupies exactly one row. NaN when there is no media.
        - ``Longitude`` / ``Latitude``: elements 0 and 1 of
          ``coordinates["coordinates"]``, NaN when ``coordinates`` is None.
        - ``Created``: the tweet's ``created_at`` value.
        - ``tweetID``: the tweet's id.
        - ``location``: the author's ``location``, NaN when it is None.
    """
    columns = ['Text', 'User', 'UserID', 'Image Urls', 'Longitude',
               'Latitude', 'Created', 'tweetID', 'location']

    records = []
    for tweet in tweets:
        # For a retweet, take the text from the retweeted status.
        if hasattr(tweet, 'retweeted_status'):
            text = tweet.retweeted_status.full_text
        else:
            text = tweet.full_text

        # Collapse all media URLs of a tweet into ONE cell. Appending one
        # entry per media item (or none for an empty media list) would make
        # this column's length differ from the number of tweets and break
        # the frame construction.
        media_items = tweet.entities['media'] if 'media' in tweet.entities else []
        urls = [item['media_url'] for item in media_items]
        image_urls = ' '.join(urls) if urls else np.nan

        # The coordinate pair is indexed as [longitude, latitude].
        if tweet.coordinates is not None:
            longitude = tweet.coordinates["coordinates"][0]
            latitude = tweet.coordinates["coordinates"][1]
        else:
            longitude = latitude = np.nan

        location = tweet.user.location
        if location is None:
            location = np.nan

        records.append({
            'Text': text.encode('utf-8'),
            'User': tweet.user.name.encode('utf-8'),
            'UserID': tweet.user.id,
            'Image Urls': image_urls,
            'Longitude': longitude,
            'Latitude': latitude,
            'Created': tweet.created_at,
            'tweetID': tweet.id,
            'location': location,
        })

    # Passing `columns` pins the column order and keeps the headers even
    # when `tweets` is empty.
    return pd.DataFrame(records, columns=columns)

# Upper bound on how many tweets the search should return.
maximum_number_of_tweets_to_be_extracted = int(
    input('Enter the number of tweets that you want to extract- ')
)

# Prefix that selects the kind of search: '#' for a hashtag, '@' for a mention.
#   - A mention search matches exactly that mention, ignoring case.
#   - The '#' is optional and hashtags ignore case as well, so
#     datascience, Datascience, #datascience and #Datascience all return
#     tweets about data science.
tag_choice = input('Enter which you are searching for, a hashtag (#), or a mention (@)- ')

# The term to look for (typed without the prefix chosen above).
mention = input('Enter the term you want to scrape- ')

# Page through the search results, stopping at the requested maximum, and
# gather the returned tweet objects in a list.
search_cursor = tweepy.Cursor(
    api.search,
    q=str(tag_choice) + mention,
    tweet_mode='extended',
)
results = list(search_cursor.items(maximum_number_of_tweets_to_be_extracted))

data = toDataFrame(results)

Enter the number of tweets that you want to extract- 100
Enter which you are searching for, a hashtag (#), or a mention (@)- @
Enter the term you want to scrape- sfmta


In [7]:
def _to_ascii(raw):
    """Decode UTF-8 bytes, drop every non-ASCII character and strip outer whitespace."""
    return raw.decode('utf-8').encode('ascii', 'ignore').decode('ascii').strip()


# Clean the text data.
# Columns are addressed by name rather than by tuple position, so the cleaning
# does not silently pick up the wrong field if the column order ever changes.
# Line breaks inside a tweet become a single space; removing them outright
# would glue the last word of one line to the first word of the next.
clean_texts = [_to_ascii(raw).replace('\n', ' ') for raw in data['Text']]
clean_users = [_to_ascii(raw) for raw in data['User']]

clean_df = pd.DataFrame({'Clean Tweet': clean_texts, 'Clean Username': clean_users,
                         'User ID': data['UserID'],
                         'Location': data['location'],
                         'Image URLs': data['Image Urls'],
                         'Latitude': data['Latitude'],
                         'Longitude': data['Longitude']})

# Drop rows whose username is empty or just '@' after the ASCII conversion,
# i.e. names that consisted solely of non-ASCII characters. Only the username
# is tested here; the tweet text itself is not used for filtering.
indexNames = clean_df[clean_df['Clean Username'].isin(['', '@'])].index

clean_df = clean_df.drop(indexNames)

clean_df.head()

Unnamed: 0,Clean Tweet,Clean Username,User ID,Location,Image URLs,Latitude,Longitude
0,About the only good thing about @SFMTA all nig...,Andrea Mallis,97576864,"Berkeley, California",,,
1,About the only good thing about @SFMTA all nig...,Dennis O'Donnell,191325456,,http://pbs.twimg.com/ext_tw_video_thumb/110826...,,
2,@udderlydelight @SFMTA ikr!? over here in the ...,yuna yuna,238625955,"San Francisco, CA",,,
3,@yuna_yuna_____ @SFMTA Its ridiculous!!!!!!,japanese barbeque fingers,485270552,,,,
4,@udderlydelight @SFMTA muni being sus as usual,yuna yuna,238625955,"San Francisco, CA",,,


In [8]:
# Build the output location from its components so that the relative path
# resolves on every OS (a literal backslash path only works on Windows).
outputPath = os.path.join('..', '0_data', 'manual')
filePath = os.path.join(outputPath, 'tweets_with_mention_' + mention + '.csv')

# Create the file on the first run, append on later runs. The header row is
# written only when the file does not exist yet; repeating it on every append
# would scatter header lines through the data. Handing the path to pandas
# (instead of a text-mode file handle) also lets it manage line endings, which
# avoids blank rows between records on Windows.
file_exists = os.path.isfile(filePath)
clean_df.to_csv(filePath, mode='a', header=not file_exists, index=False)