In [29]:
%autosave 45
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tweepy
import spacy
import csv
import json
import pandas as pd
import numpy as np
import os
from spacy.lang.en import English

Autosaving every 45 seconds


In [31]:
# Twitter API credentials
credentialsPath = r'..\0_data\credentials'
with open(os.path.join(credentialsPath, 'twitter_credentials.json')) as cred_data:
    info = json.load(cred_data)
    consumer_key = info['CONSUMER_KEY']
    consumer_secret = info['CONSUMER_SECRET']
    access_key = info['ACCESS_KEY']
    access_secret = info['ACCESS_SECRET']

# Create the api endpoint

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth)

In [39]:
# Convert to data frame
def toDataFrame(tweets):
    DataSet = pd.DataFrame()
    
    # Get tweet information
    tweetIDs = []
    for tweet in tweets:
        tweetIDs.append(tweet.id)              
    DataSet['Tweet ID'] = [ID for ID in tweetIDs]
        
    tweetsText = []
    for tweet in tweets:
            if 'retweeted_status' in  dir(tweet):
                tweetsText.append(tweet.retweeted_status.full_text.encode('utf-8'))                
            else:
                tweetsText.append(tweet.full_text.encode('utf-8'))  
    DataSet['Text'] = [text for text in tweetsText]
    
    # Get user information
    DataSet['User'] = [tweet.user.name.encode('utf-8') for tweet in tweets]
    DataSet['Screen Name'] = [tweet.user.screen_name.encode('utf-8') for tweet in tweets]
    
    DataSet['User ID'] = [tweet.user.id for tweet in tweets]
    
    # Get media
    tweetsImages = []
    for tweet in tweets:
        if 'media' in tweet.entities:
            for image in tweet.entities['media']:
                tweetsImages.append(image['media_url'])
        else:
            tweetsImages.append('')  
    DataSet['Image Urls'] = [image for image in tweetsImages]
          
    # Get location    
    tweetsLongitudes = []
    for tweet in tweets:
        if tweet.coordinates is not None:
            tweetsLongitudes.append(tweet.coordinates["coordinates"][0])
        else:
            tweetsLongitudes.append('')
    DataSet['Longitude'] = [longitude for longitude in tweetsLongitudes]
    
    tweetsLatitudes = []
    for tweet in tweets:
        if tweet.coordinates is not None:
            tweetsLatitudes.append(tweet.coordinates["coordinates"][1])
        else:
            tweetsLatitudes.append('')
    DataSet['Latitude'] = [latitude for latitude in tweetsLatitudes]
    
    locations = []
    for tweet in tweets:
        if tweet.user.location is not None:
            locations.append(tweet.user.location.encode('utf-8'))
        else:
            locations.append(np.nan)            
    DataSet['Location'] = [loc for loc in locations]
    
    # Get other fields  
    tweetsPosted = []
    for tweet in tweets:
        tweetsPosted.append(tweet.created_at)
    DataSet['Created'] = [created for created in tweetsPosted]
    
    return DataSet

# Specify the term you want to scrape
mention = input('Enter the term you want to scrape- ')

results = []

for tweet_info in tweepy.Cursor(api.search, q=mention,
                           tweet_mode='extended').items():
    results.append(tweet_info)

dataframe = toDataFrame(results)
dataframe['mentions'] = np.nan

Enter the term you want to scrape- sfmta_muni


In [40]:
# Load the spacy library and English corpus
nlp = spacy.load('en_core_web_sm')

nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x1fd1b528860>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1fd00b03db0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1fd00b03d58>)]

In [41]:
from collections import defaultdict

# Instantiate cleaned data dictionary
cleaned_data = defaultdict(list)

# Clean the text data
for row in dataframe.itertuples():
    tweetid = row[1]
    text = row[2].decode('utf-8').encode('ascii', 'ignore').decode('ascii').strip()
    user = row[3].decode('utf-8').encode('ascii', 'ignore').decode('ascii').strip()
    screen_name = row[4].decode('utf-8').encode('ascii', 'ignore').decode('ascii').strip()
    userid = row[5]
    imageurl = row[6]
    longitude = row[7]
    latitude = row[8]
    location = row[9]
    created = row[10]
    mentions = row[11]
    
    text_doc = nlp.make_doc(text)
    tokens = [token for token in text_doc]
    tokens = [token for token in tokens if not token.is_punct]
    tokens = [token for token in tokens if token.text != ' ']
    mentions = [token.text for token in tokens if token.text[0:1] == "@"]
    mentions = [e[1:] for e in mentions]
    tokens_final = [token.text for token in tokens]
    
    text = " ".join(tokens_final).strip()
    text = text.replace('\n', '')
    
    cleaned_data['Tweet ID'].append(tweetid)
    cleaned_data['Text'].append(text)
    cleaned_data['User'].append(user)
    cleaned_data['Screen Name'].append(screen_name)
    cleaned_data['User ID'].append(userid)
    cleaned_data['Image'].append(imageurl)
    cleaned_data['Longitude'].append(longitude)
    cleaned_data['Latitude'].append(latitude)
    cleaned_data['Location'].append(location)
    cleaned_data['Created'].append(created)
    cleaned_data['Mentions'].append(mentions)

cleaned_dataframe = pd.DataFrame(cleaned_data)

In [45]:
cleaned_dataframe.tail(15)

Unnamed: 0,Tweet ID,Text,User,Screen Name,User ID,Image,Longitude,Latitude,Location,Created,Mentions
4047,1120327139725733889,@StanleyRoberts @sfmta_muni The real reason Ca...,BarlowD,Bertie799,845663028372631552,,,,b'',2019-04-22 14:02:59,"[StanleyRoberts, sfmta_muni]"
4048,1120324793897668608,@sfmta_muni Translation We ai nt got enough dr...,Stanley Roberts,StanleyRoberts,90895400,,,,b'N. 33.4500 W.112.0667 \xf0\x9f\x8c\xb5\xf0\x...,2019-04-22 13:53:40,[sfmta_muni]
4049,1120324261699366914,HeadsUp Due to operator availability issues th...,SFMTA,sfmta_muni,109702390,,,,"b'San Francisco, CA'",2019-04-22 13:51:33,[]
4050,1120324206984515584,@sfmta_muni @EdReiskin @BrinkmanCheryl @London...,LOCO SF,LOCOSF2014,1408575710,,,,b'San Francisco',2019-04-22 13:51:20,"[sfmta_muni, EdReiskin, BrinkmanCheryl, London..."
4051,1120322437151973376,HeadsUp Due to operator availability issues th...,SFMTA,sfmta_muni,109702390,,,,"b'San Francisco, CA'",2019-04-22 13:44:18,[]
4052,1120321335052210177,On this Earth Day I would love free transit in...,Claudia Preciado,Claudia_Pres,337221203,,,,"b'Oakland, CA'",2019-04-22 13:39:55,"[SFBART, sfmta_muni, SFBayFerry, rideact, Calt..."
4053,1120312896951447554,@KimNews3LV that woman who got her hand caught...,Cyril Hall,Cyril_Hall_,305284179,,,,b'United States',2019-04-22 13:06:23,"[KimNews3LV, SFBART, sfmta_muni]"
4054,1120311454635311104,Thanks to all those who joined @SFPD @sfmta_mu...,Vision Zero SF,VisionZeroSF,2457485305,http://pbs.twimg.com/media/D4wlVJpWkAIyZXn.jpg,,,b'San Francisco',2019-04-22 13:00:39,"[SFPD, sfmta_muni, SandraLeeFewer]"
4055,1120288661675888641,Just saw two drunk men keep harassing young wo...,Cherry I. Marley,erupardur,1108098870960107520,,,,b'',2019-04-22 11:30:05,"[sfmta_muni, DionLimTV, Hoodline]"
4056,1120279347221262339,See if you qualify to be a Muni Driver https:/...,ibrahim noman,yemen97,607127523,,,,b'\xd9\x85\xd8\xba\xd8\xaa\xd8\xb1\xd8\xa8 \xd...,2019-04-22 10:53:04,[]


In [None]:
outputPath = r'..\0_data\manual'
filePath = os.path.join(outputPath,'tweets_with_' + 'mention_' + mention + '.csv')
if not os.path.isfile(filePath):
    cleaned_dataframe.to_csv(filePath, index=False)
else:
    with open(filePath, 'a') as file:
        cleaned_dataframe.to_csv(file, index = False)
print ('Extracted ' + str(len(results)) 
    + ' tweets with ' + mention)