In [1]:
import tweepy 
import csv

DEBUG = False

#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

In [2]:
def get_all_tweets(screen_name):
    #Twitter only allows access to a users most recent 3240 tweets with this method
    
    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    
    #initialize a list to hold all the tweepy Tweets
    alltweets = []    
    
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name = screen_name,count=200)
    
    #save most recent tweets
    alltweets.extend(new_tweets)
    
    user=api.get_user(screen_name=screen_name)
    
    
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        if DEBUG: print "getting tweets before %s" % (oldest)
        
        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = screen_name,
                                       count=200,max_id=oldest)
        
        #save most recent tweets
        alltweets.extend(new_tweets)
        
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        
        if DEBUG: print "...%s tweets downloaded so far" % (len(alltweets))
    
    #transform the tweepy tweets into a 2D array that will populate the csv    
    outtweets = [{'id_str' : tweet.id_str, 
                  'created_at' : tweet.created_at, 
                  'text' : tweet.text.encode("utf-8"),
                  'retweet_count' : tweet.retweet_count,
                  'screen_name' : screen_name}
                 
                 for tweet in alltweets]
    
    
    return outtweets
    



In [3]:
tweets = get_all_tweets("thechainsmokers")

In [4]:
tweets

[{'created_at': datetime.datetime(2017, 3, 6, 19, 54, 21),
  'id_str': u'838840194212098048',
  'retweet_count': 492,
  'screen_name': 'thechainsmokers',
  'text': "This is wild. We aren't worth of this but we are super grateful! Feeling extra motivated now! https://t.co/DAYPufbh1I thank you guys"},
 {'created_at': datetime.datetime(2017, 3, 6, 7, 52, 30),
  'id_str': u'838658534908702721',
  'retweet_count': 710,
  'screen_name': 'thechainsmokers',
  'text': 'Thank you @iHeartRadio for everything. That was so a really fun and special night!'},
 {'created_at': datetime.datetime(2017, 3, 6, 2, 59, 14),
  'id_str': u'838584730849280001',
  'retweet_count': 2942,
  'screen_name': 'thechainsmokers',
  'text': 'Really happy right now'},
 {'created_at': datetime.datetime(2017, 3, 6, 2, 56, 58),
  'id_str': u'838584161246064640',
  'retweet_count': 994,
  'screen_name': 'thechainsmokers',
  'text': 'RT @iHeartRadio: These three though \xf0\x9f\x99\x8c\xf0\x9f\x99\x8c\xf0\x9f\x99\x8c #iHeartAw

In [5]:
# original tweets, no RT
original_tweets = [t for t in tweets if t['text'][:2] != 'RT']

In [6]:
#tweet ids
ids = [tw["id_str"] for tw in original_tweets]

In [7]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [9]:
#tweet ids in non unicode
new_ids=[]
for i in ids:
    new_ids.append(i.encode('utf-8'))

In [111]:
# This pulls retweeters from tweet id of original tweet
import time
def id_sn_location(ids):
    """ids are equal to original tweet id"""
    RTS=[]
    for i in ids:
        try:
            RTS.append((i, api.get_status(i).retweets()))
        
            # Some tweepy api call, ex) api.get_user(screen_name = usrScreenName)
        except tweepy.TweepError as e:
            pass
        time.sleep(13)
        
        with open('user.csv','wb') as f:
            writer = csv.writer(f)
            retweet=[]
            for i, t in RTS:
                for rts in t:
                    retweet.append([i,
                                    rts.user.screen_name.encode('utf-8'),
                                    rts.user.location.encode('utf-8')])
            writer.writerow(['id','screen_name', 'location'])
            writer.writerows(retweet)

In [None]:
#id_sn_location(new_ids[300:1001])

In [11]:
import pandas as pd

In [18]:
#all influencers 
chainsmokers=pd.read_pickle('chainsmokers.pkl')
calvinharris=pd.read_pickle('calvin_harris.pkl')
diddy=pd.read_pickle('diddy.pkl')
djkhaled=pd.read_pickle('djkhaled.pkl')
marshmello=pd.read_pickle('marshmello.pkl')
martingarrix=pd.read_pickle('martingarrix.pkl')
chance=pd.read_pickle('chance.pkl')
jamescorden=pd.read_pickle('jamescorden.pkl')
ultra=pd.read_pickle('ultra.pkl')
kaskade=pd.read_pickle('kaskade.pkl')

In [112]:
#retweeters
kaskade.head()

Unnamed: 0,id,screen_name,location
0,836984179338272768,iamclaudia_,
1,836984179338272768,emilymxj,"Chicago, IL"
2,836984179338272768,LilEmpireMusic,LA
3,836984179338272768,adasoftwaer,Frankfurt am Main
4,836984179338272768,TheKingDustin,"Skokie, IL"


In [20]:
# This groupby allows me to go over all the user per id
kaskade_rts=kaskade.groupby(['id'], as_index=False).agg(lambda x: set(x))
kaskade_rts.head()

Unnamed: 0,id,screen_name,location
0,686981454748893184,"{lesleysmalls, leflower, aYoBrandon, bryanaeri...","{nan, nan, nan, Canterbury, Kent, phx, az, nan..."
1,686982572308578304,"{ricardo_avitia, lesleysmalls, xxxdanielcd, le...","{nan, California, USA, South Los Angeles, CA, ..."
2,686995727579496448,"{CaaatchMyDrift, MrPartyyyHardy, DECO2akaSEA, ...","{nan, nan, a foodie in VA/DC, 201 • 420, nan, ..."
3,687073144948011008,"{plsdre, misakimemii, fluffingkendall, samfeld...","{nan, A Galaxy Far, Far Away, nan, Trujillo, P..."
4,687306427782803456,"{funxjjh, DJVinnyVice, ajk_jp, samfeldt_, waiw...","{nan, nan, nan, Tokyo, Japan, Chiba, Japan, サボ..."


In [21]:
kaskade_rts2=kaskade_rts.drop(['location'], axis=1)
kaskade_rts3=kaskade_rts2['screen_name'].str.join(sep='*').str.get_dummies(sep='*')

In [22]:
kaskade_rts3.head()

Unnamed: 0,005nightrdire,04N1374,05rd17o,0718Gomez,07Adeakm,09210info,09moCza,0Alber0,0ImJP0,0ceanicskies,...,zp_pidoxpr,zshooter,zufars,zulixo,zulmastillwlhel,zurizadays,zyiarreee,zymappso,zyu_chan3020,zz6AcwyYmGT35wC
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
kaskade_dummies=pd.concat([kaskade_rts2, kaskade_rts3], axis=1)
kaskade_dummies.head()

Unnamed: 0,id,screen_name,005nightrdire,04N1374,05rd17o,0718Gomez,07Adeakm,09210info,09moCza,0Alber0,...,zp_pidoxpr,zshooter,zufars,zulixo,zulmastillwlhel,zurizadays,zyiarreee,zymappso,zyu_chan3020,zz6AcwyYmGT35wC
0,686981454748893184,"{lesleysmalls, leflower, aYoBrandon, bryanaeri...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,686982572308578304,"{ricardo_avitia, lesleysmalls, xxxdanielcd, le...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,686995727579496448,"{CaaatchMyDrift, MrPartyyyHardy, DECO2akaSEA, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,687073144948011008,"{plsdre, misakimemii, fluffingkendall, samfeld...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,687306427782803456,"{funxjjh, DJVinnyVice, ajk_jp, samfeldt_, waiw...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
kaskade_dummies.sum(axis=0).sort_values(ascending=False)

id                 5533345123103970518
Kaskade_MX                         299
FedeeeeOK                          236
BhuvaneshBp                        195
MomentOfChaos                      167
oceanicglider                      128
WhenInRome_18                       98
savageslitz                         83
ZhaoChuen                           75
misakimemii                         74
edmchic_nadz                        72
NoreenTranceCat                     65
SaurabhSwift                        55
funnybunny640                       51
pinguina_23                         49
hollymostdope                       46
nervo_colombia                      45
89623699167                         44
bigbootydoobie                      40
paulosj16                           39
_huertta                            37
Minichyna01                         36
mikey929198555                      35
justgema                            35
dabxrave                            34
SHSoni_v1                

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords
import re

In [29]:
#has tweets without http and RT
retweets=pd.read_pickle('retweets.pkl')

In [28]:
retweets.head()

Unnamed: 0,created_at,id_str,retweet_count,screen_name,text
0,2017-02-28 16:47:36,836618869989081090,4,Kaskade_MX,@fairchild_music: Something big coming soon. ...
1,2017-02-28 16:47:17,836618790490226688,12,Kaskade_MX,@Arkade: TITAN - @fairchild_music \n3.3.17
2,2017-02-28 16:47:04,836618735330930688,67,Kaskade_MX,@kaskade: In case you were working or somethi...
3,2017-02-25 06:31:54,835376757335343106,27,Kaskade_MX,@kaskade: Full on summer vibes here in Brazil...
4,2017-02-25 06:31:50,835376744005795840,14,Kaskade_MX,@Arkade: Risqué - @Thekissermusic \n💋OUT NOW...


In [30]:
Retweeters=[]
for i in retweets['text']:
    Retweeters.append(i)
Retweeters

[" @fairchild_music: Something big coming soon. Keep an eye on @kaskade 's @Arkade this week. ",
 ' @Arkade: TITAN - @fairchild_music \n3.3.17 ',
 ' @kaskade: In case you were working or something today (I\xe2\x80\x99m sorry about u having to work), \nGet to  ASAP\n#SpringFl\xe2\x80\xa6',
 ' @kaskade: Full on summer vibes here in Brazil. Sun is down and a perfect 77 degrees here in Floripa. Living that life\xe2\x80\xa6  #SummerNights',
 ' @Arkade: Risqu\xc3\xa9 - @Thekissermusic \n\xf0\x9f\x92\x8bOUT NOW\xf0\x9f\x92\x8b\n ',
 ' @ShopKaskade: You guys feel that? Spring is right around the corner. Shop accordingly.\n\nCC: @kaskade ',
 ' @kaskade: Friday vibez. \n#KaskadePointsAtStuff ',
 ' @felixcartal: still fakin it @kaskade ',
 ' @kaskade: It\xe2\x80\x99s all good @marshmellomusic we should start a club.\n\nWe could invite Daft Punk too. \xf0\x9f\xa4\x96 ',
 " @Kaskade411: Hey, @Hefty if you like it, you should put a @Kaskade ring on it. I mean, if @JohnCena isn't scared of a little c

In [31]:
RT_txt=','.join(Retweeters)
RT_txt



In [64]:
stopset = set(stopwords.words('english'))
stopset.update(stopwords.words('spanish'))
stopset.update(['love', 'like', 'new', 'amp', 'get', 'one', 'see', 'si', 'today', 'tonight',
               'go','want','good', 'back', 'thank', 'us', 'day','people','make',
                'much', 'know','nan'])

In [135]:
vectorizer=TfidfVectorizer(stop_words=stopset,use_idf=True, ngram_range=(1,3))
X=vectorizer.fit_transform([RT_txt])

In [136]:
lsa=TruncatedSVD(n_components=1, n_iter=100, random_state=42)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=1, n_iter=100,
       random_state=42, tol=0.0)

In [137]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:20]
    print("Kaskade %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Kaskade 0:
kaskade
music
arkade
set
kaskade kaskade
cidmusic
felixcartal
redux
time
kaskadela
let
year
night
need
guys
ultra
show
song
way
remix
 


In [68]:
locations=[]
for i in kaskade['location']:
    locations.append(i)
locations

[nan,
 'Chicago, IL',
 'LA',
 'Frankfurt am Main',
 'Skokie, IL',
 'Seattle, WA',
 nan,
 nan,
 'New York',
 'Miami Beach, Florida',
 'New York',
 'Kaskade Land',
 'Honolulu, HI',
 'Toronto, Canada',
 'Great state of Texas',
 'Chicago',
 'Chicago',
 'Let the music speak.',
 'Chicago, IL',
 'New York',
 'New York',
 'Las Vegas, NV',
 'still at work, tx',
 nan,
 'New York',
 'New York',
 'Brasil',
 nan,
 'still at work, tx',
 'Senoia, GA',
 'Las Vegas, NV',
 'New York',
 'New York',
 nan,
 'Chicago, IL',
 'Michigan',
 '94112 & 95111',
 'under the electric sky ',
 'Dallas Texas, USA',
 'EPTX\xe2\x9c\x88\xef\xb8\x8fSATX\xe2\x9c\x88\xef\xb8\x8f',
 'Dallas',
 'San Diego',
 'Wyoming',
 'Phoenix, Arizona',
 'Maine',
 'Massachusetts',
 'New Jersey',
 nan,
 'Delaware',
 'Minnesota',
 nan,
 'Franklin, Tennessee, U.S.',
 'San Diego, CA',
 'Follow Nation',
 'under the electric sky ',
 'Maine',
 'Miami Beach, Florida',
 'Massachusetts',
 'New Jersey',
 'Delaware',
 nan,
 'Minnesota',
 'Illinois, USA'

In [77]:
locations2=(map(str,locations))

In [78]:
locations2

['nan',
 'Chicago, IL',
 'LA',
 'Frankfurt am Main',
 'Skokie, IL',
 'Seattle, WA',
 'nan',
 'nan',
 'New York',
 'Miami Beach, Florida',
 'New York',
 'Kaskade Land',
 'Honolulu, HI',
 'Toronto, Canada',
 'Great state of Texas',
 'Chicago',
 'Chicago',
 'Let the music speak.',
 'Chicago, IL',
 'New York',
 'New York',
 'Las Vegas, NV',
 'still at work, tx',
 'nan',
 'New York',
 'New York',
 'Brasil',
 'nan',
 'still at work, tx',
 'Senoia, GA',
 'Las Vegas, NV',
 'New York',
 'New York',
 'nan',
 'Chicago, IL',
 'Michigan',
 '94112 & 95111',
 'under the electric sky ',
 'Dallas Texas, USA',
 'EPTX\xe2\x9c\x88\xef\xb8\x8fSATX\xe2\x9c\x88\xef\xb8\x8f',
 'Dallas',
 'San Diego',
 'Wyoming',
 'Phoenix, Arizona',
 'Maine',
 'Massachusetts',
 'New Jersey',
 'nan',
 'Delaware',
 'Minnesota',
 'nan',
 'Franklin, Tennessee, U.S.',
 'San Diego, CA',
 'Follow Nation',
 'under the electric sky ',
 'Maine',
 'Miami Beach, Florida',
 'Massachusetts',
 'New Jersey',
 'Delaware',
 'nan',
 'Minnesota'

In [87]:
locations3=','.join(locations2)

In [106]:
loc_vec=TfidfVectorizer(stop_words=stopset,use_idf=True, ngram_range=(1,3))
loc_X=loc_vec.fit_transform(locations2)

In [107]:
loc_X.shape

(36133, 11472)

In [108]:
loc_lsa=TruncatedSVD(n_components=10, n_iter=100, random_state=42)
loc_lsa.fit(loc_X)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=100,
       random_state=42, tol=0.0)

In [110]:
terms2 = loc_vec.get_feature_names()
for i, comp in enumerate(loc_lsa.components_): 
    termsInComp = zip (terms2,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
york
york ny
ny
york city
city
york york
york usa
usa
angeles
brooklyn
 
Concept 1:
california
california usa
usa
angeles
southern california
angeles california
southern
angeles ca
ca
san
 
Concept 2:
washington
washington dc
dc
foodie
foodie va
foodie va dc
va dc
seattle washington
va
seattle
 
Concept 3:
texas
dallas
dallas texas
austin texas
austin
houston
houston texas
texas usa
usa
tx
 
Concept 4:
angeles
angeles ca
ca
angeles california
san
anaheim ca
anaheim
york angeles
diego ca
san diego ca
 
Concept 5:
ny
york ny
angeles ca
angeles
ca
brooklyn ny
york ny usa
ny usa
brooklyn
manhattan ny
 
Concept 6:
florida
miami
miami florida
usa
south
florida usa
south florida
tampa florida
tampa
orlando florida
 
Concept 7:
jersey
usa
jersey usa
ohio
york jersey
california usa
northern jersey
arizona
northern
virginia
 
Concept 8:
ohio
columbus ohio
columbus
cleveland ohio
cleveland
cincinnati ohio
cincinnati
usa
ohio usa
dayton ohio
 
Concept 9:
city
york city
ny
york ny
kansas

In [133]:
kaskade.groupby(['id', 'screen_name', 'location']).size()

id                  screen_name      location                  
686981454748893184  Chikita323       Los Angeles CA                1
                    Oskeylp22        (213)                         1
                    ReyJayy          phx, az                       1
                    aYoBrandon       Los Angeles, CA               1
                    bryanaerin_      Los Angeles, CA               1
                    charleysuxx      California, USA               1
                    geoxperez        Riverside/East LA             1
                    leflower         California, USA               1
                    shackcanterbury  Canterbury, Kent              1
                    thatsmejesus     Glassell Park, Los Angeles    1
686982572308578304  Chikita323       Los Angeles CA                1
                    MarvinB_Street   Northridge, CA                1
                    ReyJayy          phx, az                       1
                    _Charlie5        So