# musoW Twitter Pipeline

## Imports

In [2]:
path = '../'
import os

import pandas as pd
#import custom functions
from PYTHON_FILES.LogReg_Searches import LogRegSearches

## Variables

In [3]:
# Training sets for the two classifiers, read from the pickles stored under
# LOGREG_RELEVANCE/TRAINING_SETS (relative to `path`).

# resource descriptions, v2: musow+mji descriptions vs summarized scrapes from twitter searches
archive_desc_training_v2 = pd.read_pickle(
    f'{path}LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl'
)

# tweets, v1: tweets from bigrams vs tweets for digital humanities and music company
twitter_training_set_v1 = pd.read_pickle(
    f'{path}LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl'
)

## Training twitter and descriptions classifiers

This is a ONE-TIME operation. The trained models are pickled and later loaded from the LOGREG_RELEVANCE/MODELS folder to predict new results.

In [3]:
# One-off fit of both classifiers. The prediction cells further down refer to
# the fitted models by the names 'twitter_pipeline' and 'resources_pipeline'.
# NOTE(review): 10, 'precision' and 1000 are passed positionally; their meaning
# is defined by LogRegSearches.train (not visible here) - confirm before tuning.

# tweet classifier: trained on the 'tweet' column against 'Target'
twitter_training_model = LogRegSearches.train(
    twitter_training_set_v1,
    'tweet',
    'Target',
    10,
    'precision',
    1000,
    'twitter_pipeline',
    path,
)

# resource classifier: trained on the 'Description' column against 'Target'
resource_training_model = LogRegSearches.train(
    archive_desc_training_v2,
    'Description',
    'Target',
    10,
    'precision',
    1000,
    'resources_pipeline',
    path,
)

report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      4379
           1       0.97      0.92      0.95      4379

    accuracy                           0.95      8758
   macro avg       0.95      0.95      0.95      8758
weighted avg       0.95      0.95      0.95      8758

report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       538
           1       0.94      0.95      0.95       786

    accuracy                           0.94      1324
   macro avg       0.93      0.93      0.93      1324
weighted avg       0.93      0.94      0.93      1324



## Query Twitter

Calls the Twitter API with a list of keywords and returns the results as a raw CSV and a cleaned pickle in the TWITTER_SEARCHES/RAW_SEARCHES folder.

In [4]:
#load token from the environment so the credential is never stored in the notebook
#NOTE(review): the bearer token that used to be hardcoded in this cell was saved in
#plain text and should be treated as compromised - revoke and regenerate it.
token = os.environ.get('TWITTER_BEARER_TOKEN')
if not token:
    raise RuntimeError(
        'Set the TWITTER_BEARER_TOKEN environment variable before querying Twitter.'
    )

#a selection of keywords based on MJI and musoW datasets
#Choose keywords
keywords = ['music archive', 'music collection']

#search timeframe (only used by the custom search below): one start/end pair per day
start = [
    '2022-05-01T00:00:00.000Z', '2022-05-02T00:00:00.000Z', '2022-05-03T00:00:00.000Z',
    '2022-05-04T00:00:00.000Z', '2022-05-05T00:00:00.000Z', '2022-05-06T00:00:00.000Z',
    '2022-05-07T00:00:00.000Z',
]
end = [
    '2022-05-01T23:59:59.000Z', '2022-05-02T23:59:59.000Z', '2022-05-03T23:59:59.000Z',
    '2022-05-04T23:59:59.000Z', '2022-05-05T23:59:59.000Z', '2022-05-06T23:59:59.000Z',
    '2022-05-07T23:59:59.000Z',
]

#choose search option
## search last week
#`tweets` is reused by later cells, which take tweets[0][-16:] to name this run's files
tweets = LogRegSearches.search_twitter_weekly(token, keywords, 50, 50)
## search custom timeframe (uses the `keywords`, `start` and `end` lists defined above)
#LogRegSearches.search_twitter_custom(token, keywords, start, end, 500, 500)

-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-06-02T00:00:00.000Z
# of Tweets added from this response:  46
Total # of Tweets added for '"music archive" -is:retweet': 46
-------------------
Total number of results: 46
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpyzm97i9sajv7ora5bnmao52pyzct
Start Date:  2022-06-02T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added for '"music collection" -is:retweet': 50
-------------------
Total number of results: 50


## Classify tweets

In [6]:
#gather every search result of this run into one dataframe
#the file name is built from the last 16 characters of the first element of `tweets`
#NOTE(review): presumably tweets[0] is a saved file name/path - confirm against search_twitter_weekly
search_suffix = tweets[0][-16:]
raw_searches_dir = path + 'TWITTER_SEARCHES/RAW_SEARCHES/'

tweets_to_classify = LogRegSearches.tweets_to_classify(
    raw_searches_dir,
    f'{search_suffix}.pkl',
)
tweets_to_classify

Total tweets to classify: 35


Unnamed: 0,user,created_at,lang,like_count,quote_count,reply_count,retweet_count,tweet,URL,Search KW
0,CNadineL,2022-06-08 05:13:13+00:00,en,0,0,0,0,Dan Needham | Christian Music Archive https://...,https://www.christianmusicarchive.com/artist/d...,"""music archive"" -is:retweet"
1,EricLindhardt,2022-06-08 02:38:16+00:00,en,1,0,0,0,Mr. Bond Music Archive Compilation of music ...,"http://t.me/OdersMemeDepot, https://t.me/MrBon...","""music archive"" -is:retweet"
2,fogelnet,2022-06-08 00:02:19+00:00,en,0,0,0,0,"Phil Lesh and Friends from Sat, Jun 4, 2022 at...",https://livemusicarchive.app/music/artists/Phi...,"""music archive"" -is:retweet"
3,EU_SSSA,2022-06-07 14:28:12+00:00,en,5,0,0,3,Tickets are available for a free day conferenc...,http://bit.ly/TradArchive,"""music archive"" -is:retweet"
4,NewsdayFrank,2022-06-07 13:18:12+00:00,en,0,0,0,0,I take a walk on the wild side on this perfect...,https://www.newsday.com/entertainment/music/lo...,"""music archive"" -is:retweet"
5,AndiDurrant,2022-06-07 11:47:10+00:00,en,6,0,0,0,We did 1995 and 2009 on this week's Dance Musi...,https://www.dancemusicarchive.com/radioshow,"""music archive"" -is:retweet"
6,TradMusicForum,2022-06-07 07:59:19+00:00,en,5,1,0,3,Join us THIS SATURDAY 11th June in Edinburgh ...,https://scottishstorytellingcentre.online.red6...,"""music archive"" -is:retweet"
7,BBCR6MusicBot,2022-06-07 02:00:34+00:00,en,0,0,0,0,NowPlaying on BBC6Music: 6MusicArtistCollect...,https://www.bbc.co.uk/programmes/m0017scd,"""music archive"" -is:retweet"
8,fogelnet,2022-06-06 14:13:32+00:00,en,0,0,0,0,"Grateful Dead from Sat, Jun 6, 1992 at Rich St...",https://livemusicarchive.app/music/artists/Gra...,"""music archive"" -is:retweet"
9,MDMArchive,2022-06-06 10:28:00+00:00,en,2,1,0,0,As part of the Manchester Histories Festival C...,https://manchesterhistories.co.uk/our-festival...,"""music archive"" -is:retweet"


In [8]:
#classify the 'tweet' column with the 'twitter_pipeline' model;
#the call yields the prediction table and the list of links found in the results
(predicted_tweets, twitter_link_list) = LogRegSearches.predict_twitter(
    path,
    'twitter_pipeline',
    tweets_to_classify,
    'tweet',
    1,
)
predicted_tweets

Total tweets predicted: 23


Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,tweet date
0,Paul Ingles' MUSIC ARCHIVE SHOWCASE 147a - A L...,1,10.270383,0.999965,99,https://beta.prx.org/stories/420264,"""music archive"" -is:retweet",2022-06-05
1,Dan Needham | Christian Music Archive https://...,1,9.402788,0.999918,61,https://www.christianmusicarchive.com/artist/d...,"""music archive"" -is:retweet",2022-06-08
2,Steam ended Before I finished Streaming Paint...,1,9.007617,0.999878,279,http://fma.com,"""music archive"" -is:retweet",2022-06-03
3,"Grateful Dead from Sat, Jun 6, 1992 at Rich St...",1,8.427915,0.999781,98,https://livemusicarchive.app/music/artists/Gra...,"""music archive"" -is:retweet",2022-06-06
4,"Grateful Dead from Sun, Jun 4, 1978 at Campus ...",1,8.427915,0.999781,118,https://livemusicarchive.app/music/artists/Gra...,"""music archive"" -is:retweet",2022-06-04
5,I LOVE IT I can't wait to display it with my ...,1,7.723218,0.999558,128,http://mariahcarey.com,"""music collection"" -is:retweet",2022-06-08
6,I have a large music collection. Use the nam...,1,7.383933,0.999379,141,https://objkt.com/profile/napoleonbonaparte/ow...,"""music collection"" -is:retweet",2022-06-08
7,Tickets are available for a free day conferenc...,1,6.814175,0.998903,238,http://bit.ly/TradArchive,"""music archive"" -is:retweet",2022-06-07
8,"Phil Lesh and Friends from Sat, Jun 4, 2022 at...",1,6.764507,0.998847,112,https://livemusicarchive.app/music/artists/Phi...,"""music archive"" -is:retweet",2022-06-08
9,NowPlaying on BBC6Music: 6MusicArtistCollect...,1,5.608325,0.996346,158,https://www.bbc.co.uk/programmes/m0017scd,"""music archive"" -is:retweet",2022-06-07


## Scrape URLs

In [9]:
#scrape every link in the list; the resulting DF feeds the resource classification
#the output name reuses this run's 16-character suffix with '_scrapes' appended
scrape_name = f'{tweets[0][-16:]}_scrapes'
scraped_links = LogRegSearches.scrape_links(
    twitter_link_list,
    predicted_tweets,
    scrape_name,
)

1 https://beta.prx.org/stories/420264


Your max_length is set to 120, but you input_length is only 81. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


2 https://www.christianmusicarchive.com/artist/dan-needham
3 https://livemusicarchive.app/music/artists/GratefulDead/recordings/gd1992-06-06.140277.sbd.cm.miller.flac1644
4 https://livemusicarchive.app/music/artists/GratefulDead/recordings/gd1978-06-04.sbd.cantor.miller.94407.sbeok.flac16
5 http://mariahcarey.com
6 https://objkt.com/profile/napoleonbonaparte/owned?mimetypes=audio
7 http://bit.ly/TradArchive
8 https://livemusicarchive.app/music/artists/PhilLeshandFriends/recordings/PhilLesh_Friends2022-06-04.FOB-Schoeps-1648


Your max_length is set to 120, but you input_length is only 73. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


9 https://www.bbc.co.uk/programmes/m0017scd
10 https://livemusicarchive.app/music/artists/GratefulDead/recordings/gd74-06-18.sbd.sacks.209.sbefail.shnf
11 https://bit.ly/3zsMai2
12 https://livemusicarchive.app/music/artists/GratefulDead/recordings/gd73-11-14.sbd.vernon.5612.sbeok.shnf
13 https://livemusicarchive.app/music/artists/PhilLeshandFriends/recordings/phil2007-06-03.bk4022.acm671.burke.flac16
14 https://objkt.com/asset/hicetnunc/437646
15 http://ballsackradio.com
16 https://paintswap.finance/marketplace/collections/0x6dc1c82cafb211d4559c172bb7ee30e0dad885cf


Token indices sequence length is longer than the specified maximum sequence length for this model (1097 > 1024). Running this sequence through the model will result in indexing errors


17 https://assetstore.unity.com/packages/audio/music/orchestral/total-music-collection-89126?aid=1011l7qkc
18 https://www.dancemusicarchive.com/radioshow


Your max_length is set to 120, but you input_length is only 118. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)


19 https://deathbeforepopcountry.com/collections/country-music-collection/products/midnight-stroller-death-before-pop-country-glow-t-shirt-pre-order-printing-week-of-6-15
17


## Classify web resources

In [10]:
#classify the 'Description' column of the scraped pages with the 'resources_pipeline' model
#the last argument is this run's 16-character suffix taken from the first element of `tweets`
run_suffix = f'{tweets[0][-16:]}'
predicted_resources = LogRegSearches.predict_resource(
    path,
    'resources_pipeline',
    scraped_links,
    'Description',
    1,
    run_suffix,
)
predicted_resources

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,tweet date,Title,Description
0,NowPlaying on BBC6Music: 6MusicArtistCollect...,1,7.197628,0.999252,148,https://www.bbc.co.uk/programmes/m0017scd,"""music archive"" -is:retweet",2022-06-07,"BBC Radio 6 Music - 6 Music Artist Collection,...",Midnight Meets with Colin Murray is a 6 Music...
1,Tickets are available for a free day conferenc...,1,3.704251,0.975973,165,http://bit.ly/TradArchive,"""music archive"" -is:retweet",2022-06-07,Scottish Storytelling Centre,A conference to explore the possibilities for...
2,Paul Ingles' MUSIC ARCHIVE SHOWCASE 147a - A L...,1,1.921387,0.872293,3,https://beta.prx.org/stories/420264,"""music archive"" -is:retweet",2022-06-05,PRX,PRX
3,Midnight Stroller! New design available https:...,1,1.469016,0.812908,129,https://deathbeforepopcountry.com/collections/...,"""music collection"" -is:retweet",2022-06-08,Midnight Stroller | Death Before Pop Country |...,Midnight Stroller Glow T Shirt! Pre-Order pri...
4,Launch Party - Nintendo Rhythm Heaven Fever Co...,1,0.408276,0.600674,159,http://ballsackradio.com,"""music collection"" -is:retweet",2022-06-08,ballsack radio :),DJ BajaBlast's Twitter account has been runni...
5,We did 1995 and 2009 on this week's Dance Musi...,1,0.325359,0.58063,151,https://www.dancemusicarchive.com/radioshow,"""music archive"" -is:retweet",2022-06-07,Radio Show | Dancemusicarchive,Jamiroquai is the latest episode of Andidurra...
6,Timeless Christian Music Collection 2022 Lyric...,1,0.213204,0.5531,679,https://bit.ly/3zsMai2,"""music collection"" -is:retweet",2022-06-08,Timeless Christian Music Collection 2022 Lyric...,Timeless Christian Music Collection 2022 Lyri...
