# README 
- This notebook contains an early version of the musoW discovery pipeline, focused on Twitter. 
- It covers evaluating Twitter search results for relevance, scraping text from the URLs of the results predicted to be relevant, and then evaluating the scraped pages for relevance before returning a final results list for a user to check. 
- Predictions are run against the relevant training sets (Twitter, descriptions).
- Once this flow has been tested, the aim is to rebuild it with all key functions in external Python files, so that the user only has to change the necessary inputs at each step to get a result. 

DO NOT USE

Step 1:
- Load libraries and functions 

In [1]:
#imports + path
from __future__ import print_function

import os
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score, cross_val_predict
pd.set_option('display.max_rows', 100)
# Root folder of the musow-pipeline data. Every cell builds file names with `path + '...'`,
# so the value must end with a path separator; os.path.join(..., '') guarantees that.
# Set the MUSOW_PIPELINE_PATH environment variable to run the notebook on another machine;
# when it is unset the original location is used.
path = os.path.join(
    os.environ.get(
        'MUSOW_PIPELINE_PATH',
        '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/',
    ),
    '',
)

In [2]:
#PREDICTION FUNCTION W/ and W/O LOGREGCV 

def lr_model_predict_cv(t_input, t_feature, target, cv_int, score_type, p_input, p_feature, filename, path):
    """Fit a TF-IDF + LogisticRegressionCV classifier, pickle it and score new texts.

    Parameters
    ----------
    t_input : DataFrame holding the training data.
    t_feature : name of the text column in ``t_input`` used as the feature.
    target : name of the label column in ``t_input``.
    cv_int : passed to ``LogisticRegressionCV`` as ``cv``.
    score_type : passed to ``LogisticRegressionCV`` as ``scoring``.
    p_input : DataFrame with the items to classify; it is not modified.
    p_feature : name of the text column in ``p_input`` (``.str.len()`` is applied to it).
    filename : base name of the pickled model file.
    path : root folder; the model is written to
        ``path + 'LOGREG_RELEVANCE/MODELS/<filename>.sav'``. The two parts are
        concatenated as-is, so ``path`` must end with a path separator.

    Returns
    -------
    A copy of ``p_input`` with four extra columns:
    ``Prediction`` (output of ``predict``), ``Score`` (output of ``decision_function``),
    ``Probability`` (one array per row from ``predict_log_proba`` -- these are
    LOG-probabilities, despite the column name) and ``Input Length``
    (character length of ``p_feature``).

    Notes
    -----
    Only the classifier is pickled. The fitted CountVectorizer / TfidfTransformer
    are not saved by this function.
    """
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_count = count_vect.fit_transform(t_input[t_feature])
    x_train = tfidf_transformer.fit_transform(x_count)
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    model.fit(x_train, y_train)
    export = f'LOGREG_RELEVANCE/MODELS/{filename}.sav'
    # context manager so the file handle is flushed and closed even if pickling fails
    with open(path + export, 'wb') as model_file:
        pickle.dump(model, model_file)
    # reuse the vocabulary / idf weights learnt on the training set
    x_new_count = count_vect.transform(p_input[p_feature])
    x_new_train = tfidf_transformer.transform(x_new_count)
    y_predict = model.predict(x_new_train)
    scores = model.decision_function(x_new_train)
    probability = model.predict_log_proba(x_new_train)
    result = p_input.copy()
    # plain lists are assigned by position, so the (possibly filtered) index of p_input is irrelevant
    result['Prediction'] = list(y_predict)
    result['Score'] = list(scores)
    result['Probability'] = list(probability)
    result['Input Length'] = result[p_feature].str.len()
    return result

def lr_model_predict(t_input, t_feature, target, p_input, p_feature, filename, path):
    """Fit a TF-IDF + LogisticRegression (C=10) classifier, pickle it and score new texts.

    Parameters
    ----------
    t_input : DataFrame holding the training data.
    t_feature : name of the text column in ``t_input`` used as the feature.
    target : name of the label column in ``t_input``.
    p_input : DataFrame with the items to classify; it is not modified.
    p_feature : name of the text column in ``p_input`` (``.str.len()`` is applied to it).
    filename : base name of the pickled model file.
    path : root folder; the model is written to
        ``path + 'LOGREG_RELEVANCE/MODELS/<filename>.sav'``. The two parts are
        concatenated as-is, so ``path`` must end with a path separator.

    Returns
    -------
    A copy of ``p_input`` with four extra columns:
    ``Prediction`` (output of ``predict``), ``Score`` (output of ``decision_function``),
    ``Probability`` (one array per row from ``predict_log_proba`` -- these are
    LOG-probabilities, despite the column name) and ``Input Length``
    (character length of ``p_feature``).

    Notes
    -----
    Only the classifier is pickled. The fitted CountVectorizer / TfidfTransformer
    are not saved by this function.
    """
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_count = count_vect.fit_transform(t_input[t_feature])
    x_train = tfidf_transformer.fit_transform(x_count)
    y_train = t_input[target].values
    model = LogisticRegression(solver='liblinear', C=10.0,random_state=44)
    model.fit(x_train, y_train)
    export = f'LOGREG_RELEVANCE/MODELS/{filename}.sav'
    # context manager so the file handle is flushed and closed even if pickling fails
    with open(path + export, 'wb') as model_file:
        pickle.dump(model, model_file)
    # reuse the vocabulary / idf weights learnt on the training set
    x_new_count = count_vect.transform(p_input[p_feature])
    x_new_train = tfidf_transformer.transform(x_new_count)
    y_predict = model.predict(x_new_train)
    scores = model.decision_function(x_new_train)
    probability = model.predict_log_proba(x_new_train)
    result = p_input.copy()
    # plain lists are assigned by position, so the (possibly filtered) index of p_input is irrelevant
    result['Prediction'] = list(y_predict)
    result['Score'] = list(scores)
    result['Probability'] = list(probability)
    result['Input Length'] = result[p_feature].str.len()
    return result

In [3]:
#scrape URLs for title, desc and URL text 

def scrape_links(link_list):
    """Download each URL and collect its <title> text and paragraph text.

    Parameters
    ----------
    link_list : iterable of URL strings.

    Returns
    -------
    DataFrame with the columns ``Title``, ``Description`` and ``URL`` and one row
    per page that answered with HTTP 200 and has both a <head> and a <body>.
    ``Title`` joins the text of all <title> tags in the head, ``Description`` joins
    the text of all <p> tags in the body, ``URL`` is the stripped input link.

    Scraping is best-effort: a link that fails to download or parse is skipped
    silently, so the result may have fewer rows than ``link_list``.
    """
    columns = ['Title', 'Description', 'URL']
    # browser-like user agent, built once instead of once per link
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    rows = []
    for link in link_list:
        URL = link
        try:
            page = requests.get(URL, headers=headers, timeout=5)
            if page.status_code != 200:
                continue
            soup = BeautifulSoup(page.content, "html.parser")
            head = soup.find('head')
            body = soup.find('body')
            if not (head and body):
                continue
            title = ' '.join([t.text for t in head.find_all('title')]).strip()
            text = ' '.join([p.text for p in body.find_all('p')]).strip()
            rows.append({'Title': title, 'Description': text, 'URL': URL.strip()})
        except Exception:
            # deliberate best-effort: connection errors, timeouts, malformed URLs and
            # parser failures all just drop this link
            continue
    # The frame is built once, outside the try block: DataFrame.append no longer exists in
    # pandas >= 2.0 and its AttributeError used to be swallowed by the handler above,
    # which silently produced an empty result.
    return pd.DataFrame(rows, columns=columns)

Step 2:
- load training and prediction sets 
- Twitter training sets are taken from bigram searches run over the whole of 2021. They are filtered by language to remove non-English results for now, and the negative set is sampled to even out the two halves of the set. The negative set uses the 'digital humanities' and 'music company' bigrams (the files loaded below); the positive set uses bigrams from keyword research. 
- Prediction sets are taken from bigram searches run during the first 3 months of 2022. 

In [4]:
#description training set 
# both frames are read from the TRAINING_SETS folder, in the same order as before
training_set_even_adds, new_training_set = (
    pd.read_pickle(path + relative_file)
    for relative_file in (
        'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_even_extended.pkl',
        'LOGREG_RELEVANCE/TRAINING_SETS/new_training_set.pkl',
    )
)

In [2]:
#negative twitter training set
dh = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/digital_humanities_2021.pkl')
music_company = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/music_company_2021.pkl')
twitter_neg = pd.concat([dh, music_company])
# keep English tweets only and label them as negatives; .assign builds a new frame,
# which avoids the SettingWithCopyWarning raised by assigning a column on a filtered slice
twitter_neg = twitter_neg.loc[twitter_neg['lang'] == 'en'].assign(Target='0')
# down-sample with a fixed seed so the set is reproducible
# NOTE(review): 4379 is presumably the size of the positive set ("even out the two halves") -- confirm
twitter_neg = twitter_neg.sample(n=4379, random_state=56)
twitter_neg = twitter_neg[['tweet', 'Target']].reset_index(drop=True)

In [3]:
#positive twitter training set 
music_collection = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_collection.pkl')
song_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_song_dataset.pkl')
sound_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_sound_archive.pkl')
digital_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_archive.pkl')
music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_archive.pkl')
digi_music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_music_archive.pkl')
midi_file = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_midi_file.pkl')
music_data = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_data.pkl')
music_research = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_music_research.pkl')
music_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_dataset.pkl')
# concatenation order is kept as-is because it determines the row order of the final set
twitter_pos = pd.concat([sound_archive, music_collection, digital_archive, music_archive, song_dataset, digi_music_archive, midi_file, music_data, music_research, music_dataset])
# keep English tweets only and label them as positives; .assign builds a new frame,
# which avoids the SettingWithCopyWarning raised by assigning a column on a filtered slice
twitter_pos = twitter_pos.loc[twitter_pos['lang'] == 'en'].assign(Target='1')
twitter_pos = twitter_pos[['tweet', 'Target']].reset_index(drop=True)

In [4]:
#final twitter training set
# positives first, then negatives, renumbered 0..n-1; string labels become integers
twitter_set = (
    pd.concat([twitter_pos, twitter_neg], ignore_index=True)
    .astype({'Target': 'int'})
)
twitter_set.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

In [9]:
#load a prediction set and check its length
prediction_twitter = pd.read_pickle(path+'TWITTER_SEARCHES/PREDICTIONS/music_information_22.pkl')
# only English tweets are scored
is_english = prediction_twitter['lang'] == 'en'
prediction_twitter = prediction_twitter[is_english]
len(prediction_twitter.index)

249

Step 3:
- run the predict function for twitter (logregcv and logreg options)
- filter the results by positives and optionally by inclusion of the 'music' kw
- return a df w/ tweet, prediction value, confidence score, probability, length of input and url 

In [22]:
#variable for removing unwanted results 
# substrings that mark a tweet URL, page title or page URL as unwanted
# (streaming / shop / social sites, error pages, NFT and job listings);
# order and spelling are unchanged, only the layout is different
discard = [
    'youtu', '404', 'Not Found', 'bandcamp', 'ebay',
    'It needs a human touch', 'Page not found', 'open.spotify.com',
    'We\'re sorry...', 'Not Acceptable!', 'Access denied', '412 Error',
    'goo.gl', 'instagr.am', 'soundcloud', 'apple.co', 'amzn',
    'masterstillmusic', 'Facebook', 'facebook', 'sheetmusiclibrary.website',
    'Unsupported browser', 'Last.fm', 'last.fm', 'amazon.com', 'tidal.com',
    'tmblr.co', 'blogspot', 'dailymusicroll', 'PortalTaxiMusic', 'apple.news',
    'yahoo.com', 'sheetmusicplus.com', 'musicnotes.com', 'musescore.com',
    'etsy', 'nts.live', 'twitch.tv', 'YouTube', 'radiosparx.com',
    'freemusicarchive.org', 'blastradio', 'opensea', 'mixcloud',
    'catalog.works', 'nft', 'NFT', 'allmusic.com', 'foundation.app',
    'Robot or human?', 'heardle', 'insession.agency', 'jobvite', 'career',
    'docs.google.com/forms/', 'discogs.com',
]

In [19]:
#run w/ LogRegCV
# uses the notebook-level `path` instead of repeating the absolute folder literal,
# so the model is saved under the same root the data was loaded from
tweet_predict_cv = lr_model_predict_cv(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    cv_int=2,
    score_type='precision',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test_cv',
    path=path,
)

In [23]:
#filter and display results 
# the discard terms are literal substrings, so they are escaped before being joined into
# one regex alternation (otherwise '.', '?' and '...' act as regex metacharacters)
discard_pattern = '|'.join(re.escape(term) for term in discard)
tweet_predict_cv_df = (
    tweet_predict_cv
    .loc[tweet_predict_cv['Prediction'] == 1]
    # na=False keeps the mask boolean when a tweet has a missing url
    .loc[lambda df: ~df['url'].str.contains(discard_pattern, na=False)]
    .drop_duplicates(subset=['tweet'])
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_cv_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,#Canadian #Music #Entertainment Be a part of M...,1,18.079471,"[-18.07947057632295, -1.4066488517440156e-08]",171,https://news247planet.com/?p=65019
1,Ethio Kinits Model for first-ever Music Inform...,1,11.075965,"[-11.075980676183649, -1.547982590549974e-05]",140,http://arxiv.org/abs/2201.08448v1
2,We are looking forward to another performance ...,1,10.132918,"[-10.132958083140784, -3.974850971744616e-05]",260,https://mic.hr/en/performance-of-detonis-the-w...
3,The song was found engraved on an ancient marb...,1,9.974115,"[-9.974161514496993, -4.658936695017157e-05]",160,https://www.history.com/news/what-is-the-oldes...
4,USD147314S - Music information chart https:/...,1,9.735083,"[-9.735142633966415, -5.916898534118858e-05]",86,https://patents.google.com/patent/USD147314
5,Download the score at the Estonian Music Infor...,1,9.480556,"[-9.480631986333197, -7.631860362772335e-05]",86,https://www.emic.ee/?sisu=uudis_edasi&mid=27&l...
6,☆彡.。　Music Information The first round of Dig...,1,8.872101,"[-8.872240750172454, -0.00014023785228088978]",273,
7,Listen now top hits music Information First - ...,1,8.462928,"[-8.46313887548954, -0.00021113067317936473]",107,https://streamingV2.shoutcast.com/201912
8,New Year DIAMOND FES will be held urgently wit...,1,8.223613,"[-8.223881488653847, -0.0002682080996323188]",220,https://evening-mashup.com/archives/157061
9,13 Behind The Scenes Music Information https:/...,1,7.679999,"[-7.680460557880087, -0.0004618688265768492]",62,https://manoranjannews.xyz/2022/03/19/13-behin...


In [24]:
#optional filter by kw
# case-sensitive match on the lowercase keyword, exactly as before
cv_has_music_kw = tweet_predict_cv_df['tweet'].str.contains('music')
tweet_predict_cv_df_kw = (
    tweet_predict_cv_df[cv_has_music_kw]
    .drop_duplicates(subset=['tweet'])
    .reset_index(drop=True)
)
tweet_predict_cv_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,Ethio Kinits Model for first-ever Music Inform...,1,11.075965,"[-11.075980676183649, -1.547982590549974e-05]",140,http://arxiv.org/abs/2201.08448v1
1,We are looking forward to another performance ...,1,10.132918,"[-10.132958083140784, -3.974850971744616e-05]",260,https://mic.hr/en/performance-of-detonis-the-w...
2,The song was found engraved on an ancient marb...,1,9.974115,"[-9.974161514496993, -4.658936695017157e-05]",160,https://www.history.com/news/what-is-the-oldes...
3,Listen now top hits music Information First - ...,1,8.462928,"[-8.46313887548954, -0.00021113067317936473]",107,https://streamingV2.shoutcast.com/201912
4,New Year DIAMOND FES will be held urgently wit...,1,8.223613,"[-8.223881488653847, -0.0002682080996323188]",220,https://evening-mashup.com/archives/157061
5,Babooze part deux - gonna join @BraddahGomes f...,1,7.498173,"[-7.498726490438036, -0.0005539425749380356]",279,http://KXPA.com
6,"[Music Information] #IA's best album, which in...",1,6.899588,"[-6.9005955840537725, -0.001007692938852532]",301,https://linkco.re/32s3tQF6
7,@astent @gchrupala Julian Urbano did some good...,1,6.670827,"[-6.672093420157561, -0.0012665479702428492]",188,https://www.slideshare.net/caerolus/statistica...
8,MQA x NY:LON Connect: “The typical 192/24 audi...,1,6.576698,"[-6.578089450065189, -0.0013914710317405562]",305,https://bit.ly/3hzgVaY
9,Lawson Entertainment Inc. “Theatrical version ...,1,6.04019,"[-6.042568321226585, -0.002378276011019153]",251,https://re-how.net/all/1779774/


In [25]:
#run w/ LogReg
# uses the notebook-level `path` instead of repeating the absolute folder literal,
# so the model is saved under the same root the data was loaded from
tweet_predict = lr_model_predict(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test',
    path=path,
)

In [26]:
#filter and display results 
# the discard terms are literal substrings, so they are escaped before being joined into
# one regex alternation (otherwise '.', '?' and '...' act as regex metacharacters)
discard_pattern = '|'.join(re.escape(term) for term in discard)
# the previous `.copy()` was immediately overwritten by a fresh selection from
# tweet_predict; the chain below starts from tweet_predict directly
tweet_predict_df = (
    tweet_predict
    .loc[tweet_predict['Prediction'] == 1]
    # na=False keeps the mask boolean when a tweet has a missing url
    .loc[lambda df: ~df['url'].str.contains(discard_pattern, na=False)]
    .drop_duplicates(subset=['tweet'])
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    .loc[:, ['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,#Canadian #Music #Entertainment Be a part of M...,1,10.987578,"[-10.987595077830505, -1.6910317443757728e-05]",171,https://news247planet.com/?p=65019
1,Ethio Kinits Model for first-ever Music Inform...,1,6.647761,"[-6.6490575749870855, -0.001296081750301942]",140,http://arxiv.org/abs/2201.08448v1
2,We are looking forward to another performance ...,1,6.355117,"[-6.356852884507208, -0.00173632434076957]",260,https://mic.hr/en/performance-of-detonis-the-w...
3,Download the score at the Estonian Music Infor...,1,5.917311,"[-5.919999612304317, -0.0026888128375173155]",86,https://www.emic.ee/?sisu=uudis_edasi&mid=27&l...
4,The song was found engraved on an ancient marb...,1,5.838125,"[-5.841035145713381, -0.0029100632374643148]",160,https://www.history.com/news/what-is-the-oldes...
5,USD147314S - Music information chart https:/...,1,5.806869,"[-5.809871266838435, -0.0030023184166938044]",86,https://patents.google.com/patent/USD147314
6,☆彡.。　Music Information The first round of Dig...,1,5.726753,"[-5.7300054196826, -0.0032523427732106484]",273,
7,Listen now top hits music Information First - ...,1,5.227824,"[-5.233174581593174, -0.0053508473314261475]",107,https://streamingV2.shoutcast.com/201912
8,New Year DIAMOND FES will be held urgently wit...,1,4.966142,"[-4.97308775011561, -0.006945808770592626]",220,https://evening-mashup.com/archives/157061
9,13 Behind The Scenes Music Information https:/...,1,4.788413,"[-4.796704647171208, -0.008291188811090899]",62,https://manoranjannews.xyz/2022/03/19/13-behin...


In [27]:
#optional filter by kw
# case-sensitive match on the lowercase keyword, exactly as before
has_music_kw = tweet_predict_df['tweet'].str.contains('music')
tweet_predict_df_kw = (
    tweet_predict_df[has_music_kw]
    .drop_duplicates(subset=['tweet'])
    .reset_index(drop=True)
)
tweet_predict_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,Ethio Kinits Model for first-ever Music Inform...,1,6.647761,"[-6.6490575749870855, -0.001296081750301942]",140,http://arxiv.org/abs/2201.08448v1
1,We are looking forward to another performance ...,1,6.355117,"[-6.356852884507208, -0.00173632434076957]",260,https://mic.hr/en/performance-of-detonis-the-w...
2,The song was found engraved on an ancient marb...,1,5.838125,"[-5.841035145713381, -0.0029100632374643148]",160,https://www.history.com/news/what-is-the-oldes...
3,Listen now top hits music Information First - ...,1,5.227824,"[-5.233174581593174, -0.0053508473314261475]",107,https://streamingV2.shoutcast.com/201912
4,New Year DIAMOND FES will be held urgently wit...,1,4.966142,"[-4.97308775011561, -0.006945808770592626]",220,https://evening-mashup.com/archives/157061
5,Babooze part deux - gonna join @BraddahGomes f...,1,4.372113,"[-4.384658961056869, -0.012545505990268222]",279,http://KXPA.com
6,"[Music Information] #IA's best album, which in...",1,4.245728,"[-4.2599520811871185, -0.014223657462958563]",301,https://linkco.re/32s3tQF6
7,@astent @gchrupala Julian Urbano did some good...,1,4.06991,"[-4.086844912800471, -0.016934717002967795]",188,https://www.slideshare.net/caerolus/statistica...
8,MQA x NY:LON Connect: “The typical 192/24 audi...,1,4.023501,"[-4.0412334201430795, -0.017732068716144956]",305,https://bit.ly/3hzgVaY
9,Johnny Mullins was the quintessential Springfi...,1,3.486152,"[-3.5163105924546416, -0.030159088859903118]",301,https://www.theacousticshoppe.com/acoustic-mus...


Step 4:
- grab links from twitter predictions and scrape them for text 
- return a new table that can be used to predict relevance of URLs 

In [28]:
#URLs to list 
def _is_external_link(link):
    """True for a non-empty URL string that does not point back to twitter."""
    # missing values (NaN/None) are not strings and used to raise a TypeError on `in`;
    # empty strings only produced a failed request in the scraper
    return isinstance(link, str) and bool(link.strip()) and 'twitter' not in link

twitter_link_list_cv = [link for link in tweet_predict_cv_df_kw['url'] if _is_external_link(link)]
twitter_link_list = [link for link in tweet_predict_df_kw['url'] if _is_external_link(link)]

In [None]:
#scrape URL list 
# same order as before: the LogRegCV link list is scraped first, then the LogReg one
links_to_add_cv, links_to_add = (
    scrape_links(url_list) for url_list in (twitter_link_list_cv, twitter_link_list)
)

In [134]:
#remove empty descriptions and pickle results 
# pages without any paragraph text cannot be classified, so they are dropped before saving
cv_has_text = links_to_add_cv['Description'].ne('')
links_to_add_cv = links_to_add_cv.loc[cv_has_text].reset_index(drop=True)
links_to_add_cv.to_pickle(path+'LOGREG_RELEVANCE/SCRAPES/music_information_22_cv_scrapes.pkl')
has_text = links_to_add['Description'].ne('')
links_to_add = links_to_add.loc[has_text].reset_index(drop=True)
links_to_add.to_pickle(path+'LOGREG_RELEVANCE/SCRAPES/music_information_22_scrapes.pkl')

In [None]:
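#optional: load previously pickled scrapes for the next step instead of re-scraping (uncomment to use; note these file names belong to the 'music_mag' search, not 'music_information')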
#links_to_add_cv = pd.read_pickle(path+'LOGREG_RELEVANCE/SCRAPES/music_mag_22_cv_scrapes.pkl')
#links_to_add = pd.read_pickle(path+'LOGREG_RELEVANCE/SCRAPES/music_mag_22_scrapes.pkl')

Step 5:
- run the predict function on scraped URLs 
- return a DF w/ title, description, url, prediction, confidence score, probability and input length

In [135]:
#run with LogRegCV 
# uses the notebook-level `path` instead of repeating the absolute folder literal,
# so the model is saved under the same root the data was loaded from
twitter_preds_cv = lr_model_predict_cv(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    cv_int=2,
    score_type='precision',
    p_input=links_to_add_cv,
    p_feature='Description',
    filename='extended_even_model_cv_twitter',
    path=path,
)

In [140]:
#filter results by positive value and score
# the discard terms are literal substrings, so they are escaped before being joined into
# one regex alternation (otherwise '.', '?' and '...' act as regex metacharacters)
discard_pattern = '|'.join(re.escape(term) for term in discard)
twitter_preds_cv_df = (
    twitter_preds_cv
    .loc[twitter_preds_cv['Prediction'] == 1]
    # na=False keeps the masks boolean when a title or URL is missing
    .loc[lambda df: ~df['Title'].str.contains(discard_pattern, na=False)]
    .loc[lambda df: ~df['URL'].str.contains(discard_pattern, na=False)]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
twitter_preds_cv_df

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,Rain and Thunder by Seven Nations,From the recording Rain and Thunder,https://sevennations.com/track/596555/rain-and...,1,7.739494,"[-7.739929631636128, -0.0004351968758552599]",35
1,Library Player: Pittman MusicDB 2021,"Developer of audio, membership and to do list ...",https://www.libraryplayer.co.uk/2021/02/musicd...,1,5.708366,"[-5.711678861983636, -0.0033125962845379363]",55
2,Home - Phat Music,"First published in 1997 out of Byron Bay, Aust...",http://phatmusiccollection.com,1,5.601697,"[-5.605381408104382, -0.003684797988279074]",316
3,Albums – Tom Lehrer Songs,An Evening (wasted) With Tom Lehrer:\nstream M...,https://tomlehrersongs.com/albums/,1,5.058579,"[-5.064913667359378, -0.006334476523915236]",333
4,"Ellen Tucker Emerson Music Collection, 1855-18...","ELLEN TUCKER EMERSON MUSIC COLLECTION, 1855-18...",https://bit.ly/3LZ8ZxG,1,4.888492,"[-4.8959960778959015, -0.00750454718448568]",4017
5,Music & Instruments For Sale Online - Cabinet ...,©2018 Cabinets Knobs & More,https://cabinetknobsandmore.com/product-catego...,1,4.687318,"[-4.696487312126836, -0.009169190994102776]",27
6,Plexamp - Apps on Google Play,This site uses cookies from Google to deliver ...,https://play.google.com/store/apps/details?id=...,1,4.368937,"[-4.381522231954484, -0.012585168682340192]",82
7,By the People Women’s Suffrage in Sheet Music,Instructions: The primary goal of transcribing...,https://buff.ly/3sTkxKf,1,3.779332,"[-3.801912713784345, -0.022581067935243348]",427
8,BBC Sounds - The Collection - Available Episodes,Tom Ravenscroft invites a different guest each...,https://www.bbc.co.uk/sounds/series/m0014cg0,1,3.739632,"[-3.763116445645012, -0.023484915235659536]",1284
9,Happy Carefree Pop ~ Background Music #1703868...,"Content generally available for advertising, p...",https://www.pond5.com/item/170386811,1,3.42779,"[-3.459732513727417, -0.0319429573599602]",215


In [78]:
#full (untruncated) URL of the top-ranked result
twitter_preds_cv_df['URL'].iloc[0]

'https://www.nkoda.com/publishers/Accademia-Lirica-Toscana-Domenico-Cimarosa'

In [None]:
#optional filtering step 
# show only the results whose scraped description contains the substring 'data'
mentions_data = twitter_preds_cv_df['Description'].str.contains('data')
twitter_preds_cv_df[mentions_data]

In [149]:
#run with LogReg 
# uses the notebook-level `path` instead of repeating the absolute folder literal,
# so the model is saved under the same root the data was loaded from
twitter_preds = lr_model_predict(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    p_input=links_to_add,
    p_feature='Description',
    filename='extended_even_model_twitter',
    path=path,
)

In [150]:
#filter results by positive value and score
# the discard terms are literal substrings, so they are escaped before being joined into
# one regex alternation (otherwise '.', '?' and '...' act as regex metacharacters)
discard_pattern = '|'.join(re.escape(term) for term in discard)
twitter_preds_df = (
    twitter_preds
    .loc[twitter_preds['Prediction'] == 1]
    # na=False keeps the masks boolean when a title or URL is missing
    .loc[lambda df: ~df['Title'].str.contains(discard_pattern, na=False)]
    .loc[lambda df: ~df['URL'].str.contains(discard_pattern, na=False)]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)
twitter_preds_df

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,Y Glomen - National Library of Wales Archives ...,Ffeil = File 2 ff. Llais a Phiano neu Delyn.,https://archives.library.wales/index.php/y-glomen,1,2.8623,"[-2.917864733229452, -0.05556447992768134]",44
1,[Music book] - Yale University Library,Found In:\n \nBeinecke Rare Book and...,https://bit.ly/35NIxRz,1,2.146859,"[-2.2573714379301535, -0.11051280738378409]",681
2,ITMA announce new collection of field recordin...,The New Demesne features singers and instrumen...,https://www.hotpress.com/music/itma-announce-n...,1,1.785169,"[-1.9402636164535878, -0.15509487615941947]",2750
3,(76) Page 66 - Guid new year to ane an' a' \r\...,‹‹‹ prev (75)\r\n\t\t\t\t\t\t\tPage 65\n \n\r\...,http://digital.nls.uk/90262121,1,1.5398,"[-1.7340700011122872, -0.19426984848625778]",719
4,(62) Page 2 - 'Twas within a furlong of Edenbo...,‹‹‹ prev (61)\r\n\t\t\t\t\t\t\tTitle page\n \n...,http://digital.nls.uk/94723316,1,1.407211,"[-1.6262062145259017, -0.2189950441223107]",736
5,(39) Page 13 \r\n\t\t\t - Glen Collection of p...,‹‹‹ prev (38)\r\n\t\t\t\t\t\t\t\n \n\r\n\t\t\t...,http://digital.nls.uk/87898604,1,1.245725,"[-1.4986076126730141, -0.2528827347692595]",728
6,Wittliff Collections acquires Stevie Ray Vaugh...,The Wittliff Collections at Texas State Univer...,https://ift.tt/UPzfGt1,1,1.226859,"[-1.4839877664440284, -0.2571292401815912]",3218
7,(73) Page 53 \r\n\t\t\t - Inglis Collection of...,‹‹‹ prev (72)\r\n\t\t\t\t\t\t\tPage 52\n \n\r\...,http://digital.nls.uk/94576220,1,1.194395,"[-1.4589774419717148, -0.26458275613200255]",733
8,Welsh Traditional Music | The National Library...,The study of Welsh traditional music here at t...,https://www.library.wales/collections/learn-mo...,1,1.10053,"[-1.3877332932240711, -0.28720288816632455]",1923
9,Radio Show | Dancemusicarchive,DANCE MUSIC ARCHIVE Home 1980s 1990s 2000s 201...,https://www.dancemusicarchive.com/radioshow,1,0.760206,"[-1.1438140955338771, -0.38360801152476526]",1558
