# Pipeline

## Imports

In [32]:
import os , json , csv , datetime , dateutil.parser , unicodedata , time
from datetime import datetime , date , timedelta 
# classifier
import pandas as pd
from pandas import Timestamp as timestamp
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer

# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline


## Variables

In [33]:
path = '../'

# descriptions training set -> v2 = musow+mji descriptions vs summarized scrapes from twitter searches  
archive_desc_training_v2 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

# twitter training set -> v1 = tweets from bigrams vs tweets for digital humanities and music company 
twitter_training_set_v1 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

#kw and sites to remove from url and title strings 
discard = ['youtu', '404', 'Not Found', 'bandcamp', 'ebay', 'It needs a human touch', 'Page not found', 'open.spotify.com', 'We\'re sorry...', 'Not Acceptable!', 'Access denied', '412 Error', 'goo.gl', 'instagr.am', 'soundcloud', 'apple.co', 'amzn', 'masterstillmusic', 'Facebook', 'facebook', 'sheetmusiclibrary.website', 'Unsupported browser', 'Last.fm', 'last.fm', 'amazon', 'tidal.com', 'tmblr.co', 'blogspot', 'dailymusicroll', 'PortalTaxiMusic', 'apple.news', 'yahoo.com', 'sheetmusicplus.com', 'musicnotes.com', 'musescore.com', 'etsy', 'nts.live', 'twitch.tv', 'YouTube', 'radiosparx.com', 'freemusicarchive.org', 'blastradio', 'opensea', 'mixcloud', 'catalog.works', 'nft', 'NFT', 'allmusic.com', 'foundation.app', 'Robot or human?', 'heardle', 'insession.agency', 'jobvite', 'career', 'docs.google.com/forms/', 'discogs.com', 'zora.co', 'play.google.com', 't.me', 'mintable.app', 'instagram', 'linkedin', 'forms.gle', 'vimeo', 'radioiita', 'spotify', 'event', 'mediafire', 'noodsradio', 'pinterest', 'rakuten', 'stackoverflow', 'fiverr', 'patreon']

## Functions

In [44]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Create a text classifier based on Logistic regression and TF-IDF. Use cross validation 
    
    Parameters
    ----------
    t_input: list
        dataframe including the training set
    t_feature: list
        df column, text of tweet or description of the resource
    target: list
        df column, [0,1] values
    cv_int: int
        the number of cross validation folding
    score_type: str
        precision or recall
    filename: str
        model file name
    path: str
        parent folder
    """
    # TODO eda to define max_features=1000
      
    #count_vect = CountVectorizer()
    #tfidf_transformer = TfidfTransformer() 
    #x_train = tfidf_transformer.fit_transform(x_count)
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000) 
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    
    # export
    model.fit(x_train, y_train)
    export_model = f'LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    pickle.dump(model, open(path+export_model, 'wb'))
    pickle.dump(tfidf_transformer, open(path+export_vectorizer, 'wb'))
    
    # report
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.
    
    Parameters
    ----------
    p_input: list
        dataframe including the prediction set
    p_feature: list
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    """
    export_model = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    model = pickle.load(open(export_model, 'rb'))
    tfidf_transformer = pickle.load(open(export_vectorizer, 'rb'))
  
    #result = loaded_model.score(X_test, Y_test)
    #x_new_count = count_vect.transform(p_input[p_feature])
    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)
    
    #results = [r for r in y_predict]
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:,1]
    result['Input Length'] = result[p_feature].str.len()
    return result

def create_url(keyword, start_date, end_date, max_results):
        search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from
        #change params based on the endpoint you are using
        query_params = {'query': keyword,
                        'start_time': start_date,
                        'end_time': end_date,
                        'max_results': max_results,
                        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
                        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                        'next_token': {}}
        return (search_url, query_params)
    
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
 
def append_to_csv(json_response, fileName):
    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    
    #setup usernames via includes
    username = {user['id']: user['username'] for user in json_response['includes']['users']}
    
    #Loop through each tweet
    for tweet in json_response['data']:

        # 1. Username
        author_id = tweet['author_id']
        user = username[author_id]

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Language
        lang = tweet['lang']

        # 4. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        #5. URLs 
        if ('entities' in tweet) and ('urls' in tweet['entities']):
            for url in tweet['entities']['urls']:
                url = [url['expanded_url'] for url in tweet['entities']['urls'] if 'twitter.com' not in url['expanded_url']]
                url = ', '.join(url)
        else:
            url = " "
        
        #6. Tweet text
        text = tweet['text'] 
        
        # Assemble all data in a list
        res = [user, created_at, lang, like_count, quote_count, reply_count, retweet_count, text, url]

        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1    
    
    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 
    
def twitter_search(token, input_keywords, start, end, mresults, mcount, path='../'):
    
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    # TODO clean tweets from @ and emoji
    bearer_token = token
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    input_keywords   
    start_list = start
    end_list =  end
    max_results = mresults
    total_tweets = 0

    # Create file
    file_name = str(end[0]).replace(':','-').replace('/','-')
    csvFile = open(f'{path}TWITTER_SEARCHES/{file_name}.csv', "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'url'])
    csvFile.close()

    for i in range(0,len(start_list)):
        # Inputs
        count = 0 # Counting tweets per time period
        max_count = mcount # Max tweets per time period
        flag = True
        next_token = None
        
        while flag:
            # Check if max_count reached
            if count >= max_count:
                break
            print("-------------------")
            print("Token: ", next_token)
            for keyword in input_keywords:
                url = create_url(keyword, start_list[i],end_list[i], max_results)
                json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
                result_count = json_response['meta']['result_count']

                if 'next_token' in json_response['meta']:
                    # Save the token to use for next call
                    next_token = json_response['meta']['next_token']
                    print("Next Token: ", next_token)
                    if result_count is not None and result_count > 0 and next_token is not None:
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, f'{path}TWITTER_SEARCHES/{file_name}.csv')
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)                
                # If no next token exists
                else:
                    if result_count is not None and result_count > 0:
                        print("-------------------")
                        print("Start Date: ", start_list[i])
                        append_to_csv(json_response, f'{path}TWITTER_SEARCHES/{file_name}.csv')
                        count += result_count
                        total_tweets += result_count
                        print("Total # of Tweets added: ", total_tweets)
                        print("-------------------")
                        time.sleep(5)

                    #Since this is the final request, turn flag to false to move to the next time period.
                    flag = False
                    next_token = None
                time.sleep(5)
    print("Total number of results: ", total_tweets)
    
    df = pd.read_csv(f'{path}TWITTER_SEARCHES/{file_name}.csv', keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "url": "string"})
    
    # clean the tweet from meentions and hashtags
    df['tweet'].replace( { r"@[A-Za-z0-9_]+" : '' }, inplace= True, regex = True)
    df['tweet'].replace( { r"#[A-Za-z0-9_]+" : '' }, inplace= True, regex = True)
    
    # remove tweets that are not in english
    df = df[df['lang'].isin(['en'])]
    
    return df

def scrape_links(link_list):
    links = pd.DataFrame(columns=['Title', 'Description', 'URL'])
    summarizer = pipeline("summarization")
    
    for link in link_list:
        URL = link
        page = None
        ARTICLE = ''
        try:
            x = requests.head(URL)
            content_type = x.headers["Content-Type"] if "Content-Type" in x.headers else "None"
            if ("text/html" in content_type.lower()):
                page = requests.get(URL)
        except Exception:
            pass
        
        if page:
            soup = BeautifulSoup(page.content, "html.parser")
            title = ' '.join([t.text for t in soup.find('head').find_all('title')]).strip() \
                if soup and soup.find('head') and soup.find('body') is not None \
                else URL
            
            try:
                downloaded = trafilatura.fetch_url(URL)
                ARTICLE = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
            except Exception:
                results = soup.find_all(['h1', 'p'])
                text = [result.text for result in results]
                ARTICLE = ' '.join(text)
            
            if ARTICLE is not None and len(ARTICLE) > 200:
                # text summarisation
                max_chunk = 500
                #removing special characters and replacing with end of sentence
                ARTICLE = ARTICLE.replace('.', '.<eos>')
                ARTICLE = ARTICLE.replace('?', '?<eos>')
                ARTICLE = ARTICLE.replace('!', '!<eos>')
                sentences = ARTICLE.split('<eos>')
                current_chunk = 0 
                chunks = []

                # split text to process
                for sentence in sentences:
                    if len(chunks) == current_chunk + 1: 
                        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                            chunks[current_chunk].extend(sentence.split(' '))
                        else:
                            current_chunk += 1
                            chunks.append(sentence.split(' '))
                    else:
                        chunks.append(sentence.split(' '))

                for chunk_id in range(len(chunks)):
                    chunks[chunk_id] = ' '.join(chunks[chunk_id])
                try:
                    res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
                    # summary
                    text = ' '.join([summ['summary_text'] for summ in res])
                except Exception:
                    text = ARTICLE
                    continue
            else:
                text = ARTICLE
            print(URL,title,'\n',text)
            new_row = {'Title': title, 'Description': text, 'URL': URL.strip()}
            links = links.append(new_row, ignore_index=True)
    return links


## Training twitter and descriptions classifiers

This is a ONE TIME operation. The models are pickled and loaded later to predict new results

In [45]:
# one time training on twitter
#twitter_training_model = lr_training(twitter_training_set_v1, 'tweet', 'Target', 10, 'precision', 'twitter', path)

# one time training on resources
#resource_training_model = lr_training(archive_desc_training_v2, 'Description', 'Target', 10, 'f1','resources',path)

report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      4379
           1       0.97      0.92      0.95      4379

    accuracy                           0.95      8758
   macro avg       0.95      0.95      0.95      8758
weighted avg       0.95      0.95      0.95      8758

report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       538
           1       0.94      0.96      0.95       786

    accuracy                           0.94      1324
   macro avg       0.94      0.94      0.94      1324
weighted avg       0.94      0.94      0.94      1324



## Query Twitter

Calls Twitter API with the list of keywords and returns the table `prediction_twitter`

In [35]:
token = 'AAAAAAAAAAAAAAAAAAAAAJgsNAEAAAAAQcsgbUnOJJmqmU483%2F8x6n9V1i8%3Df0qaEo9cV1sWP4eyNQ6E9s8BiRjvFTSN9mSqithe8uIXSNP68x'

# a selection of keywords from KEYWORDS/bg_summary.csv
# keywords = ['sheet music','music archive','music collection','music library','black music','sound recording','midi file','early music','sound archive','music information','music history','music research','musical score','song dataset','library music','music oral','score collection','digitized score']
keywords = ['sheet music','music archive']
input_keywords = [k+" -is:retweet" for k in keywords] 

today = date.today()
week_ago = today - timedelta(days=7)
start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]

mresults = 50 # for each keyword
mcount = 50 # for each timespan (only one, last week, here)

prediction_twitter = twitter_search(token, input_keywords, start, end, mresults, mcount, path)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpyzm912w83sgldpfb9tpxxch3vibh
Start Date:  2022-06-01T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  50
-------------------
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpyzm8yxvexdv4ml8a38i7w3zevzwd
Start Date:  2022-06-01T00:00:00.000Z
# of Tweets added from this response:  50
Total # of Tweets added:  100
-------------------
Total number of results:  100


## Classify tweets

In [40]:
# predictions
twitter_predictions = lr_predict(path, 'twitter', prediction_twitter, 'tweet')

tweet_predict_cv_df = twitter_predictions.copy().drop_duplicates()
tweet_predict_cv_df = tweet_predict_cv_df.loc[tweet_predict_cv_df['Prediction'] == 1]
tweet_predict_cv_df = tweet_predict_cv_df[~tweet_predict_cv_df.url.str.contains('|'.join(discard))]
tweet_predict_cv_df = tweet_predict_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)
tweet_predict_cv_df = tweet_predict_cv_df[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
tweet_predict_cv_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,"Same, but then I use it to hunt down moods, t...",1,9.879258,0.999949,150,
1,Tickets are available for a free day conferenc...,1,8.314758,0.999755,238,http://bit.ly/TradArchive
2,Musicians: Is there a good app that not only s...,1,7.562798,0.999481,122,
3,"Spotlighting , our Artist Collection is now av...",1,7.401332,0.999390,226,
4,60 Must-Have Albums from 1972 for a Vinyl Coll...,1,7.032431,0.999118,86,https://bit.ly/3f1GNLr
...,...,...,...,...,...,...
128,*throws music on shuffle to fall asleep* *mos...,1,0.245210,0.560997,117,
129,https://t.co/JlrA7jrviw THAT'S WHAT'S WRONG WI...,1,0.171323,0.542726,116,https://archive.org/details/78_thats-whats-wro...
130,"In the ongoing series ""I'm hopeless at sheet m...",1,0.154822,0.538628,294,
131,Babson undergrad here - this is the wrong a...,1,0.087492,0.521859,283,


## Scrape URLS

In [41]:
# get links from positive tweets results
twitter_link_list = [link for link in tweet_predict_cv_df['url'] if 'twitter' not in link]

# scrape URL list
links_to_add = scrape_links(twitter_link_list)

# remove empty descriptions 
links_to_add = links_to_add.fillna('None')
links_to_add = links_to_add[links_to_add.Description != '']
links_to_add = links_to_add[links_to_add.Description != 'None'].reset_index(drop=True)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


http://bit.ly/TradArchive Scottish Storytelling Centre 
 None


  links = links.append(new_row, ignore_index=True)


https://bit.ly/3f1GNLr Perceptions, Observations, & Musings of an Old Man: 60 Must-Have Albums from 1972  for a Vinyl Collection 
 


  links = links.append(new_row, ignore_index=True)


https://nickbob.com/2022/06/07/words-music-may-1965-launches-the-lou-reed-archive-unheard-music-marks-the-genesis-of-the-velvet-undergroundavailable-late-august/ Words & Music, May 1965 launches the Lou Reed Archive: unheard music marks the genesis of The Velvet Underground: available late August – Bucketfull Of Brains 
  Words & Music, May 1965, is the first title of the Lou Reed Archive Series . Reed was a self-taught guitarist, a shy, Long Island-bred artist who spent much of his youth seeking refuge in rock’n’roll .  Lou Reed and John Cale discovered a musical kinship with each other through the summer and fall of 1965 . The earliest known recordings of ‘Heroin’ and ‘I’m Waiting for the Man’ are among the gems on Words & Music .  Other notable selections include ‘Buttercup Song’, ‘Men of Good Fortune’ and ‘Buzz Buzz Buzz Buzz’ . Also included is a Cale-fronted edition of ‘Wrap Your Troubles in Dreams,’ which Nico recorded for her 1967 debut, Chelsea Girl .


  links = links.append(new_row, ignore_index=True)


https://lugarconparlantes.com/?p=1474 "Words & Music, May 1965", la primera entrega de Lou Reed Archive Series - ConParlantes 
  Light in the Attic Records anounces el lanzamiento of una serie de archivo de música de Lou Reed . The material recién remasterizado se extrae de cintas grabadas by Reed antes of The Velvet Underground . El álbum inicial, 'Words & Music, May 1965», llegará el 26 de agosto .


  links = links.append(new_row, ignore_index=True)


https://viinterchild.notion.site/Filler-videos-for-streaming-e2d3c27cf4c44493b30a779f9ee25b12 Notion – The all-in-one workspace for your notes, tasks, wikis, and databases. 
  @keyframes rotate {0% {-webkit-transform:rotate(0) translateZ(0); .@-mo-animation :rotate 1s linear infinite; . @keyframe: rotate; .@keyframes: rotate . @-moS: rotate. @-s: rotate, rotate.@-s. rotate .


  links = links.append(new_row, ignore_index=True)


http://ballsackradio.com ballsack radio :) 
  DJ BajaBlast's Twitter account has been running since November 29, 2020 . Cassette tapes for sale SOON ! Artist - name: "Buttcheek Broadcast (April Fools '22)


  links = links.append(new_row, ignore_index=True)


https://www.newframe.com/rangoato-hlasane-salutes-kwaitos-intellectual-legacy/ Rangoato Hlasane salutes kwaito’s intellectual legacy : New Frame 
  The tale of Tsodio was an obscure tale travelling through Sepedi orature . Johannes Mokgwadi recorded it in 1974, transforming it into a song that would open it to further reinterpretation . Paulina Mphoka was the first of many to place it in the social context of her time . Joe Shirimani’s version of the tale took on a more pop feel and lost much of its traditional sound .  Sesasedi Sa Tsodio is an attempt to explore the multiple contexts that have become attached to the tale . The film is set against the backdrop of three places that contribute significantly to South Africa’s incessant hunger for migrant labour: Meadowlands, Mamelodi and Mahikeng .  Sesasedi Sa Tsodio is part of Hlasane’s PhD thesis at the University of the Witwatersrand, which explores kwaito visual culture . The film is about the unfinished business of the genre and how

  links = links.append(new_row, ignore_index=True)


https://www.diamondzoc.com/collections/renaissance-chessboard-collection Renaissance Chessboard Collection – Diamondz Original Clothing 
  Hell Razah Music Inc.  and Diamondz O.C. C.  present . The Renaissance Chessboard Collection. The Renaissance chessboard Collection is a collection of clothing and accessories .


  links = links.append(new_row, ignore_index=True)


http://hermitage-crabapple.amebaownd.com/pages/811095/page_201701220951 SELECTION  （J.S.Bach etc.） | Hermitage's Crab Apple - Cat.M. 
 ! d o c t y p e h t m l >


  links = links.append(new_row, ignore_index=True)


https://shiptoshoremedia.com/collections/scarlet-moon-records Scarlet Moon Records | Ship to Shore PhonoCo. - Ship to Shore Media 
 Your Cart is Empty


  links = links.append(new_row, ignore_index=True)


https://shiptoshoremedia.com/collections/scarlet-moon-records Scarlet Moon Records | Ship to Shore PhonoCo. - Ship to Shore Media 
 Your Cart is Empty


  links = links.append(new_row, ignore_index=True)


https://shiptoshoremedia.com/collections/scarlet-moon-records Scarlet Moon Records | Ship to Shore PhonoCo. - Ship to Shore Media 
 Your Cart is Empty


  links = links.append(new_row, ignore_index=True)


https://www.washingtonpost.com/dc-md-va/2022/06/05/joe-bussard-record-collector-78s/ Joe Bussard amassed thousands of 78 records over a lifetime devoted to preserving early American roots music - The Washington Post 
 None


  links = links.append(new_row, ignore_index=True)


https://www.redkattwreckchords.com/rk7/checagou-vol-3 Checagou, Vol. 3 — red.katt.wreck.chords 
 Checagou, Vol. 3 Get the music: spotify amazon apple music iheart pandora deezer Released 2021Red.katt.wreck.chords


  links = links.append(new_row, ignore_index=True)


https://archive.org/details/lp_the-traditional-music-of-beech-mountain-no_various_0 The Traditional Music Of Beech Mountain, North Carolina, Vol 1: "The Older Ballads And Sacred Songs" : Various : Free Download, Borrow, and Streaming : Internet Archive 
  The Farmer's Curst Wife (Child 278) and The Two Sisters (Child 10) are among the most popular children's songs . The children's version of this week's Top Five is based on the book, "Awake, Awake"


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 26. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


http://www.largeheartedboy.com/blog/archive/2022/06/robert_mcgills.html Largehearted Boy: Robert McGill's Playlist for His Novel "A Suitable Companion for the End of Your Life" 
  Robert McGill's novel A Suitable Companion for the End of Your Life is a dark, speculative novel with echoes of The Handmaid’s Tale, set against the backdrop of a plague . McGill says he needed silence to fully immerse himself in the weird version of reality he was creating .  Gordon Lightfoot’s “If You Could Read My Mind” is the only song on this list that gets a shout-out in A Suitable Companion . Regina Spektor's “Better” explores how, as adults, we look to intimate relationships to help fix us and others .  Robert McGill's novel, A Suitable Companion, is inspired by the songs of Belle and Sebastian and Mor Mor MorMor . McGill says the songs inspired his novel's opening scenario, in which Regan condemns herself to living out her final days while sealed in her house, felt more relevant to contemporary life 

  links = links.append(new_row, ignore_index=True)


http://www.resident-music.com/productdetails&product_id=90983 steps - platinum collection - resident 
  25 years of Steps hits come together on one package to celebrate the band’s incredible success . Includes all their career highlights plus two brand-new tracks recorded exclusively for this release .


  links = links.append(new_row, ignore_index=True)


https://www.mymediaalexa.com My Media for Amazon Alexa 
  My Media for Alexa installs a tiny media server on your computer that integrates with the ease and power of Amazon Alexa to give you voice control over your media collection . My Media lets you stream your music collection to your Amazon Echo or Amazon Dot without having to upload all your collection to the Cloud . This keeps your music under your control .  The Sonos One does not currently support all the Audio Player API interfaces that My Media for Alexa requires . Alexa streams your music from your music library therefore your computer must be on whilst you play your music .  Amazon does not officially provide access to third party skills to fully support Alexa multi-room commands . However, click here for a workaround that enables multiroom in a different way . You make a single payment to use the My Media for Alexa app for a year .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


https://lttr.ai/xzV9 Emma Peterson Releases Her Debut Album "Just For The Record" 
  Emma Peterson's new album Just For The Record is a collection of 6 songs . Peterson is from Winnipegosis, Manitoba . The 6 song set is a party anthem for anyone with small town roots .  Emma Peterson's debut album, Just For The Record, is a pop-country collection of engaging, crowd-pleasing songs from an up-and-coming talent . Peterson is no stranger to the Canadian country music scene, having accumulated multiple Manitoba Country Music Award nominations, including a win in the Emerging Artist category .  A day without music is a waste of day, he believes, and can usually be found cranking some of his favourite Canadian artists like James Barker Band, Madeline Merlo, and Brett Kissel .


  links = links.append(new_row, ignore_index=True)


http://dlvr.it/SRmcYx Alexa Morris Rises To VP At Warner Chappell Music 
  Alexa Morris Rises To VP At Warner Chappell Music . Her duties involve drafting and negotiating publishing and administration agreements, joint ventures and catalog acquisitions .


  links = links.append(new_row, ignore_index=True)


https://www.washingtonpost.com/dc-md-va/2022/06/05/joe-bussard-record-collector-78s/?tid=ss_tw Joe Bussard amassed thousands of 78 records over a lifetime devoted to preserving early American roots music - The Washington Post 
 None


  links = links.append(new_row, ignore_index=True)


http://www.largeheartedboy.com/blog/archive/2022/06/shorties_an_exc_113.html Largehearted Boy: Shorties (An Excerpt from Saeed Jones' Forthcoming Collection, A Conversation Between Jeff Tweedy and Terry Allen, and more) 
  The Oxford American shared an excerpt from Saeed Jones' poetry collection Alive at the End of the World . Texas Monthly shared a conversation between Jeff Tweedy and Terry Allen . The Current listed the best songs of 2022 so far .


  links = links.append(new_row, ignore_index=True)


http://eepurl.com/h3RcRf Black Diamond Film & Music Festival June 17th - 19th @ Crescent Cultural Center 
 None


  links = links.append(new_row, ignore_index=True)


http://dlvr.it/SRmXZ5 Alexa Morris Rises To VP At Warner Chappell Music 
  Alexa Morris Rises To VP At Warner Chappell Music . Her duties involve drafting and negotiating publishing and administration agreements, joint ventures and catalog acquisitions .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


http://dlvr.it/SRlnrt Anuel AA, the King of Latin Trap Music, Launches a New NFT Collection - KAZI MAGAZINE 
  Anuel AA, the Puerto Rican rapper and singer who’s often credited with popularizing the Latin trap subgenre, has launched a new collection of NFTs . The move is indicative of an emerging trend in which musicians are using blockchain technology to monetize their work .  His NFT collection’s launch will coincide with his next world tour, where Anuel hopes to meet more fans globally . His dream is to inspire more people to remain true to themselves, chasing after their dreams .


  links = links.append(new_row, ignore_index=True)


http://www.cetvphilippines.com/moira-dela-torre-angela-ken-bini-and-bgyos-songs-to-send-to-the-moon-in-2023/ Moira Dela Torre, Angela Ken, BINI, and BGYO’s songs to send to the moon in 2023 – CETV Philippines 
  Moira Dela Torre, Angela Ken, BINI and BGYO’s music will be sent to the moon in 2023 as part of the Lunar Codex . The Lunar Codex is an archive of art, literature, music, and film bound for the Moon .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 91. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


https://archive.org/details/dmst2007-05-15 Do Make Say Think Live at The Music Box on 2007-05-15 : Free Download, Borrow, and Streaming : Internet Archive 
  Due to a planned power outage on Friday, 1/14, between 8am-1pm PST, some services may be impacted . The outage is scheduled for 8am to 1pm PST .


  links = links.append(new_row, ignore_index=True)


https://archive.org/details/78_lover-boy_sherry-parsons-lewis-scott-joe-leahy_gbia0284409b LOVER BOY : SHERRY PARSONS : Free Download, Borrow, and Streaming : Internet Archive 
  Four stylii were used to transfer this record . The preferred versions suggested by an audio engineer at George Blood, L. P. have been copied to have the more friendly filenames .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 28. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


https://bit.ly/3x2eKVN Lunchtime Vocal & Piano Recital  ft. Harshita Parekh &  Nathan Stubbings Tickets, Wed 15 Jun 2022 at 12:30 | Eventbrite 
  Lunchtime Vocal & Piano Recital ft.  Harshita Parekh & Nathan Stubbings (voice) for a free lunchtime recital .  Advanced booking is advised, but a limited number of spaces will be available to walk up on the day . There will be no rush to book ahead of the event, but there will be plenty of places for walk-ups .


  links = links.append(new_row, ignore_index=True)


https://dcpla.ie/3zmtBc8 eResources | Dublin City Council 
  Through Dublin City Libraries you can access a wide variety of online resources such as ebooks, databases, archives, dictionaries, publications and music collections . Some of these online resources are only available in branches, but lots can be accessed wherever you may be with your library card . As part of your membership, you can also avail of online courses .  Free music and art instruction through self-paced video lessons from Grammy Award-winning music and artistic professionals . Free access to both beginner and advanced lessons for instruments, voice, art and music theory .  Dublin City Council Digital Repository features photographs, postcards, letters, maps and ephemeral material . Traffic jams during the 1974 CIE Bus Strikes, and jubilant Heffo’s army supporters are among 43,000 historic photographs and documents that are freely available online .  If your barcode begins TEMP, visit your local library to upgrade 

  links = links.append(new_row, ignore_index=True)


https://archive.org/details/78_looks-like-the-jokes-on-me_jaye-pace-eddy-wilson-leo-lefleur-herman-stein_gbia0284412b LOOKS LIKE THE JOKE'S ON ME : JAYE PACE : Free Download, Borrow, and Streaming : Internet Archive 
  Four stylii were used to transfer this record . The preferred versions suggested by an audio engineer at George Blood, L. P. have been copied to have the more friendly filenames .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 68. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


https://www.aweber.com/b/O_HCM Discovered Insider brings you the best videos from Emerging Creators 
  Yung Martez is a buzzing new artist out of Houston, Texas ready to explode from the streets to the charts in 2022 . Check out this official music video for ‘Angels’ from is latest album release ELEVATION . The Art of Ratchet is a Texas super group consisting of Mr. Rack Daddy and Dallas legend himself himself FAT Pimp .  APEX Legends highlights from around the world!                 |.                |                |          Ś��            :Jumpmaster Episode 9.                :                |                |                :                             ‚�Ś™: Pundits: Jumpmaster: “Criminals”, ‚”; ‚�‚’s “’�s”: ‘’; ‘Crimins’: �


  links = links.append(new_row, ignore_index=True)


https://archive.org/details/muscletough2022-06-04.cmc65xt.sbd.matrix.flac24 Muscle Tough Live at Park City Music Hall on 2022-06-04 : Free Download, Borrow, and Streaming : Internet Archive 
  The normal 16 bit CD quality version of this show is available for download/CD burning . This is a 24 bit record, NOT intended for CD burning . Muscle Tough opened for DJ Logic & Friends .


  links = links.append(new_row, ignore_index=True)


https://archive.org/details/muscletough2022-06-04.cmc65xt.sbd.matrix.flac16 Muscle Tough Live at Park City Music Hall on 2022-06-04 : Free Download, Borrow, and Streaming : Internet Archive 
  Muscle Tough recorded, edited and mixed by Eric McRoberts 06/06-07/2022 - FOH mix by Eric DiBerardino . Set one: Bummer Boys, Bread Dread and Future Suture Selfie .


  links = links.append(new_row, ignore_index=True)


https://rekkerd.org/save-50-on-passages-lofi-melodies-analog-one-shots-hazel-by-waxie-music-library/ Save 50% on Passages Lofi Melodies, Analog One Shots & Hazel by Waxie Music Library 
  Prime Loops has launched a limited time promotion on a selection of Waxie Music Library sample packs . Geared towards Lofi, Chill Hop, Ambient, Boom Bap, and Trap beats, Waxie Vol. 1 – Passages contains 12 original compositions .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 95. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


https://www.popsike.com/Music-For-Advertising-Joan-John-Shakespeare-Private-Demo-Library-1st-UK-LP-60s/312095070017.html popsike.com - Login - vinyl records price guide 
  You first have to click on this link in order to validate your registration . Did you enter a valid email-address when registering ?  Maybe the email landed in your Spam- folder ?  Are you using the correct username/password ? Username and password are CaSe SeNsItIvE !


  links = links.append(new_row, ignore_index=True)


https://www.allaccess.com/story/218859 2022 'iHeartRadio Music Festival' Lineup Revealed 
  The two-day Main Stage will feature AVRIL LAVIGNE, BLACK EYED PEAS, HALSEY, LL COOL J FEAT, DJ Z-TRIP, LUKE COMBS, MAREN MORRIS, MEGAN THEE STALLION, MORGAN WALLEN, NICKI MINAJ, PAT BENATAR & NEIL GIRALDO, SAM SMITH, THE BLACK KEYS and more . The event at LAS VEGAS' T-MOBILE ARENA will be hosted by R


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 116. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


http://eepurl.com/h3Nj5z Looking for New Age Music for Cable TV Show Placements! 
  New Age music for multiple cable TV shows . Can be either songs with vocals or instrumentals . Vocals must be clean!  We have placed hundreds of songs on multiple reality TV shows and they're always looking for more .


  links = links.append(new_row, ignore_index=True)


http://dlvr.it/SRn7lx Endurance Music Group Re-Signs Songwriter/Artist Jake Rose 
  Endurance Music Group Re-Signs Songwriter/Artist Jake Rose . Jake Rose has penned recent singles and cuts by JIMMIE ALLEN, FLORIDA GEORGIA LINE, BRIAN KELLEY, BLAKE SHELTON and more .


  links = links.append(new_row, ignore_index=True)


http://dlvr.it/SRn4l8 Endurance Music Group Re-Signs Songwriter/Artist Jake Rose 
  Endurance Music Group Re-Signs Songwriter/Artist Jake Rose . Jake Rose has penned recent singles and cuts by JIMMIE ALLEN, FLORIDA GEORGIA LINE, BRIAN KELLEY, BLAKE SHELTON and more .


  links = links.append(new_row, ignore_index=True)
Your max_length is set to 120, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


https://wavefarm.org/wf/archive/yvjqh0 Wave Farm | Ginger Radio Hour: Kelefa Sanneh 
  Host Justin Maiman, from Cairo, shares his latest inspirations on the "Ginger Radio Hour" The show features music, field recordings, performances, and interviews, primarily with people in and around the Catskill Mountains of New York . This week, an all-music hour features tracks inspired by book "Major Labels: A History of Popular Music in Seven Genres"


  links = links.append(new_row, ignore_index=True)


https://archive.org/details/78_thats-whats-wrong-with-jimmy_sherry-parsons-carr-david-joe-leahy_gbia0284409a THAT'S WHAT'S WRONG WITH JIMMY : SHERRY PARSONS : Free Download, Borrow, and Streaming : Internet Archive 
  Four stylii were used to transfer this record . The preferred versions suggested by an audio engineer at George Blood, L. P. have been copied to have the more friendly filenames .


  links = links.append(new_row, ignore_index=True)


http://dlvr.it/SRm8z2 2022 'iHeartRadio Music Festival' Lineup Revealed 
  The two-day Main Stage will feature AVRIL LAVIGNE, BLACK EYED PEAS, HALSEY, LL COOL J FEAT, DJ Z-TRIP, LUKE COMBS, MAREN MORRIS, MEGAN THEE STALLION, MORGAN WALLEN, NICKI MINAJ, PAT BENATAR & NEIL GIRALDO, SAM SMITH, THE BLACK KEYS and more . The event at LAS VEGAS' T-MOBILE ARENA will be hosted by R


  links = links.append(new_row, ignore_index=True)


## Classify web resources

In [42]:
resources_predictions = lr_predict(path, 'resources', links_to_add, 'Description')

resources_preds_cv_df = resources_predictions.copy()
resources_preds_cv_df = resources_preds_cv_df.loc[resources_preds_cv_df['Prediction'] == 1]
resources_preds_cv_df = resources_preds_cv_df[~resources_preds_cv_df.Title.str.contains('|'.join(discard))]
resources_preds_cv_df = resources_preds_cv_df[~resources_preds_cv_df.URL.str.contains('|'.join(discard))]
resources_preds_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,Renaissance Chessboard Collection – Diamondz O...,Hell Razah Music Inc. and Diamondz O.C. C. ...,https://www.diamondzoc.com/collections/renaiss...,1,6.514313,0.99852,178
1,Notion – The all-in-one workspace for your not...,@keyframes rotate {0% {-webkit-transform:rota...,https://viinterchild.notion.site/Filler-videos...,1,4.383168,0.987668,202
2,SELECTION （J.S.Bach etc.） | Hermitage's Crab ...,! d o c t y p e h t m l >,http://hermitage-crabapple.amebaownd.com/pages...,1,4.383168,0.987668,25
3,"Save 50% on Passages Lofi Melodies, Analog One...",Prime Loops has launched a limited time promo...,https://rekkerd.org/save-50-on-passages-lofi-m...,1,3.831362,0.97878,230
4,"Checagou, Vol. 3 — red.katt.wreck.chords","Checagou, Vol. 3 Get the music: spotify amazon...",https://www.redkattwreckchords.com/rk7/checago...,1,3.465472,0.969689,115
5,Largehearted Boy: Shorties (An Excerpt from Sa...,The Oxford American shared an excerpt from Sa...,http://www.largeheartedboy.com/blog/archive/20...,1,3.012738,0.953146,231
6,Endurance Music Group Re-Signs Songwriter/Arti...,Endurance Music Group Re-Signs Songwriter/Art...,http://dlvr.it/SRn4l8,1,2.68003,0.935838,184
7,Endurance Music Group Re-Signs Songwriter/Arti...,Endurance Music Group Re-Signs Songwriter/Art...,http://dlvr.it/SRn7lx,1,2.68003,0.935838,184
8,Alexa Morris Rises To VP At Warner Chappell Music,Alexa Morris Rises To VP At Warner Chappell M...,http://dlvr.it/SRmcYx,1,2.609467,0.931468,180
9,Alexa Morris Rises To VP At Warner Chappell Music,Alexa Morris Rises To VP At Warner Chappell M...,http://dlvr.it/SRmXZ5,1,2.609467,0.931468,180
