# Pipeline

- This notebook is used to test various aspects of the Twitter pipeline.

## Imports

In [1]:
path = '../'
import csv , dateutil.parser , time
from datetime import date , timedelta 
import os
# classifier
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline
#cleaning 
import emoji
import re
#functions
from PYTHON_FILES.LogReg_Searches import LogRegSearches

## Variables

In [20]:
# Descriptions training set (v2): musow+mji descriptions vs summarized scrapes from twitter searches
archive_desc_training_v2 = pd.read_pickle(f'{path}LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

# Twitter training set (v1): tweets from bigrams vs tweets for digital humanities and music company
twitter_training_set_v1 = pd.read_pickle(f'{path}LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

## Functions

In [3]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Create a text classifier based on Logistic regression and TF-IDF. Use cross validation

    The fitted model and the fitted vectorizer are pickled to
    ``{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl`` and
    ``{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl``, and a
    cross-validated classification report is printed.

    Parameters
    ----------
    t_input:
        dataframe of the training set
    t_feature:
        df column, text of tweet or description of the resource
    target:
        df column, [0,1] values
    cv_int: int
        the number of cross validation folding
    score_type: str
        precision or recall
    filename: str
        model file name
    path: str
        parent folder

    Returns
    -------
    LogisticRegressionCV
        the model fitted on the whole training set
    """
    # TODO eda to define max_features=1000
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000)
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    model.fit(x_train, y_train)

    # export: context managers make sure both pickles are flushed and closed
    export_model = f'LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    with open(path + export_model, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(path + export_vectorizer, 'wb') as vectorizer_file:
        pickle.dump(tfidf_transformer, vectorizer_file)

    # report
    # NOTE(review): the vectorizer above was fitted on every row before the
    # folds are made, so the held-out folds share vocabulary/IDF statistics
    # with the training folds.
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource

    Returns
    -------
    DataFrame
        a copy of ``p_input`` with four extra columns: 'Prediction',
        'Score' (decision function value), 'Probability' (second column of
        ``predict_proba``) and 'Input Length' (character count of ``p_feature``)
    """
    export_model = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'

    # NOTE: unpickling executes arbitrary code; only load model files that
    # were produced by this project.
    with open(export_model, 'rb') as model_file:
        model = pickle.load(model_file)
    with open(export_vectorizer, 'rb') as vectorizer_file:
        tfidf_transformer = pickle.load(vectorizer_file)

    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)

    # Work on a copy so the caller's dataframe is left untouched
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:, 1]
    result['Input Length'] = result[p_feature].str.len()
    return result

def create_url(keyword, start_date, end_date, max_results):
    """Build the endpoint URL and query parameters for a full-archive tweet search.

    Returns a ``(search_url, query_params)`` tuple. ``query_params`` carries the
    search window, the page size and the fixed lists of expansions/fields to
    request; its 'next_token' entry starts out as an empty dict.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # What to search for (change these based on the endpoint you are using)
    query_params = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
    }
    # Which objects and fields to return for every hit
    query_params.update({
        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
    })
    query_params['next_token'] = {}
    return endpoint, query_params
    
def connect_to_endpoint(url, headers, params, next_token=None, timeout=30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url: str
        endpoint to call
    headers: dict
        request headers (carries the bearer token)
    params: dict
        query parameters; its 'next_token' entry is overwritten in place
    next_token: str or None
        pagination token of the page to fetch
    timeout: float
        seconds to wait for the server before giving up, so that a stalled
        connection cannot block the search loop forever

    Raises
    ------
    Exception
        with ``(status_code, response_text)`` when the status is not 200
    """
    params['next_token'] = next_token   # params object received from create_url function
    response = requests.request("GET", url, headers=headers, params=params, timeout=timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def give_emoji_free_text(text):
    """Return ``text`` with each emoji replaced by an empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet found in a search response.

    Each row holds: username, creation time, language, like / quote / reply /
    retweet counts, emoji-free tweet text and the tweet's non-Twitter links
    joined with ', ' (an empty string when there are none).

    Parameters
    ----------
    json_response: dict
        decoded API response; reads ``['data']`` and ``['includes']['users']``
    fileName: str
        CSV file to append to (created when missing)
    """
    # Setup usernames via includes
    usernames = {user['id']: user['username'] for user in json_response['includes']['users']}

    counter = 0
    # The context manager closes the file even if a tweet is malformed
    with open(fileName, "a", newline="", encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)

        for tweet in json_response['data']:
            # 1. Username
            user = usernames[tweet['author_id']]

            # 2. Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # 3. Language
            lang = tweet['lang']

            # 4. Tweet metrics
            metrics = tweet['public_metrics']
            retweet_count = metrics['retweet_count']
            reply_count = metrics['reply_count']
            like_count = metrics['like_count']
            quote_count = metrics['quote_count']

            # 5. URLs: every expanded link that does not point back to Twitter.
            # Computed from scratch for each tweet so a tweet without links
            # can never inherit the links of the previous one.
            link_entities = tweet.get('entities', {}).get('urls', [])
            url = ', '.join(
                entity['expanded_url']
                for entity in link_entities
                if 'twitter.com' not in entity['expanded_url']
            )

            # 6. Tweet text
            text = give_emoji_free_text(tweet['text'])

            # Assemble all data in a list and append it to the CSV file
            csv_writer.writerow([user, created_at, lang, like_count, quote_count,
                                 reply_count, retweet_count, text, url])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

def twitter_search(token, keyword, start, end, mresults, mcount, file_name):
    """Search tweets for one keyword over several time periods and store them.

    Raw results are appended to
    ``{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv``; the cleaned,
    English-only, URL-deduplicated table is pickled next to it as
    ``{file_name}.pkl``. Relies on the module-level ``path``.

    Parameters
    ----------
    token: str
        bearer token for the API
    keyword: str
        search query
    start: list of str
        start time of every search period
    end: list of str
        end time of every search period, paired with ``start`` by position
    mresults: int
        max tweets per json response
    mcount: int
        max tweets per search period
    file_name: str
        base name (no extension) of the output files

    Returns
    -------
    None
    """
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    if len(end) < len(start):
        # Fail before any API call instead of part-way through the periods
        raise ValueError('every start time needs a matching end time')

    headers = {"Authorization": "Bearer {}".format(token)}
    csv_path = f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv'
    total_tweets = 0

    # Create the file; only write the header when the file is new or empty so
    # that re-running a search does not plant header rows between the data
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, "a", newline="", encoding='utf-8') as csv_file:
        if needs_header:
            csv.writer(csv_file).writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'URL'])

    for period_start, period_end in zip(start, end):
        count = 0  # Counting tweets per time period
        next_token = None
        more_pages = True

        # Stop when the last page was read or the per-period maximum is reached
        while more_pages and count < mcount:
            print("-------------------")
            print("Token: ", next_token)
            search_url, query_params = create_url(keyword, period_start, period_end, mresults)
            json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
            meta = json_response['meta']
            result_count = meta['result_count']
            has_results = result_count is not None and result_count > 0

            if 'next_token' in meta:
                # Save the token to use for next call
                next_token = meta['next_token']
                print("Next Token: ", next_token)
                save_page = has_results and next_token is not None
            else:
                # Final request for this period: move on afterwards
                more_pages = False
                next_token = None
                save_page = has_results
                if save_page:
                    print("-------------------")

            if save_page:
                print("Start Date: ", period_start)
                append_to_csv(json_response, csv_path)
                count += result_count
                total_tweets += result_count
                print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                print("-------------------")
                time.sleep(5)
            time.sleep(5)
    print("Total number of results:", total_tweets)

    df = pd.read_csv(csv_path, keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "URL": "string"})

    # Strip @mentions and '#' signs from the tweet text. Assign the result
    # back: an inplace replace on a selected column is a chained assignment
    # and leaves the dataframe unchanged under pandas copy-on-write.
    df['tweet'] = df['tweet'].replace({r"@[A-Za-z0-9_]+": ''}, regex=True)
    df['tweet'] = df['tweet'].replace({r"#": ''}, regex=True)

    # remove tweets that are not in english, have empty URLs, or have duplicate URLs
    df = df[df['lang'].isin(['en'])]
    df = df[df.URL != '']
    # .copy() so the column added below is written to an independent frame
    df = df.drop_duplicates(['URL'], keep='last').copy()

    # add a column for the search keyword
    df['Search KW'] = keyword

    # pickle df for reuse
    df.to_pickle(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.pkl')

def _split_into_chunks(article, max_chunk=500):
    """Split text at '.', '?' and '!' and pack the sentences into chunks of at most ``max_chunk`` words."""
    marked = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')
    chunks = []
    for sentence in marked.split('<eos>'):
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]


def scrape_links(link_list, pred_df, filename):
    """ Scrape links from classified tweets, save scrapes, combine them w/ tweets and return a DF for description classification.

    Only links whose HEAD response announces ``text/html`` are downloaded.
    Page text longer than 200 characters is summarised; shorter text is used
    as the description unchanged. The merged table is pickled to
    ``{path}LOGREG_RELEVANCE/SCRAPES/{filename}.pkl`` (module-level ``path``).

    Parameters
    ----------
    link_list: list of str
        URLs to scrape
    pred_df:
        dataframe of classified tweets with a 'URL' column to merge on
    filename: str
        name (no extension) of the pickle holding the merged result

    Returns
    -------
    DataFrame
        ``pred_df`` inner-merged on 'URL' with the scraped 'Title' and
        'Description' columns
    """
    summarizer = pipeline("summarization", model='sshleifer/distilbart-cnn-12-6')
    # Fixed summary bounds. They used to be derived from a variable that was
    # unbound for the first link (silently dropping it) or left over from
    # the previous link.
    # NOTE(review): 120 matches the max_length reported in this notebook's
    # recorded scraping output; 30 is an assumed lower bound - confirm both.
    summary_min_length = 30
    summary_max_length = 120
    rows = []

    for link in link_list:
        URL = link
        page = None
        try:
            head = requests.head(URL, timeout=15)
            content_type = head.headers.get("Content-Type", "None")
            if "text/html" in content_type.lower():
                page = requests.get(URL, timeout=15)
        except Exception:
            # Best effort: an unreachable or malformed link is simply skipped
            page = None

        if not page:
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        if soup and soup.find('head') and soup.find('body') is not None:
            title = ' '.join(t.text for t in soup.find('head').find_all('title')).strip()
        else:
            title = URL

        try:
            downloaded = trafilatura.fetch_url(URL)
            article = trafilatura.extract(downloaded, include_comments=False, include_tables=True, target_language='en', deduplicate=True)
        except Exception:
            # Fall back to the headings and paragraphs of the parsed page
            article = ' '.join(tag.text for tag in soup.find_all(['h1', 'p']))

        if article is not None and len(article) > 200:
            # text summarisation
            chunks = _split_into_chunks(article, max_chunk=500)
            try:
                res = summarizer(chunks, min_length=summary_min_length, max_length=summary_max_length, do_sample=False)
            except Exception as err:
                # As before, a link that cannot be summarised is left out
                print(f'Summarisation failed for {URL}: {err}')
                continue
            text = ' '.join(summ['summary_text'] for summ in res)
        else:
            text = article

        print(URL)
        rows.append({'Title': title, 'Description': text, 'URL': URL.strip()})

    # Build the table once instead of concatenating inside the loop
    links = pd.DataFrame(rows, columns=['Title', 'Description', 'URL'])
    discard = ['None', '! D O C T Y P E h t m l >', '! d o c t y p e h t m l >', '! D O C T Y P E H T M L >']
    links = links.fillna('None')
    links = links[~links.Description.str.contains('|'.join(discard))]
    twitter_scrapes_preds = pd.merge(pred_df, links, on='URL')
    twitter_scrapes_preds.to_pickle(f'{path}LOGREG_RELEVANCE/SCRAPES/{filename}.pkl')
    print(len(twitter_scrapes_preds))
    return twitter_scrapes_preds

def twitter_predictions(path, filename, p_input, p_feature, score, discard, filter):
    """ Predict relevant tweets using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: variable
        a list of terms (joined into one regular expression) to check URLs
        against to remove tweets; an empty list removes nothing
    filter: str
        a string against which to further filter predictions, '' to skip

    Returns
    -------
    tuple
        ``(preds, twitter_link_list)``: the filtered predictions sorted by
        descending 'Score', and the list of their 'URL' values
    """
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['tweet'], keep='last')
    preds = preds.loc[preds['Prediction'] == score]

    # An empty pattern matches every URL and would throw away all rows,
    # so only filter when there is at least one non-empty term
    discard_terms = [term for term in discard if term]
    if discard_terms:
        preds = preds[~preds.URL.str.contains('|'.join(discard_terms))]

    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds = preds[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'URL', 'Search KW']]
    if filter != '':
        preds = preds[preds['tweet'].str.contains(filter)]
        preds = preds.reset_index(drop=True)
    twitter_link_list = list(preds['URL'])
    print('Total tweets classified:', len(preds))
    return preds, twitter_link_list

def resource_predictions(path, filename, p_input, p_feature, score, discard, savefile):
    """ Predict relevant URL descriptions using a pickled model based on Logistic regression and TF-IDF.

    The kept predictions are also written to
    ``{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv``.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: variable
        a list of terms (joined into one regular expression) to check URLs
        and titles against to remove rows; an empty list removes nothing
    savefile: str
        name for the final csv to be saved under

    Returns
    -------
    DataFrame or str
        the filtered predictions sorted by descending 'Score', or the message
        'Sorry no URLs to classify!' when there is nothing to classify
    """
    # The message is about missing input rows, so check the prediction set
    # as well as the (previously checked) model file name
    if len(filename) == 0 or len(p_input) == 0:
        return 'Sorry no URLs to classify!'

    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['Description'], keep='last')
    preds = preds.loc[preds['Description'] != '']
    preds = preds.loc[preds['Prediction'] == score]

    # An empty pattern matches everything and would throw away all rows,
    # so only filter when there is at least one non-empty term
    discard_terms = [term for term in discard if term]
    if discard_terms:
        pattern = '|'.join(discard_terms)
        preds = preds[~preds.URL.str.contains(pattern)]
        preds = preds[~preds.Title.str.contains(pattern)]

    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds.to_csv(f'{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv')
    print(preds)
    return preds

def tweets_to_classify(path, filetype):
    """ Merge all tweet searches together.

    Parameters
    ----------
    path:
        parent folder containing TWITTER_SEARCHES/RAW_SEARCHES/
    filetype:
        the ending of the files to load, you can call just .pkl or also the date tag from file names

    Returns
    -------
    DataFrame
        the rows of every matching pickle, each file included exactly once,
        with a fresh 0..n-1 index (an empty dataframe when nothing matches)
    """
    raw_searches = path + 'TWITTER_SEARCHES/RAW_SEARCHES/'

    # Load matching files only; files with another ending must not add
    # (or repeat) any rows. Sorted for a reproducible row order.
    frames = [
        pd.read_pickle(raw_searches + name)
        for name in sorted(os.listdir(raw_searches))
        if name.endswith(filetype)
    ]
    merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print('Total tweets to classify:', len(merged))
    return merged

## Training twitter and descriptions classifiers

This is a ONE-TIME operation. The models are pickled and loaded later to predict new results.

In [None]:
# One-time training of the tweet classifier on the 'tweet' column with 'Target' labels.
# NOTE(review): LogRegSearches.train comes from PYTHON_FILES.LogReg_Searches and takes
# different positional arguments than the lr_training function defined in this
# notebook - confirm the meaning of 150 / 'precision' / 10 there.
twitter_training_model = LogRegSearches.train(twitter_training_set_v1, 'tweet', 'Target', 150, 'precision', 10, 'twitter_v1_150maxfeats', path)

# One-time training on resources (disabled; uses the notebook-local lr_training)
#resource_training_model = lr_training(archive_desc_training_v2, 'Description', 'Target', 10, 'f1','resources_v2',path)

## Query Twitter

Calls the Twitter API with the list of keywords and returns the table `prediction_twitter`.

In [None]:
# Load the bearer token from the environment so the secret is not stored in the notebook.
# NOTE(review): the token that used to be written here is exposed in the file's
# history and should be revoked.
token = os.environ['TWITTER_BEARER_TOKEN']
#a selection of keywords based on MJI and musoW datasets

#choose KW list 
##OG keyword list
keywords = ['oral history', 'music magazine', 'sound archive', 'music history', 'music culture', 'music research', 'sheet music', 'music library', 'digital library', 'music collection', 'digital collection', 'sound recording', 'midi file', 'audio file', 'music information', 'musical score', 'digital score', 'song dataset', 'digital edition', 'digital archive', 'digital library', 'music archive', 'music library', 'archive collection']

##best performing list based on v1 tests
better_keywords = ['audio file', 'music archive', 'music collection', 'music library', 'sheet music', 'sound archive', 'sound recording']

#custom timeframe for searching
start = ['2022-01-01T00:00:00.000Z', '2022-02-01T00:00:00.000Z', '2022-03-01T00:00:00.000Z']
end = ['2022-01-31T23:59:59.000Z', '2022-02-28T23:59:59.000Z', '2022-03-31T23:59:59.000Z']

#choose search option 
## search last week
#twitter_search_weekly(token, better_keywords, 500, 500)
## search custom timeframe
# NOTE(review): twitter_search_custom is not defined in the cells shown here - confirm it is in scope.
twitter_search_custom(token, better_keywords, start, end, 500, 500)

In [None]:
# Load the bearer token from the environment so the secret is not stored in the notebook.
# NOTE(review): the token that used to be written here is exposed in the file's
# history and should be revoked.
token = os.environ['TWITTER_BEARER_TOKEN']
#a selection of keywords based on MJI and musoW datasets

#OG keyword list
keywords = ['oral history', 'music magazine', 'sound archive', 'music history', 'music culture', 'music research', 'sheet music', 'music library', 'digital library', 'music collection', 'digital collection', 'sound recording', 'midi file', 'audio file', 'music information', 'musical score', 'digital score', 'song dataset', 'digital edition', 'digital archive', 'digital library', 'music archive', 'music library', 'archive collection']

#best performing list based on v1 tests
better_keywords = ['audio file', 'music archive', 'music collection', 'music library', 'sheet music', 'sound archive', 'sound recording']

#transfer keywords to input list w/ additional twitter parameters, e.g. no retweets
#dict.fromkeys drops repeated keywords (keeping their order) so no search is run twice into the same file
input_keywords = [f'\"{k}\" -is:retweet' for k in dict.fromkeys(keywords)]
#input time periods for search as a comma separated list, searches will be conducted for each time pair e.g. first of week/month, last of week/month 
start = ['2022-05-01T00:00:00.000Z', '2022-05-02T00:00:00.000Z', '2022-05-03T00:00:00.000Z', '2022-05-04T00:00:00.000Z', '2022-05-05T00:00:00.000Z', '2022-05-06T00:00:00.000Z', '2022-05-07T00:00:00.000Z']
end = ['2022-05-01T23:59:59.000Z', '2022-05-02T23:59:59.000Z', '2022-05-03T23:59:59.000Z', '2022-05-04T23:59:59.000Z', '2022-05-05T23:59:59.000Z', '2022-05-06T23:59:59.000Z', '2022-05-07T23:59:59.000Z']


#input max results / counts, and path 
mresults = 500 # max tweets per json response (100-500)
mcount = 500 # max tweets per search period 
#run the search! 
for k in input_keywords:
    # file name = keyword without punctuation + first start date + '_' + characters 6-9 of the last end date (e.g. '5-07')
    filename = re.sub(r"([^A-Za-z0-9]+)", '', k) + f'_{start[0][0:10]}' + f'_{end[-1][6:10]}'
    filename = re.sub(r"isretweet", '', filename)
    # prediction_twitter is overwritten on every pass; only the last keyword's result stays bound
    prediction_twitter = LogRegSearches.search_twitter(token, k, start, end, mresults, mcount, filename)

#today = date.today()
#week_ago = today - timedelta(days=7)
#start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
#end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]

## Classify tweets

In [22]:
# Load every raw search pickle whose file name ends with the given date tag into a single dataframe.
# NOTE(review): this calls LogRegSearches.tweets_to_classify, not the function of the same name defined in this notebook.
classified_tweets = LogRegSearches.tweets_to_classify(path, '2022-01-01_3-31.pkl')

Total tweets to classify: 2466


In [28]:
# Run classification on the 'tweet' column and get the links from the results.
# NOTE(review): the model loaded here is 'twitter_v1_10maxfeats', while the training cell
# saves 'twitter_v1_150maxfeats' - confirm the intended model file.
predicted_tweets, twitter_link_list = LogRegSearches.predict_twitter(path, 'twitter_v1_10maxfeats', classified_tweets, 'tweet', 1, '')

Total tweets classified: 611


In [10]:
# Display the classified tweets as the cell output
predicted_tweets

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,tweet date
0,https://t.co/OkEHI6XQu3 Music archive Wolfgang...,1,14.077947,0.999999,139,http://www.opusip.co.uk/2022/01/25/music-archi...,"""music archive"" -is:retweet",2022-01-26
1,IUMA (Internet Underground Music Archive) Coll...,1,13.847835,0.999999,76,https://ift.tt/33IiRug,"""music archive"" -is:retweet",2022-01-24
2,IUMA (Internet Underground Music Archive) Coll...,1,13.847835,0.999999,80,https://ift.tt/3Asqwc9,"""music archive"" -is:retweet",2022-01-24
3,Music archive Wolfgang's Vault resolves copyri...,1,13.728025,0.999999,113,https://www.reuters.com/legal/transactional/mu...,"""music archive"" -is:retweet",2022-01-25
4,Music archive Wolfgang's Vault resolves copyri...,1,13.728025,0.999999,103,https://www.reuters.com/legal/transactional/mu...,"""music archive"" -is:retweet",2022-01-25
...,...,...,...,...,...,...,...,...
603,Wreckers of Civilization : Marcus Werner Hed a...,1,0.039432,0.509857,281,http://dlvr.it/SKmCvD,"""oral history"" -is:retweet",2022-02-27
604,"is reading ""https://t.co/s35k9d1vtn"" https://...",1,0.034007,0.508501,61,"https://ift.tt/GneE8yI, https://ift.tt/jaWQXrt","""oral history"" -is:retweet",2022-02-27
605,Our special ProsweetsCologne edition of Intern...,1,0.028407,0.507101,242,https://in-confectionery.com/january-2022-sing...,"""digital edition"" -is:retweet",2022-01-30
606,How annoying Kim. Check out https://t.co/pHXp...,1,0.018098,0.504524,119,https://www.bt.com/help/tv/fix-a-problem/recor...,"""sound recording"" -is:retweet",2022-03-29


## Scrape URLs

In [11]:
# Scrape the list of URLs extracted from the predicted tweets and return a
# dataframe to be used for resource classification.
# NOTE(review): the third argument is presumably the name under which the
# scrape results are saved - confirm in LogRegSearches.scrape_links.
# NOTE(review): judging by the recorded output, each URL is fetched and
# summarized in turn; expect this cell to be slow and to emit summarizer
# max_length warnings for short pages.
scraped_links = LogRegSearches.scrape_links(twitter_link_list, predicted_tweets, 'all_keywords_2022-01-01_03-31_scrapes_maxfeats100')

1 http://www.opusip.co.uk/2022/01/25/music-archive-wolfgangs-vault-resolves-copyright-fight-over-concert-recordings-2022-01-25/


Your max_length is set to 120, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


2 https://ift.tt/33IiRug


Your max_length is set to 120, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


3 https://ift.tt/3Asqwc9
4 https://www.reuters.com/legal/transactional/music-archive-wolfgangs-vault-resolves-copyright-fight-over-concert-recordings-2022-01-25/?utm_source=dlvr.it&utm_medium=twitter
5 https://www.reuters.com/legal/transactional/music-archive-wolfgangs-vault-resolves-copyright-fight-over-concert-recordings-2022-01-25/
6 https://archive.org/details/iuma-archive
7 https://pointblankdigital.shop/british-library-sound-archive-to-archive-our-music-catalogue/


Token indices sequence length is longer than the specified maximum sequence length for this model (1096 > 1024). Running this sequence through the model will result in indexing errors


8 https://orionlibrary.org/idigorion/
9 https://shop.line.me/@156nhver/collection/51040
10 https://www.dlsite.com/home/dlaf/=/t/s/link/work/aid/dlsiteyuki21/id/RJ280204.html
11 https://archive.org/details/stgigaarchive
12 http://fma.org


Your max_length is set to 120, but you input_length is only 69. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


13 https://sound-effects.bbcrewind.co.uk/
14 https://apps.gtarcade.com/v1/#/post/53386?anchor=comment
15 https://apps.gtarcade.com/v1/#/post/53386?anchor=


Your max_length is set to 120, but you input_length is only 61. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


16 https://nursinggrademiners.com/sampleessay/music-research-paper/
17 https://www.votefeed.com/video/play/31036/beautiful-relaxing-and-healing-music-xiao-and-bamboo-flute-instrumental-mus/
18 https://www.raymondjamesconnect.com/tWrw38
19 https://pastdaily.com/2022/01/30/the-pascal-quartet-play-music-of-guy-ropartz-1950-past-daily-weekend-gramophone/
20 https://m.neilyoungarchives.com/account/plans
21 https://deephousenews.com/2017/10/23/crackazat-called-my-name-original-mix/
22 https://www.saada.org/tides/article/i-am-my-own-savior
23 https://globalvacancies.org/job/phd-studentships-digital-archive-of-the-middle-east-dame-institute-of-arab-and-islamic-studies-2/?feed_id=5812&_unique_id=6218ec58d5802
24 https://www.boyculture.com/boy_culture/2022/02/levar-burton-gay-eric-adams-kill-gays-uganda-music-archive-fire-island-shirtless.html
25 https://pastdaily.com/2021/04/25/jeanne-gautier-and-yvonne-lefebure-play-music-of-ravel-1953-past-daily-weekend-gramophone/


Your max_length is set to 120, but you input_length is only 35. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


26 https://buff.ly/3uH3N9W


Your max_length is set to 120, but you input_length is only 92. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


27 https://www.dancemusicarchive.com/dmamaster/Oliver-Heldens-live-%40-Ultra-Music-Festival-Miami-2015
28 https://www.pond5.com/item/170386811


Your max_length is set to 120, but you input_length is only 32. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


29 https://tickets.nfsa.gov.au/Events/Space-Faring-Civilisation-Film-Festival?fbclid=IwAR1cypZsK2UJ8hKCt4cANnW3g-RrsnGaXkMt_onUHMNvjwK1E-VwsegjgTU
30 https://www.jbhe.com/2022/03/san-diego-state-university-decides-not-to-accept-the-donation-of-a-black-music-archive/
31 https://music.tagirijus.de
32 https://www.mdmarchive.co.uk/exhibition/716/andrew-weatherall-a-herbal-tea-party-celebration?fbclid=IwAR26dCtyAfUO1jmjhRvMXGIieJkMTNdUYJHAJ3YDWH4th957Wiy0gYoVOvM
33 http://www.noise11.com/news/national-film-and-sound-archive-of-australia-restore-the-original-helen-reddy-i-am-woman-video-20220322
34 http://ballsackradio.com


Your max_length is set to 120, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


35 https://www.nfsa.gov.au/collection/curated/muriels-wedding
36 https://t1p.de/w6jn
37 https://livemusicarchive.app/music/artists/moe/recordings/moe2013-03-23.late
38 https://www.forbes.com/sites/evaamsen/2022/01/31/you-use-the-language-area-of-your-brain-when-listening-to-music/


Your max_length is set to 120, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


39 https://pastdaily.com/2022/03/20/dexter-gordon-live-in-berlin-1967-past-daily-downbeat/
40 https://pastdaily.com/2018/09/26/luciano-berio-with-the-swingle-singers-and-the-boston-symphony-play-music-of-boccherini-bach-and-berio-1982-past-daily-mid-week-concert/
41 https://www.nfsa.gov.au/about/what-we-collect/oral-history
42 https://pastdaily.com/2021/08/25/rudolf-serkin-with-edo-dewaart-and-the-san-francisco-symphony-gala-opening-of-louise-m-davies-hall-1980-past-daily-mid-week-concert/


Your max_length is set to 120, but you input_length is only 119. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)


43 https://bit.ly/3IKMqK7
44 https://talesfromthebraziersgrotto.wordpress.com/2022/01/28/music-archive-part-xiv-hide-with-spread-beaver-rocket-dive-universal-victor-1998/
45 https://archive.org/details/mwalk06
46 https://music.sleevenote.com/@ade
47 https://teia.art/kramer
48 https://buff.ly/3hMbFAQ
49 https://newindia24news.com/new-digital-archive-promotes-south-indian-visual-artists-and-their-works/
50 https://talesfromthebraziersgrotto.wordpress.com/2022/03/24/music-archive-part-xvi-hide-tell-me-mca-victor-1994/
51 https://www.musiceternal.com/News/2022/Florida-Sound-Archive-With-Sam-Rosenthal-20220126
52 https://creativecommons.org/licenses/by-nc-sa/4.0/
53 https://www.noise11.com/news/national-film-and-sound-archive-of-australia-restore-the-original-helen-reddy-i-am-woman-video-20220322
54 https://ift.tt/W1H3LYZ
55 https://en.wikipedia.org/wiki/FabricLive.07
56 https://www.identificationofmusic.com/iom-news/2022/2/16/mumdance-releases-extensive-music-archive
57 https://australianm

Your max_length is set to 120, but you input_length is only 116. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


75 https://theconversation.com/digital-sound-archives-can-bring-extinct-birds-briefly-back-to-life-176115


Your max_length is set to 120, but you input_length is only 68. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


76 https://contact-keith.wixsite.com/celtic-cd-releases?pgid=kc8wzmjt-35d07cbc-3355-4ba5-a982-b8cb0a69fe63


Your max_length is set to 120, but you input_length is only 69. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


77 https://midtermessays.com/image-storage-requirements-can-be-reduced-with-____-compression-techniques-bit-map-c-midi-file-b-additive-file-d/
78 https://bit.ly/35lYyni
79 https://www.nme.com/news/music/neil-young-14-1226510
80 https://bit.ly/3IK8eFM
81 http://rver.se/fanresearch2022
82 https://www.marcrhayes.com/post/a-summary-of-ofsted-s-music-report-for-teachers-and-leaders
83 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C1857835
84 https://livemusicarchive.app/music/artists/BobWeir/recordings/bw2022-03-24.peluso.flac24
85 https://www.nfsa.gov.au/collection/curated


Your max_length is set to 120, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


86 https://beatfoundry.xyz
87 https://catholicarchivesociety.org/2022/01/22/the-leonard-cheshire-historic-sound-archive-%ef%bf%bc/
88 http://bit.ly/362rXms
89 https://playbill.com/article/watch-and-listen-an-evolving-archive-at-the-chamber-music-society
90 https://issuu.com/odouglasj?issuu_product=header&issuu_context=link&issuu_cta=profile
91 https://natlib.govt.nz/blog/posts/download-now-free


Your max_length is set to 120, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


92 https://music-for-video.com/royaltyfreemusic/easy-lounge
93 https://www.mdmarchive.co.uk/exhibitions.php
94 https://www.hollylist.com/jobs/m5thu6C5gah7Eb8tK


Your max_length is set to 120, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


95 http://j.mp/3b4aGay
96 https://www.fourcornersbooks.co.uk/articles/the-library-of-bootlegs
97 https://www.york.ac.uk/study/international/fees-funding/be-exceptional-scholarships/
98 https://news.sunsetmusicsupervision.com/2022/03/29/dynamyte-come-thru-indiepulse-music-magazine/
99 https://news.jamfestradio.com/2022/03/29/hot-half-dozen-3-29-22-indiepulse-music-magazine/
100 https://imgur.com/a/fbmLz6k/
101 https://www.dancemusicarchive.com/radioshow


Your max_length is set to 120, but you input_length is only 114. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


102 https://archives.library.wales/index.php/y-glomen
103 https://www.exeter.ac.uk/study/funding/award/?id=4432
104 https://www.thebbrm.org/
105 https://www.thenational.wales/news/20030535.cardigan-castle---castell-aberteifi-opening-new-research-facility-april/
106 https://objkt.com/collection/KT1N1zPreiu3ANMizC44tkfcKdnsjop67qts
107 http://www.walterscott.lib.ed.ac.uk/home.html


Your max_length is set to 120, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


108 https://www.ism.org/my-ism/your-discounts/grove


Your max_length is set to 120, but you input_length is only 102. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


109 https://www.sd.net/blogs/archive/03302022-sd-board-of-regents/


Your max_length is set to 120, but you input_length is only 78. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


110 http://bit.ly/2VW04qo


Your max_length is set to 120, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


111 https://goodiespodcast.libsyn.com/166-sally-wilton-on-bill-oddies-music
112 https://exacted.me/SlightlyFoxedForInstitutions


Your max_length is set to 120, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


113 http://www.foundsoundcycling.co.uk/
114 https://tinyurl.com/6h6hytdb
115 https://www.nfsa.gov.au/
116 https://ocean-archive.org/story/silent-whale-letters
117 https://www.ed.ac.uk/edinburgh-college-art/reid-school-music/research-seminars/music/yvonne-liao
118 https://www.whizz-kidz.org.uk/charity/30-years-30-stories/miro-story
119 https://bit.ly/3BSFWHB


Your max_length is set to 120, but you input_length is only 91. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


120 https://bit.ly/3o7RIYS
121 http://KISSFMUK.COM


Your max_length is set to 120, but you input_length is only 81. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


122 https://bit.ly/APFTPep6
123 https://tabsoft.co/36udGPN
124 https://www.ed.ac.uk/edinburgh-college-art/reid-school-music/research-seminars/music/fabrice-fitch
125 https://www.nfsa.gov.au/collection/curated/waltzing-matilda-0
126 https://bit.ly/3qsBkDn
127 https://losscaptureproject.cargo.site/More-than-a-Melody-Reimagining-the-Sounds-of-Blackness
128 https://rainwatertornado.cloud/collections/pony-music-archive/
129 http://www.bristol.ac.uk/theatre-collection/caring-for-your-theatre--live-art-records/caring-for-your-own-records/


Your max_length is set to 120, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


130 https://www.armenianmuseum.org/sound-archive-2021-year-in-review
131 https://bit.ly/3p7Whmx
132 https://objkt.com/asset/KT1N1zPreiu3ANMizC44tkfcKdnsjop67qts/8, https://objkt.com/asset/KT1N1zPreiu3ANMizC44tkfcKdnsjop67qts/9
133 https://www.globeboss.com/new-digital-archive-reclaims-lost-jewish-history-israel-hayom/?feed_id=14242&_unique_id=61f5f268ea0ae
134 https://reut.rs/3KHTGJ3
135 http://conference.measureofmusic.com
136 http://DaytonMetroLibrary.org/research/databases/history
137 https://aec-music.eu/news-article/experimentation-and-beyond-in-music-research-forum-porto-portugal-15-16-april-2022/
138 https://bit.ly/NinthAnnualRobertKellyMemorialLecture
139 https://www.rollingstone.co.uk/music/british-library-to-track-600-years-of-black-british-music-for-new-exhibition-9690/
140 https://exbulletin.com/world/health/1524410/


Your max_length is set to 120, but you input_length is only 112. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


141 https://www.intotheoutside.org.uk/


Your max_length is set to 120, but you input_length is only 81. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


142 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2625353
143 https://bit.ly/341EHcg
144 https://www.loc.gov/programs/national-recording-preservation-plan/about-this-program/radio-preservation-task-force/current-projects/sound-submissions/
145 http://web.archive.org
146 http://suzywildeshow.com
147 http://blackfilmarchive.com


Your max_length is set to 120, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


148 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2925059
149 https://rockcritics.com/2013/03/29/from-the-archives-paul-williams-2001/
150 https://blog.nls.uk/the-oscars-come-to-the-national-library/


Your max_length is set to 120, but you input_length is only 60. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


151 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2724724
152 https://chroniclingamerica.loc.gov/


Your max_length is set to 120, but you input_length is only 68. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


153 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C3468000


Your max_length is set to 120, but you input_length is only 83. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


154 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2706837
155 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2681008


Your max_length is set to 120, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


156 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2629312


Your max_length is set to 120, but you input_length is only 36. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


157 https://www.mymagazinesub.co.uk/he-naturist/
158 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2503999


Your max_length is set to 120, but you input_length is only 73. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


159 https://bit.ly/3KSkk16
160 http://livestreamingmusic.uk


Your max_length is set to 120, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


161 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2518943
162 https://hiphopminute.com/rapper-tyler-pauley-is-a-genz-powerhouse-shifting-music-culture/
163 https://24hip-hop.com/rapper-tyler-pauley-is-a-genz-powerhouse-shifting-music-culture/
164 https://www.gov.uk/government/publications/research-review-series-pe/research-review-series-pe


Your max_length is set to 120, but you input_length is only 72. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


165 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C3034053
166 https://bit.ly/3oPyZBt


Your max_length is set to 120, but you input_length is only 69. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


167 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2965270
168 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2745085
169 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C1824078


Your max_length is set to 120, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


170 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2752777
171 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2625892


Your max_length is set to 120, but you input_length is only 74. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


172 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2717753


Your max_length is set to 120, but you input_length is only 74. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


173 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2691690
174 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2664729


Your max_length is set to 120, but you input_length is only 72. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


175 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2756383
176 https://www.strippedfm.com/post/festival-season
177 http://digitalarchive.mcmaster.ca/islandora/object/macrepo%3A9839
178 https://www.ias.surrey.ac.uk/fellows/
179 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2448811
180 http://dla.acaweb.org/cdm/ref/collection/berea/id/2972
181 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2663810
182 https://patents.google.com/patent/USD147314
183 https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-march-2022
184 https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-february-2022
185 https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-january-2022
186 https://pocketmags.com/eu/computer-music-magazine


Your max_length is set to 120, but you input_length is only 100. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


187 https://www.bbc.co.uk/sounds/series/m0014cg0
188 https://tabsoft.co/3v8lbGt
189 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2608391
190 https://bit.ly/3I5n75x
191 https://www.whizz-kidz.org.uk/charity/30-years-30-stories/rebecca-farren-story
192 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2725059
193 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2718073


Your max_length is set to 120, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


194 https://bit.ly/3sgVJN6
195 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2782985


Your max_length is set to 120, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


196 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2720486
197 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2575390


Your max_length is set to 120, but you input_length is only 78. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


198 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2757581
199 http://hermitage-crabapple.amebaownd.com/pages/811095/page_201701220951
200 https://bit.ly/3qQatkR


Your max_length is set to 120, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


201 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2573259
202 http://millionsongdataset.com
203 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2667075
204 https://ibccdigitalarchive.lincoln.ac.uk/omeka/
205 https://bit.ly/3Dl0GrU
206 https://bit.ly/3hzgVaY


Your max_length is set to 120, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


207 https://www.academia.edu/29529055/Mari_Archive_2014_
208 https://www.artseverywhere.ca/the-drop/
209 https://bimbotech.notion.site/pink-template-d40fb130e17e44eab2e8f1239a5144c1
210 https://blog.richmond.edu/parsons/2022/03/arachnophonia-la-flor-de-la-canela-by-chabuca-granda/


Your max_length is set to 120, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


211 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2693255


Your max_length is set to 120, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


212 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2653386
213 https://bit.ly/3pHEy4f
214 https://ift.tt/F8dVZWG


Your max_length is set to 120, but you input_length is only 69. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


215 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2045199
216 https://www.the-tls.co.uk/articles/sylvia-townsend-warner-book-review-janet-montefiore/
217 http://arxiv.org/abs/2201.08448v1


Your max_length is set to 120, but you input_length is only 81. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


218 https://cincinnatilibrary.bibliocommons.com/v2/record/S170C2825033


Your max_length is set to 120, but you input_length is only 81. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


219 https://www.bootsfordancing.com/copy-of-home-2


Your max_length is set to 120, but you input_length is only 109. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


220 https://www.classicaltrombone.com/digital-downloads/p/ukrainian-national-anthem-
221 https://news.jamfestradio.com/2022/03/29/how-to-create-music-at-home-with-an-audio-interface%ef%bf%bc-indiepulse-music-magazine/
222 https://m.blog.naver.com/lovevante_7909/222658887118
223 https://bit.ly/3GTZxsC
224 https://indieshark.com/music-reviews/toronto-tabla-ensemble-for-the-love-of-tabla-lp/
225 http://dla.acaweb.org/cdm/ref/collection/berea/id/4972
226 http://dla.acaweb.org/cdm/ref/collection/berea/id/3946
227 http://dla.acaweb.org/cdm/ref/collection/berea/id/4492
228 https://sounds.bl.uk/Environment/Weather


Your max_length is set to 120, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


229 http://s.einnews.com/FrkoJjQAlR


Your max_length is set to 120, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


230 https://ift.tt/6LSTgJv
231 https://bit.ly/3lqjYoa
232 https://www.digitaladvicezone.com/top-5-microphones
233 https://www.newsletter.co.uk/heritage-and-retro/heritage/archive-footage-of-bygone-days-in-northern-ireland-given-new-lease-of-life-by-musicians-3631862
234 https://ift.tt/Q40nIOW
235 https://www.ilovefreegle.org/message/87289922?src=twitter


Your max_length is set to 120, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


236 https://buff.ly/3LsvHxd
237 https://www.jns.org/a-lost-jewish-history-reclaimed-new-digital-archive-that-reveals-pre-holocaust-world-of-eastern-european-jewry/
238 https://bit.ly/2TYZDcn


Your max_length is set to 120, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


239 https://news.prairiepublic.org/show/prairie-public-presents/2022-02-25/february-27-2022-mavis-staples-soul-a-celebration
240 https://issuu.com/indiepulsemusicmagazine/docs/indiepulse_febuary_issue_8_2022_final


Your max_length is set to 120, but you input_length is only 110. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


241 https://buff.ly/32iq8gg, https://buff.ly/2VuvncS
242 https://radioexplorations.ch/
243 https://lovelive-sif-global.bushimo.jp/9thproject/codegeass/
244 https://digital.library.temple.edu/digital/collection/p15037coll3/id/56729/
245 https://digital.library.temple.edu/digital/collection/p15037coll3/id/62290/
246 https://digital.library.temple.edu/digital/collection/p15037coll3/id/56735/
247 https://digital.library.temple.edu/digital/collection/p15037coll3/id/56732/
248 https://digital.library.temple.edu/digital/collection/p15037coll3/id/62445/
249 http://noisehype.com/submit
250 http://dla.acaweb.org/cdm/ref/collection/berea/id/4097
251 http://WestCoastStyles.com
252 https://www.mic.lt/en/news/2022/02/08/advance-beyond-sound-yiorgis-sakellariou/
253 https://bit.ly/36tu4zS
254 https://open.qobuz.com/playlist/8189647
255 https://reut.rs/3rTJWTx
256 http://fluxus.lib.uiowa.edu/home.html
257 http://dlvr.it/SMdppq
258 https://podcasterwarehouse.com/product/neewer-nw-13-compact-microphone-

Your max_length is set to 120, but you input_length is only 107. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


264 https://ntia.co.uk/find-your-mp-postponement-of-vat-business-rates-national-insurance-increases/
265 https://bit.ly/3I5rs8t


Your max_length is set to 120, but you input_length is only 49. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


266 https://bit.ly/3cZkyoT
267 https://www.jwpepper.com/sheet-music/media-player.jsp?&type=audio&productID=10024362
268 https://www.fairobserver.com/culture/peter-isackson-music-culture-arts-news-popular-music-38922/
269 https://blogs.loc.gov/music/2022/02/lift-every-voice-and-sing/
270 https://relayx.com/market/77ae0dbe6b18157226aa75fc290a67f0630008bbb2e4a6ebf470a54897e5f28e_o1
271 https://oc.lc/3tRjcVN
272 https://bit.ly/3tEyFso
273 https://bit.ly/3NledUX
274 https://consortiumnews.com/2015/01/06/nyt-still-pretends-no-coup-in-ukraine/
275 https://www.mymusicsheet.com/sheet-music/57640
276 https://grangerartondemand.com/featured/2-sheet-music-1917-granger.html
277 https://grangerartondemand.com/featured/sheet-music-1917-granger.html
278 https://libbyapp.com/search/southtyneside/search/query-mojo/page-1/8840798
279 http://bit.ly/historymakerscontest
280 https://freegbedu.ng/mp3-download/mp3-nicki-minaj-do-we-have-a-problem-ft-lil-baby/
281 https://www.msn.com/en-us/music/news/why-bruce

Your max_length is set to 120, but you input_length is only 57. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)


286 http://fppld.org/digital-library
287 https://www.chamber-music.org/mag/2022/winter/index.html
288 https://www.spreaker.com/show/simone-nicole
289 https://music-for-video.com


Your max_length is set to 120, but you input_length is only 41. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)


290 https://www.nbcnews.com/news/us-news/rolling-stone-magazine-settles-rape-story-lawsuit-1-65-million-n772006
291 https://bit.ly/3Jwm27z
292 https://www-xsnoize-com.cdn.ampproject.org/v/s/www.xsnoize.com/album-review-tears-for-fears-the-tipping-point/?amp_js_v=a6&amp_gsa=1&amp&usqp=mq331AQKKAFQArABIIACAw%3D%3D#aoh=16459126957862&referrer=https%3A%2F%2Fwww.google.com&amp_tf=De%20%251%24s&ampshare=https%3A%2F%2Fwww.xsnoize.com%2Falbum-review-tears-for-fears-the-tipping-point%2F
293 https://www.emic.ee/?sisu=uudis_edasi&mid=27&lang=est&id=8063&uudis=1


Your max_length is set to 120, but you input_length is only 41. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)


294 https://news.jamfestradio.com/2022/03/29/celebrated-soul-pop-artist-laura-cheadle-reveals-details-for-her-sultry-new-single-the-lust-in-between-indiepulse-music-magazine/


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


295 https://www.prsformusic.com/-/media/files/prs-for-music/research/economic-insight-18-a-songwriters-perspective-on-six-music.ashx?la=en&hash=29BCB469126CE7D513F6914207008F875B4F7B84
296 https://www.namm.org/library/oral-history/garth-hudson
297 https://news.jamfestradio.com/2022/03/29/vessel-of-light-release-foxy-lady-video-indiepulse-music-magazine/
298 https://nckids.overdrive.com/
299 http://lovesedgemoor.co.uk
300 https://bbc.in/33xHdqv
301 https://www.icpl.org/books-more/digital-library
302 https://images.wur.nl/digital/collection/coll13
303 https://mainlynorfolk.info/folk/records/harryupton.html
304 https://neeedl.net/listing/brixton/
305 http://www.nfda.org/digitaldirector
306 https://www.atlasobscura.com/articles/first-black-music-magazine
307 https://buff.ly/3hiZaMU
308 https://buff.ly/3JGBxKP
309 https://www.northyorks.gov.uk/download-e-books-e-magazines-and-digital-audiobooks
310 https://sddigitalarchives.contentdm.oclc.org/digital/collection/p15914coll5
311 https://bit.l

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


327 https://www.prsformusic.com/-/media/files/prs-for-music/research/economic-insight-11-dec.ashx


Your max_length is set to 120, but you input_length is only 78. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


328 http://wpl.ca/digital-library
329 https://www.birmingham2022.com/news/2493340/ceremony-pop-comes-to-campus
330 http://blog.archive.org/2022/01/27/virtual-gathering-welcomes-creative-works-from-1926-into-the-public-domain/
331 https://www.catchermedia.co.uk/resources/
332 https://hermitage-crabapple.amebaownd.com/pages/5181126/page_202108140852
333 https://bit.ly/3u8CDu1
334 https://www.lrb.co.uk/the-paper/v44/n07/florence-sutcliffe-braithwaite/bring-out-the-lemonade
335 https://www.twinkl.co.uk/l/gatw9
336 https://okt.to/p9C5sz


Your max_length is set to 120, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


337 https://www.creatokia.com/en/s/the-poets
338 https://www.listia.com/r2tjq0t/DJNMVB
339 https://scroll.in/magazine/1008254/after-an-unbroken-run-of-87-years-indias-oldest-classical-music-magazine-is-facing-a-bleak-future
340 https://www.rollingstone.co.uk/politics/features/can-labour-win-again-with-young-people-disillusioned-13129/?fbclid=IwAR1lMroek6FASVE5bKx56NY_TZjOMLAfWdcGtSmrgnK2ExKHKdTPKnPY4RI
341 http://splibrary.ca/downloads
342 https://bit.ly/3o4c3OM
343 https://slate.com/news-and-politics/2022/01/slate-news-quiz-supreme-court-joe-biden-jeopardy.html?utm_medium=social&utm_campaign=traffic&utm_source=article&utm_content=twitter_share
344 https://linktr.ee/coreydrumz4real
345 https://www.cbc.ca/sportslongform/entry/cbc-sports-oral-histories-canadian-world-cup-qualification


Your max_length is set to 120, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


346 https://www.isasa.org/vacancies/listing/junior-art-music-information-technology-teacher
347 https://www.boosey.com/teaching/sheet-music/Stephen-Hatfield-When-It-Was-Yet-Dark-SSA/4352
348 https://bit.ly/3kd1Q0Q
349 https://www.umusicpub.com/au/News/2013/Aug/Peter-Allen_National-Film-and-Sound-Archive-of-Australia.aspx
350 https://bit.ly/3rMbNVH
351 https://cstu.io/a21d44
352 https://bit.ly/35URZ7O
353 https://www.menofworth.org.uk/archives/digital-archive/conscientious-objectors/
354 https://bit.ly/2QhQe0A
355 http://bit.ly/sheetmusicnecklaces
356 https://im-musicmagazine.com/f/killswitch-engage-february-10-2022-raleigh-north-carolina
357 https://bit.ly/3IRPbtq
358 https://en.rwp.agency/news/953/
359 https://bit.ly/3CnqW4F
360 https://www.calderdale.gov.uk/v2/residents/leisure-and-culture/libraries/digital-library/ebooks-and-eaudio-books
361 https://kaleidoscope.boutique/talking-hamster-plush-toy/
362 https://fb.watch/aQP3cUsQ5r/
363 https://www.klkntv.com/lincoln-libraries-offering

Your max_length is set to 120, but you input_length is only 11. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


364 https://bit.ly/37OaJKJ


Your max_length is set to 120, but you input_length is only 90. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)


365 https://archive.org/details/copyrightrecords
366 https://mtlnk.net/j_%253A%252F%252Fpiano-sheetmusic.net%252Farchives%252F1291


Your max_length is set to 120, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


367 https://snooper-scope.in/the-rolling-stones-return-to-bst-hyde-park-to-celebrate-60th-anniversary-music-news/
368 http://dlvr.it/SMgRxM
369 http://dlvr.it/SMgPfP


Your max_length is set to 120, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


370 http://myzakim.dana-farber.org
371 https://snooper-scope.in/doja-cat-apologises-for-tweets-after-cancelled-paraguay-show-music-news/
372 https://couponmatrix.uk/l/4sg7


Your max_length is set to 120, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


373 https://www.edufex.com/register
374 https://www.needsomefun.net/list-of-25-books-written-about-bob-dylan/
375 https://siiasi.org/digital-archive/shaykh-muhammad-shareef/tarikh-as-sudan/
376 http://officemagazine.net/neue-projects-x-bustani-digital-archive
377 https://bit.ly/3K4Xi6V
378 https://objkt.com/asset/KT1DLwifQZbaV9xZcyukt3L8PN8iq14AG5a2/4
379 https://www.bitchmedia.org/article/no-punk-music-without-black-women?utm_source=pocket_mylist
380 https://buff.ly/3iNOcQl
381 https://bit.ly/ou-ua-crum-kahler
382 http://www.edufex.com
383 https://indianamemory.contentdm.oclc.org/digital/collection/Maennerchor/id/4226/rec/1
384 https://ethw.org/Oral-History:Elsie_Shutt
385 https://touchgenz.com/for-ps5-vertical-cooling-fan-stand-digital-edition-with-14-game-slot-3-hub-port-dual-controller-charging-station-charger-for-ps5/#gamergirl, https://touchgenz.com/for-ps5-vertical-cooling-fan-stand-digital-edition-with-14-game-slot-3-hub-port-dual-controller-charging-station-charger-for-ps5/
38

Your max_length is set to 120, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


393 https://www.wired.com/story/apple-infinite-loop-oral-history/
394 https://www.uaarchives.org/digital/collection/p4036coll10/id/329/rec/19
395 https://www.slideshare.net/caerolus/statistical-analysis-of-results-in-music-information-retrieval-why-and-how
396 https://faroutmagazine.co.uk/worst-rhyming-couplets-music-history-songwriting/
397 https://www.marshallfoundation.org/library/digital-archive/6-026-speech-princeton-university-february-22-1947/
398 https://bit.ly/36WzG5S
399 https://ew.com/movies/2017/07/19/fifth-element-oral-history-luc-besson/
400 http://dla.acaweb.org/cdm/ref/collection/berea/id/2423
401 http://RadioAshford.com


Your max_length is set to 120, but you input_length is only 45. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


402 https://www.android-setup.com/2022/03/playstation-5-digital-edition-restocked.html


Your max_length is set to 120, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


403 https://filmmakermagazine.com/113546-wreckers-of-civilization-marcus-werner-hed-and-dan-fox-on-their-doc-fortnight-premiering-other-like-me-the-oral-history-of-coum-transmissions-and-throbbing-gristle/#.YhvTVS-B1ZM
404 https://iaml-uk-irl.org/bryant-award


Your max_length is set to 120, but you input_length is only 55. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


405 https://openhandslibrary.weebly.com
406 https://www.mupress.org/Something-in-the-Water-A-History-of-Music-in-Macon-Georgia-1823-1980-P1160.aspx
407 https://on.cgiar.org/3G7J6aQ, https://on.cgiar.org/3gccOkg
408 http://ndl.iitkgp.ac.in
409 https://www.dpbolvw.net/click-8913463-13797872?url=https%3A%2F%2Fwww.fanatical.com%2Fen%2Fpick-and-mix%2Ffanatical-favorites-build-your-own-bundle
410 https://www.unwomen.org/en/digital-library/publications/2022/01/intersectionality-resource-guide-and-toolkit
411 http://owwl.org


Your max_length is set to 120, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


412 http://dlvr.it/SKm8lW
413 https://bit.ly/TennisScrapbook65
414 https://okt.to/TQ3ebH
415 https://buff.ly/3tsZccs
416 https://www.bmwcca.org/content/oral-history-project-announcement
417 https://mailchi.mp/f03ec20b9f7c/pre-pesach


Your max_length is set to 120, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


418 http://dlvr.it/SKmCvD


Your max_length is set to 120, but you input_length is only 23. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


419 https://www.bt.com/help/tv/fix-a-problem/recording-or-sound/recording-problems-on-bt-tv#:~:text=Here%20are%20some%20steps%20to%20help%20stop%20this%20from%20happening%20again%3A&text=Check%20your%20equipment%20cabling%20is,%27%20section%20of%20%27My%20TV%27
386


In [16]:
# Reload the previously saved scrape results from the SCRAPES folder so the
# classification step below can run without repeating the scraping cell.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source (presumably this file was written by this pipeline — confirm).
scraped_links = pd.read_pickle(path+'LOGREG_RELEVANCE/SCRAPES/all_keywords_2022-01-01_03-31_scrapes_v1.pkl')

In [17]:
# Sanity check: size of the loaded scrape results
# (presumably a DataFrame, in which case this is its row count — TODO confirm).
len(scraped_links) 

418

## Classify URLs

In [18]:
# Run the relevance classifier over the scraped links and keep the result for inspection.
# All arguments are positional; predict_resource's signature is not visible in this notebook.
# NOTE(review): presumably (parent folder, model name, input dataframe, text column to
# classify, predicted class / score filter, output filename) — confirm against
# PYTHON_FILES.LogReg_Searches before changing any of them.
predicted_resources = LogRegSearches.predict_resource(path, 'resources_v2', scraped_links, 'Description', 1, 'all_keywords_2022-01-01_03-31_v1')

In [19]:
# Display the classifier output; the rendered table includes Prediction, Score and
# Probability columns alongside the tweet, URL, Title and Description of each link.
predicted_resources

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,tweet date,Title,Description
0,Jamsphere Indie Music Magazine January 2022 - ...,1,4.181303,0.984951,195,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",2022-01-29,Jamsphere Indie Music Magazine January 2022 | ...,Jamsphere Indie Music Magazine January 2022 –...
1,Project: Music Recommendation  Million Song ...,1,3.689997,0.975636,319,http://millionsongdataset.com,"""song dataset"" -is:retweet",2022-03-29,Welcome! | Million Song Dataset,The Million Song Dataset is a free-available ...
2,2017 @ the British Library Sound Archive. Grat...,1,3.573228,0.972701,261,https://www.teresabarrozo.com/post/the-british...,"""sound archive"" -is:retweet",2022-01-21,@ the British Library Sound Achive,The Wild Life and Environmental Sounds Archiv...
3,Jamsphere Indie Music Magazine March 2022 - ht...,1,3.344960,0.965939,182,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",2022-03-30,Jamsphere Indie Music Magazine March 2022 | Ja...,Jamsphere Indie Music Magazine March 2022 – F...
4,Jamsphere Indie Music Magazine February 2022 -...,1,3.204508,0.961004,184,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",2022-02-26,Jamsphere Indie Music Magazine February 2022 |...,Jamsphere Indie Music Magazine February 2022 ...
...,...,...,...,...,...,...,...,...,...,...
164,Fire Island time machine: This recently discov...,1,0.080876,0.520208,291,https://bit.ly/3oPyZBt,"""music archive"" -is:retweet",2022-02-16,Fagulous Fire Island Music Archive + Eric Adam...,A huge archive of tapes discovered in a recen...
165,The Legend of Vox Machina Musical Score | Prim...,1,0.075942,0.518976,333,https://binge.place/amazon-prime/the-legend-of...,"""musical score"" -is:retweet",2022-03-28,Binge Place,The Legend of Vox Machina is more interested ...
166,The Legend of Vox Machina Musical Score | Prim...,1,0.075942,0.518976,353,http://dlvr.it/SMWwSw,"""musical score"" -is:retweet",2022-03-28,The Legend of Vox Machina Musical Score | Prim...,The Legend of Vox Machina is more interested ...
167,Turning on the tunes: 3 evidence-based benefit...,1,0.008282,0.502071,606,https://bit.ly/3rrgHZ5,"""music research"" -is:retweet",2022-02-25,Turning on the tunes: 3 evidence-based benefit...,"Music therapy, a safe and non-invasive comple..."
