# Pipeline

## Imports

In [1]:
import os
import csv , dateutil.parser , time
from datetime import date , timedelta 
# classifier
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline
#cleaning 
import emoji
import re

## Variables

In [2]:
# Prefix prepended to every data/model location used in this notebook
# (training sets, pickled models, raw searches, scrapes, predictions).
path = '../'

# Labelled training sets of resource descriptions (two versions), loaded from pickles.
# NOTE: pd.read_pickle executes pickle under the hood; only load files you trust.
archive_desc_training_v1 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v1.pkl')
archive_desc_training_v2 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

# Labelled training set of tweets, loaded from a pickle.
twitter_training_set_v1 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

# Keywords and site fragments passed as `discard` to twitter_predictions /
# resource_predictions, which drop rows whose URL (and, for resources, Title)
# contains any of them.
discard = ['youtu', '404', 'Not Found', 'bandcamp', 'ebay', 'It needs a human touch', 'Page not found', 'open.spotify.com', 'We\'re sorry...', 'Not Acceptable!', 'Access denied', '412 Error', 'goo.gl', 'instagr.am', 'soundcloud', 'apple.co', 'amzn', 'masterstillmusic', 'Facebook', 'facebook', 'sheetmusiclibrary.website', 'Unsupported browser', 'Last.fm', 'last.fm', 'amazon.com', 'tidal.com', 'tmblr.co', 'blogspot', 'dailymusicroll', 'PortalTaxiMusic', 'apple.news', 'yahoo.com', 'sheetmusicplus.com', 'musicnotes.com', 'musescore.com', 'etsy', 'nts.live', 'twitch.tv', 'YouTube', 'radiosparx.com', 'freemusicarchive.org', 'blastradio', 'opensea', 'mixcloud', 'catalog.works', 'nft', 'NFT', 'allmusic.com', 'foundation.app', 'Robot or human?', 'heardle', 'insession.agency', 'jobvite', 'career', 'docs.google.com/forms/', 'discogs.com', 'zora.co', 'play.google.com', 't.me', 'mintable.app', 'instagram', 'linkedin', 'forms.gle', 'vimeo', 'radioiita', 'spotify']


## Functions

In [3]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Train, pickle and report on a TF-IDF + logistic regression text classifier.

    The text column is vectorised with TF-IDF (unigrams and bigrams, at most
    1000 features), a ``LogisticRegressionCV`` model is fitted on it, and both
    the fitted model and the fitted vectorizer are pickled so that
    ``lr_predict`` can reload them later. A classification report based on
    cross-validated predictions is printed.

    Parameters
    ----------
    t_input:
        dataframe of the training set
    t_feature:
        df column, text of tweet or description of the resource
    target:
        df column, [0,1] values
    cv_int: int
        the number of cross validation folds
    score_type: str
        scoring name passed to LogisticRegressionCV (e.g. 'precision', 'recall', 'f1')
    filename: str
        model file name; the files written are
        ``{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl`` and
        ``{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl``
    path: str
        parent folder

    Returns
    -------
    The fitted LogisticRegressionCV model.
    """
    # TODO eda to define max_features=1000
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000)
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    model.fit(x_train, y_train)

    # export: context managers guarantee the pickles are flushed and closed
    export_model = f'LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    with open(path + export_model, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(path + export_vectorizer, 'wb') as vectorizer_file:
        pickle.dump(tfidf_transformer, vectorizer_file)

    # report
    # NOTE: the vectorizer was fitted on every row before cross-validation, so
    # the held-out folds have already influenced the vocabulary/IDF weights and
    # the figures printed here are slightly optimistic.
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.

    Loads the model and vectorizer written by ``lr_training`` and scores every
    row of ``p_input``.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name (the same value that was given to ``lr_training``)
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource

    Returns
    -------
    A copy of ``p_input`` with four extra columns: ``Prediction`` (predicted
    class), ``Score`` (decision function value), ``Probability`` (second column
    of ``predict_proba``) and ``Input Length`` (character count of the text).
    """
    export_model = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'

    # SECURITY: unpickling runs arbitrary code; only load model files that were
    # produced by this project.
    with open(export_model, 'rb') as model_file:
        model = pickle.load(model_file)
    with open(export_vectorizer, 'rb') as vectorizer_file:
        tfidf_transformer = pickle.load(vectorizer_file)

    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)

    # work on a copy so the caller's dataframe is left untouched
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:,1]
    result['Input Length'] = result[p_feature].str.len()
    return result


def create_url(keyword, start_date, end_date, max_results):
    """Build the endpoint URL and query parameters for a full-archive tweet search.

    Returns a ``(search_url, query_params)`` tuple. ``query_params['next_token']``
    starts out as an empty placeholder; ``connect_to_endpoint`` overwrites it
    before each request.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Change the parameters below to suit the endpoint you are using
    request_args = {}
    request_args['query'] = keyword
    request_args['start_time'] = start_date
    request_args['end_time'] = end_date
    request_args['max_results'] = max_results
    request_args['expansions'] = 'author_id,in_reply_to_user_id,geo.place_id'
    request_args['tweet.fields'] = 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities'
    request_args['user.fields'] = 'id,name,username,created_at,description,public_metrics,verified'
    request_args['place.fields'] = 'full_name,id,country,country_code,geo,name,place_type'
    request_args['next_token'] = {}
    return endpoint, request_args
    
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the search endpoint and return the decoded JSON.

    Parameters
    ----------
    url: str
        endpoint URL (first element returned by ``create_url``)
    headers: dict
        HTTP headers, including the bearer-token ``Authorization`` header
    params: dict
        query parameters (second element returned by ``create_url``);
        its ``'next_token'`` entry is overwritten in place
    next_token: str or None
        pagination token of the page to fetch; ``None`` requests the first page
    timeout: float
        seconds to wait for the server before giving up, so that a stalled
        connection cannot block the notebook indefinitely

    Raises
    ------
    Exception
        with ``(status_code, response_text)`` when the status code is not 200
    """
    # requests leaves parameters whose value is None out of the query string,
    # so a None token simply asks for the first page
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

# Strip emoji characters out of a tweet's text
def give_emoji_free_text(text):
    """Return ``text`` with every emoji replaced by an empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet of a search response to ``fileName``.

    Each row holds, in order: username, creation time, language, like count,
    quote count, reply count, retweet count, emoji-free tweet text, and the
    tweet's non-Twitter links joined with ``', '`` (an empty string when the
    tweet has none).

    Parameters
    ----------
    json_response: dict
        decoded search response; must contain ``'data'`` (the tweets) and
        ``'includes'['users']`` (the expanded author objects)
    fileName: str
        CSV file to append to; it is created if it does not exist
    """
    # setup usernames via includes
    username = {user['id']: user['username'] for user in json_response['includes']['users']}

    counter = 0

    # The context manager closes the file even if a malformed tweet raises
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in json_response['data']:
            # 1. Username
            user = username[tweet['author_id']]

            # 2. Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # 3. Language
            lang = tweet['lang']

            # 4. Tweet metrics
            metrics = tweet['public_metrics']
            retweet_count = metrics['retweet_count']
            reply_count = metrics['reply_count']
            like_count = metrics['like_count']
            quote_count = metrics['quote_count']

            # 5. URLs: every link of the tweet except links back to Twitter.
            # Computed afresh for each tweet so that a tweet with a missing or
            # empty 'urls' list can never inherit the previous tweet's links.
            expanded_urls = [link.get('expanded_url') for link in tweet.get('entities', {}).get('urls', [])]
            url = ', '.join(u for u in expanded_urls if u and 'twitter.com' not in u)

            # 6. Tweet text
            text = give_emoji_free_text(tweet['text'])

            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([user, created_at, lang, like_count, quote_count, reply_count, retweet_count, text, url])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

def twitter_search(token, keyword, start, end, mresults, mcount, file_name):
    """Run a paginated Twitter search per time period and save the cleaned result.

    Raw tweets are appended to
    ``{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv`` (``path`` is the
    notebook-level variable). The CSV is then reloaded, cleaned, filtered and
    pickled next to it as ``{file_name}.pkl``.

    Parameters
    ----------
    token: str
        Twitter API bearer token
    keyword: str
        search query
    start, end: list of str
        start and end timestamps of each search period; ``start[i]`` is paired
        with ``end[i]``
    mresults: int
        max tweets requested per API call
    mcount: int
        max tweets collected per search period (checked before each call)
    file_name: str
        base name (no extension) of the CSV and pickle files

    Returns
    -------
    The cleaned dataframe: English tweets only, with a non-empty and unique
    URL, mentions and '#' characters removed from the text, plus a
    ``Search KW`` column holding ``keyword``.
    """
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    headers = {"Authorization": "Bearer {}".format(token)}
    csv_path = f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv'
    total_tweets = 0

    # Create the file. The header is written only when the file is new or
    # empty, so re-running a search does not insert a second header row in the
    # middle of the data.
    with open(csv_path, "a", newline="", encoding='utf-8') as csvFile:
        if csvFile.tell() == 0:
            csv.writer(csvFile).writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'URL'])

    for i in range(len(start)):
        count = 0 # Counting tweets per time period
        next_token = None

        # Stop paging once the per-period maximum is reached
        while count < mcount:
            print("-------------------")
            print("Token: ", next_token)
            url = create_url(keyword, start[i], end[i], mresults)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            meta = json_response['meta']
            result_count = meta['result_count']

            # A missing (or null) token means this was the last page of the period
            next_token = meta.get('next_token')
            last_page = next_token is None
            if not last_page:
                print("Next Token: ", next_token)

            if result_count is not None and result_count > 0:
                if last_page:
                    print("-------------------")
                print("Start Date: ", start[i])
                append_to_csv(json_response, csv_path)
                count += result_count
                total_tweets += result_count
                print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                print("-------------------")
                time.sleep(5)

            # Pause between requests to stay clear of the API rate limit
            time.sleep(5)
            if last_page:
                break
    print("Total number of results:", total_tweets)

    df = pd.read_csv(csv_path, keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "URL": "string"})

    # clean the tweet from mentions and hashtag signs (emojis were already
    # removed when the rows were written). The column is reassigned rather than
    # mutated through `inplace=True`, which has no effect on the frame under
    # pandas Copy-on-Write.
    df['tweet'] = (
        df['tweet']
        .str.replace(r"@[A-Za-z0-9_]+", '', regex=True)
        .str.replace("#", '', regex=False)
    )

    # remove tweets that are not in english, have empty URLs, or have duplicate URLs
    df = df[df['lang'].isin(['en'])]
    df = df[df.URL != '']
    # .copy() gives an independent frame, so adding a column below is not a
    # write into a slice of the unfiltered data
    df = df.drop_duplicates(['URL'], keep='last').copy()

    #add a column for the search keyword
    df['Search KW'] = keyword

    #pickle df for reuse
    df.to_pickle(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.pkl')
    return df


def _chunk_article(article, max_chunk=500):
    """Split ``article`` into sentence-aligned chunks of roughly ``max_chunk`` words.

    Sentences end at '.', '?' or '!'. A sentence is added to the current chunk
    while the chunk stays within ``max_chunk`` space-separated words; otherwise
    it starts a new chunk. Returns the chunks as a list of strings.
    """
    marked = article.replace('.', '.<eos>').replace('?', '?<eos>').replace('!', '!<eos>')
    chunks = []
    for sentence in marked.split('<eos>'):
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]


def scrape_links(link_list):
    """Fetch every URL, extract its main text and summarise it.

    Only URLs whose HEAD response announces ``text/html`` and whose GET
    response is successful are processed. The page text is extracted with
    trafilatura (falling back to the page's <h1>/<p> text if that raises);
    texts longer than 200 characters are summarised with a distilbart
    summarisation pipeline, shorter ones are kept as they are. A URL whose
    summarisation fails is skipped.

    Parameters
    ----------
    link_list:
        iterable of URL strings

    Returns
    -------
    A dataframe with columns ``Title``, ``Description`` and ``URL``, without
    the rows whose description is missing or contains a marker of unusable
    content.
    """
    summarizer = pipeline("summarization", model='sshleifer/distilbart-cnn-12-6')
    records = []

    for link in link_list:
        URL = link
        page = None
        try:
            # cheap HEAD request first: only HTML pages are worth downloading
            head = requests.head(URL, timeout=15)
            content_type = head.headers.get("Content-Type", "None")
            if "text/html" in content_type.lower():
                page = requests.get(URL, timeout=15)
        except Exception:
            # best effort: unreachable or malformed links are simply skipped
            pass

        # a Response is falsy when its status code signals an error
        if not page:
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        head_tag = soup.find('head')
        if head_tag and soup.find('body') is not None:
            title = ' '.join([t.text for t in head_tag.find_all('title')]).strip()
        else:
            title = URL

        try:
            downloaded = trafilatura.fetch_url(URL)
            article = trafilatura.extract(downloaded, include_comments=False, include_tables=True, target_language='en', deduplicate=True)
        except Exception:
            article = ' '.join(result.text for result in soup.find_all(['h1', 'p']))

        if article is not None and len(article) > 200:
            # text summarisation
            try:
                res = summarizer(_chunk_article(article), max_length=59, min_length=30, do_sample=False)
                text = ' '.join([summ['summary_text'] for summ in res])
            except Exception:
                # summarisation failed: leave this URL out of the results
                continue
        else:
            text = article

        print(URL)
        records.append({'Title': title, 'Description': text, 'URL': URL.strip()})

    # build the frame once instead of concatenating one row per link
    links = pd.DataFrame(records, columns=['Title', 'Description', 'URL'])

    # markers of unusable descriptions ('None' also covers missing text after fillna)
    unusable = ['None', '! D O C T Y P E h t m l >', '! d o c t y p e h t m l >', '! D O C T Y P E H T M L >']
    links = links.fillna('None')
    links = links[~links.Description.str.contains('|'.join(unusable))]
    return links

def twitter_predictions(path, filename, p_input, p_feature, score, discard, filter):
    """ Predict relevant tweets using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: list of str
        terms to check against to remove tweets; a tweet is dropped when its
        URL contains any of them as a literal substring. An empty list
        discards nothing.
    filter: str
        a pattern the tweet text must contain (passed to ``str.contains``);
        an empty string disables this extra filter

    Returns
    -------
    Dataframe of the retained tweets, highest ``Score`` first, with columns
    tweet, Prediction, Score, Probability, Input Length, URL and Search KW.
    """
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['tweet'], keep='last')
    preds = preds.loc[preds['Prediction'] == score]
    if discard:
        # Escape each term: unescaped, '.' and '?' act as regex operators
        # (e.g. 't.me' would also match the "time" in "nytimes.com")
        discard_pattern = '|'.join(re.escape(term) for term in discard)
        preds = preds[~preds.URL.str.contains(discard_pattern, na=False)]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds = preds[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'URL', 'Search KW']]
    if filter != '':
        preds = preds[preds['tweet'].str.contains(filter, na=False)]
        preds = preds.reset_index(drop=True)
    return preds

def resource_predictions(path, filename, p_input, p_feature, score, discard, savefile):
    """ Predict relevant URL descriptions using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: list of str
        terms to check against to remove resources; a row is dropped when its
        URL or its Title contains any of them as a literal substring. An empty
        list discards nothing.
    savefile: str
        name for the final csv to be saved under
        (``{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv``)

    Returns
    -------
    Dataframe of the retained resources, highest ``Score`` first.
    """
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['Description'], keep='last')
    preds = preds.loc[preds['Prediction'] == score]
    if discard:
        # Escape each term: unescaped, '.' and '?' act as regex operators
        # (e.g. 't.me' would also match "time", 'human?' would match "huma")
        discard_pattern = '|'.join(re.escape(term) for term in discard)
        preds = preds[~preds.URL.str.contains(discard_pattern, na=False)]
        preds = preds[~preds.Title.str.contains(discard_pattern, na=False)]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds.to_csv(f'{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv')
    return preds


## Training twitter and descriptions classifiers

This is a ONE TIME operation. The models are pickled and loaded later to predict new results

In [None]:
# one time training on twitter
#twitter_training_model = lr_training(twitter_training_set_v1, 'tweet', 'Target', 10, 'precision', 'twitter', path)

# one time training on resources
#resource_training_model = lr_training(archive_desc_training_v2, 'Description', 'Target', 10, 'f1','resources_v2',path)

## Query Twitter

Calls the Twitter API once per keyword. Each search is saved as a CSV file and as a pickled, cleaned DataFrame under `TWITTER_SEARCHES/RAW_SEARCHES/`.

In [None]:
# SECURITY: never store the bearer token in the notebook. Export it as the
# TWITTER_BEARER_TOKEN environment variable before starting Jupyter. Any token
# that was ever committed in this file must be treated as leaked and revoked.
token = os.environ['TWITTER_BEARER_TOKEN']
#a selection of keywords based on MJI and musoW datasets
keywords = ['oral history', 'music magazine', 'sound archive', 'music history', 'music culture', 'music research', 'sheet music', 'music library', 'digital library', 'music collection', 'digital collection', 'sound recording', 'midi file', 'audio file', 'music information', 'musical score', 'digital score', 'song dataset', 'digital edition', 'digital archive', 'digital library', 'music archive', 'music library', 'archive collection']
#transfer keywords to input list w/ additional twitter parameters, e.g. no retweets
#dict.fromkeys drops repeated keywords (keeping their order) so that the same search is not sent twice
input_keywords = [f'\"{k}\" -is:retweet' for k in dict.fromkeys(keywords)]
#input time periods for search as a comma separated list
start = ['2022-01-01T00:00:00.000Z', '2022-02-01T00:00:00.000Z', '2022-03-01T00:00:00.000Z']
end = ['2022-01-31T00:00:00.000Z', '2022-02-28T00:00:00.000Z', '2022-03-31T00:00:00.000Z']
#input max results / counts, and path
mresults = 50 # max tweets per json response (the endpoint accepts 10-500)
mcount = 50 # max tweets per search period
#run the search!
for k in input_keywords:
    # file name = keyword without punctuation + first start date + month-day of the last end date,
    # e.g. 'musicmagazine_2022-01-01_3-31'
    filename = re.sub(r"([^A-Za-z0-9]+)", '', k) + f'_{start[0][0:10]}' + f'_{end[-1][6:10]}'
    filename = re.sub(r"isretweet", '', filename)
    prediction_twitter = twitter_search(token, k, start, end, mresults, mcount, filename)

#today = date.today()
#week_ago = today - timedelta(days=7)
#start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
#end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]


## Classify tweets

In [25]:
# Load the pickled search you want to classify (one of the .pkl files written
# by twitter_search under TWITTER_SEARCHES/RAW_SEARCHES/)
tweets_to_predict = pd.read_pickle(path+'TWITTER_SEARCHES/RAW_SEARCHES/musicmagazine_2022-01-01_3-31.pkl')

In [26]:
# Classify the tweets with the pickled 'twitter' model: keep those predicted
# as class 1, drop the ones whose URL matches `discard`, apply no extra text filter ('')
predicted = twitter_predictions(path, 'twitter', tweets_to_predict, 'tweet', 1, discard, '')
predicted

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW
0,Check out this listing I just added to my Posh...,1,4.754634,0.991462,165,https://poshmark.ca/listing/622a750e163df4b219...,"""music magazine"" -is:retweet"
1,Why Can't It Always Be Saturday? This 1978 ...,1,4.120266,0.984019,282,https://mainlynorfolk.info/folk/records/harryu...,"""music magazine"" -is:retweet"
2,"A good quiz day. First, a 4 on Wordle. Now, a ...",1,4.04156,0.982733,188,https://slate.com/news-and-politics/2022/01/sl...,"""music magazine"" -is:retweet"
3,NoiseHype are an online music magazine promoti...,1,3.752056,0.977069,120,http://noisehype.com/submit,"""music magazine"" -is:retweet"
4,Best 51 Twin Peaks Fan Tattoos – NSF – Music M...,1,3.525291,0.971399,85,https://carwaxo.com/best-51-twin-peaks-fan-tat...,"""music magazine"" -is:retweet"
5,There are just two surviving issues of The Mus...,1,3.490824,0.970426,197,https://www.atlasobscura.com/articles/first-bl...,"""music magazine"" -is:retweet"
6,"MOJO, the world’s largest UK music magazine, d...",1,3.291746,0.964145,267,https://libbyapp.com/search/southtyneside/sear...,"""music magazine"" -is:retweet"
7,IndiePulse Music Magazine Volume 2 Issue 8 Feb...,1,3.288155,0.96402,86,https://issuu.com/indiepulsemusicmagazine/docs...,"""music magazine"" -is:retweet"
8,Want GURANTEED Posting to music blogs and onli...,1,3.277115,0.963635,108,https://www.endsmedia.stream/p/subscribe-to-en...,"""music magazine"" -is:retweet"
9,Did you know... you can listen to our BBC Musi...,1,3.205786,0.961051,106,https://open.qobuz.com/playlist/8189647,"""music magazine"" -is:retweet"


## Scrape URLS

In [27]:
# Collect the URL of every tweet that was classified as relevant
twitter_link_list = list(predicted['URL'])

# Download, extract and summarise each linked page
scraped_links = scrape_links(twitter_link_list)

# Attach the scraped title/description to the tweet predictions (inner join on
# URL) and pickle the merged table for reuse
twitter_scrapes_preds = predicted.merge(scraped_links, on='URL')
twitter_scrapes_preds.to_pickle(path+'LOGREG_RELEVANCE/SCRAPES/musicmagazine_2022-01-01_3-31_scrapes.pkl')


https://mainlynorfolk.info/folk/records/harryupton.html
https://slate.com/news-and-politics/2022/01/slate-news-quiz-supreme-court-joe-biden-jeopardy.html?utm_medium=social&utm_campaign=traffic&utm_source=article&utm_content=twitter_share
http://noisehype.com/submit


Your max_length is set to 59, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


https://www.atlasobscura.com/articles/first-black-music-magazine
https://libbyapp.com/search/southtyneside/search/query-mojo/page-1/8840798
https://issuu.com/indiepulsemusicmagazine/docs/indiepulse_febuary_issue_8_2022_final
https://open.qobuz.com/playlist/8189647
https://www-xsnoize-com.cdn.ampproject.org/v/s/www.xsnoize.com/album-review-tears-for-fears-the-tipping-point/?amp_js_v=a6&amp_gsa=1&amp&usqp=mq331AQKKAFQArABIIACAw%3D%3D#aoh=16459126957862&referrer=https%3A%2F%2Fwww.google.com&amp_tf=De%20%251%24s&ampshare=https%3A%2F%2Fwww.xsnoize.com%2Falbum-review-tears-for-fears-the-tipping-point%2F
https://bit.ly/3lqjYoa
https://www.chamber-music.org/mag/2022/winter/index.html
https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-february-2022
https://bbc.in/33xHdqv
https://rockcritics.com/2013/03/29/from-the-archives-paul-williams-2001/
https://pocketmags.com/us/computer-music-magazine/january-2022
https://news.jamfestradio.com/2022/03/29/how-to-create-music-at-h

Your max_length is set to 59, but you input_length is only 51. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


https://pocketmags.com/eu/computer-music-magazine
https://news.sunsetmusicsupervision.com/2022/03/29/dynamyte-come-thru-indiepulse-music-magazine/
https://bit.ly/3u8CDu1
https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-march-2022


Your max_length is set to 59, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


https://ift.tt/6LSTgJv


Your max_length is set to 59, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


http://s.einnews.com/FrkoJjQAlR
https://buff.ly/3LsvHxd
https://news.jamfestradio.com/2022/03/29/celebrated-soul-pop-artist-laura-cheadle-reveals-details-for-her-sultry-new-single-the-lust-in-between-indiepulse-music-magazine/
https://gerald-pilcher.ml/reflections-of-darkness-music-magazine-live-review-omd-london-2022/


Your max_length is set to 59, but you input_length is only 41. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)


https://www.nbcnews.com/news/us-news/rolling-stone-magazine-settles-rape-story-lawsuit-1-65-million-n772006
https://linktr.ee/coreydrumz4real
https://www.classical-music.com/awards-2022/opera-award-2022/
https://buff.ly/32iq8gg, https://buff.ly/2VuvncS
https://www.rollingstone.co.uk/politics/features/can-labour-win-again-with-young-people-disillusioned-13129/?fbclid=IwAR1lMroek6FASVE5bKx56NY_TZjOMLAfWdcGtSmrgnK2ExKHKdTPKnPY4RI
https://news.jamfestradio.com/2022/03/29/hot-half-dozen-3-29-22-indiepulse-music-magazine/
https://indieshark.com/music-reviews/toronto-tabla-ensemble-for-the-love-of-tabla-lp/
https://fb.watch/aQP3cUsQ5r/
https://jamsphere.com/printed-magazine-issues/jamsphere-indie-music-magazine-january-2022


In [28]:
# Display the merged table of tweet predictions and scraped page descriptions
twitter_scrapes_preds

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,Title,Description
0,Why Can't It Always Be Saturday? This 1978 ...,1,4.120266,0.984019,282,https://mainlynorfolk.info/folk/records/harryu...,"""music magazine"" -is:retweet",Harry Upton,This 1978 Topic “Special Project” was a limit...
1,NoiseHype are an online music magazine promoti...,1,3.752056,0.977069,120,http://noisehype.com/submit,"""music magazine"" -is:retweet",Submit Your Music | NoiseHype!,NoiseHype! promotes the best new music from up...
2,There are just two surviving issues of The Mus...,1,3.490824,0.970426,197,https://www.atlasobscura.com/articles/first-bl...,"""music magazine"" -is:retweet",The Women Behind the First Black Music Magazin...,There are just two surviving issues of The Mu...
3,"MOJO, the world’s largest UK music magazine, d...",1,3.291746,0.964145,267,https://libbyapp.com/search/southtyneside/sear...,"""music magazine"" -is:retweet",Libby,
4,IndiePulse Music Magazine Volume 2 Issue 8 Feb...,1,3.288155,0.96402,86,https://issuu.com/indiepulsemusicmagazine/docs...,"""music magazine"" -is:retweet",IndiePulse Music Magazine Volume 2 Issue 8 Feb...,Advertisement
5,ALBUM REVIEW: Tears For Fears - The Tipping Po...,1,2.985365,0.951909,108,https://www-xsnoize-com.cdn.ampproject.org/v/s...,"""music magazine"" -is:retweet",ALBUM REVIEW: Tears For Fears - The Tipping Po...,The long wait is over as Tears for Fears rele...
6,Having released her 16th album Gimme Some Wine...,1,2.8383,0.944711,292,https://bit.ly/3lqjYoa,"""music magazine"" -is:retweet",Eleanor McEvoy - Hawk's Well Theatre Sligo,Eleanor McEvoy has released her 16th album ‘G...
7,I had the pleasure of interviewing Nez Perce j...,1,2.431952,0.919232,169,https://www.chamber-music.org/mag/2022/winter/...,"""music magazine"" -is:retweet",Chamber Music Magazine,
8,Jamsphere Indie Music Magazine February 2022 -...,1,2.313992,0.910029,94,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",Jamsphere Indie Music Magazine February 2022 |...,Jamsphere Indie Music Magazine February 2022 ...
9,It's time to test your grey matter with this ...,1,1.964726,0.877044,181,https://bbc.in/33xHdqv,"""music magazine"" -is:retweet",Let's test your knowledge of starched symphoni...,BBC Music Magazine Quiz: How good is your cla...


## Classify URLS

In [29]:
# Classify the scraped descriptions with the pickled 'resources_v2' model:
# keep class 1, drop rows whose URL or Title matches `discard`, and save the
# result as LOGREG_RELEVANCE/PREDICTIONS/musicmagazine_2022-01-01_3-31.csv
resources_predictions = resource_predictions(path, 'resources_v2', twitter_scrapes_preds, 'Description', 1, discard, 'musicmagazine_2022-01-01_3-31')
resources_predictions

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,Title,Description
0,Jamsphere Indie Music Magazine January 2022 - ...,1,4.181303,0.984951,180,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",Jamsphere Indie Music Magazine January 2022 | ...,Jamsphere Indie Music Magazine January 2022 –...
1,Jamsphere Indie Music Magazine February 2022 -...,1,3.204508,0.961004,184,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",Jamsphere Indie Music Magazine February 2022 |...,Jamsphere Indie Music Magazine February 2022 ...
2,Jamsphere Indie Music Magazine March 2022 - ht...,1,2.897023,0.947699,207,https://jamsphere.com/printed-magazine-issues/...,"""music magazine"" -is:retweet",Jamsphere Indie Music Magazine March 2022 | Ja...,Jamsphere Indie Music Magazine March 2022 – F...
3,HOT HALF DOZEN 3.29.22 – IndiePulse Music Maga...,1,2.067565,0.88771,192,https://news.jamfestradio.com/2022/03/29/hot-h...,"""music magazine"" -is:retweet",HOT HALF DOZEN 3.29.22 – IndiePulse Music Maga...,132 submissions this week . First single sing...
4,I had the pleasure of interviewing Nez Perce j...,1,1.51565,0.819897,0,https://www.chamber-music.org/mag/2022/winter/...,"""music magazine"" -is:retweet",Chamber Music Magazine,
5,IndiePulse Music Magazine Volume 2 Issue 8 Feb...,1,1.51565,0.819897,13,https://issuu.com/indiepulsemusicmagazine/docs...,"""music magazine"" -is:retweet",IndiePulse Music Magazine Volume 2 Issue 8 Feb...,Advertisement
6,Why Can't It Always Be Saturday? This 1978 ...,1,1.49897,0.817421,486,https://mainlynorfolk.info/folk/records/harryu...,"""music magazine"" -is:retweet",Harry Upton,This 1978 Topic “Special Project” was a limit...
7,The Women Behind the First Black Music Magazin...,1,1.260239,0.779067,995,http://s.einnews.com/FrkoJjQAlR,"""music magazine"" -is:retweet",The Women Behind the First Black Music Magazin...,There are just two surviving issues of The Mu...
8,Freddie De Tommaso's debut album has been nomi...,1,1.023203,0.735596,185,https://fb.watch/aQP3cUsQ5r/,"""music magazine"" -is:retweet",Decca Classics - Freddie De Tommaso's debut al...,Freddie De Tommaso's debut album has been nomi...
9,The Women Behind the First Black Music Magazin...,1,0.823686,0.695018,979,https://buff.ly/3LsvHxd,"""music magazine"" -is:retweet",The Women Behind the First Black Music Magazin...,There are just two surviving issues of The Mu...
