# Pipeline

## Imports

In [7]:
path = '../'
import csv , dateutil.parser , time
from datetime import date , timedelta 
import os
# classifier
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline
#cleaning 
import emoji
import re
#functions
from PYTHON_FILES.LogReg_Searches import LogRegSearches

## Variables

In [2]:
# descriptions training set -> v2 = musow+mji descriptions vs summarized scrapes from twitter searches  
archive_desc_training_v2 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

# twitter training set -> v1 = tweets from bigrams vs tweets for digital humanities and music company 
twitter_training_set_v1 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

## Functions

In [3]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Create a text classifier based on Logistic regression and TF-IDF. Use cross validation 
    
    Parameters
    ----------
    t_input: 
        dataframe of the training set
    t_feature: 
        df column, text of tweet or description of the resource
    target: 
        df column, [0,1] values
    cv_int: int
        the number of cross validation folding
    score_type: str
        precision or recall
    filename: str
        model file name
    path: str
        parent folder
    """
    # TODO eda to define max_features=1000
      
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000) 
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    
    # export
    model.fit(x_train, y_train)
    export_model = f'LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    pickle.dump(model, open(path+export_model, 'wb'))
    pickle.dump(tfidf_transformer, open(path+export_vectorizer, 'wb'))
    
    # report
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.
    
    Parameters
    ----------
    p_input: 
        dataframe of the prediction set
    p_feature: 
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    """
    export_model = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    model = pickle.load(open(export_model, 'rb'))
    tfidf_transformer = pickle.load(open(export_vectorizer, 'rb'))
  
    #result = loaded_model.score(X_test, Y_test)
    #x_new_count = count_vect.transform(p_input[p_feature])
    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)
    
    #results = [r for r in y_predict]
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:,1]
    result['Input Length'] = result[p_feature].str.len()
    return result

def create_url(keyword, start_date, end_date, max_results):
        search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from
        #change params based on the endpoint you are using
        query_params = {'query': keyword,
                        'start_time': start_date,
                        'end_time': end_date,
                        'max_results': max_results,
                        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
                        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                        'next_token': {}}
        return (search_url, query_params)
    
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def give_emoji_free_text(text):
    return emoji.replace_emoji(text, replace='')

def append_to_csv(json_response, fileName):
    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    
    #setup usernames via includes
    username = {user['id']: user['username'] for user in json_response['includes']['users']}
    
    #Loop through each tweet
    for tweet in json_response['data']:

        # 1. Username
        author_id = tweet['author_id']
        user = username[author_id]

        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])

        # 3. Language
        lang = tweet['lang']

        # 4. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        #5. URLs w/ a catch for tweets w/ two links TODO: how to catch more than two links? 
        if ('entities' in tweet) and ('urls' in tweet['entities']):
            for url in tweet['entities']['urls']:
                url = [url['expanded_url'] for url in tweet['entities']['urls'] if 'twitter.com' not in url['expanded_url']]
                url = ', '.join(url)
        else:
            url = ""
        
        #6. Tweet text
        text = give_emoji_free_text(tweet['text']) 
        
        # Assemble all data in a list
        res = [user, created_at, lang, like_count, quote_count, reply_count, retweet_count, text, url]

        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1    
    
    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 

def twitter_search(token, keyword, start, end, mresults, mcount, file_name):
    
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    bearer_token = token
    headers = {"Authorization": "Bearer {}".format(bearer_token)} 
    start_list = start
    end_list =  end
    max_results = mresults
    total_tweets = 0

    # Create file
    csvFile = open(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv', "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'URL'])
    csvFile.close()

    for i in range(0,len(start_list)):
        # Inputs
        count = 0 # Counting tweets per time period
        max_count = mcount # Max tweets per time period
        flag = True
        next_token = None
        
        while flag:
            # Check if max_count reached
            if count >= max_count:
                break
            print("-------------------")
            print("Token: ", next_token)
            url = create_url(keyword, start_list[i],end_list[i], max_results)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            result_count = json_response['meta']['result_count']

            if 'next_token' in json_response['meta']:
                # Save the token to use for next call
                next_token = json_response['meta']['next_token']
                print("Next Token: ", next_token)
                if result_count is not None and result_count > 0 and next_token is not None:
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv')
                    count += result_count
                    total_tweets += result_count
                    print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                    print("-------------------")
                    time.sleep(5)                
            # If no next token exists
            else:
                if result_count is not None and result_count > 0:
                    print("-------------------")
                    print("Start Date: ", start_list[i])
                    append_to_csv(json_response, f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv')
                    count += result_count
                    total_tweets += result_count
                    print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                    print("-------------------")
                    time.sleep(5)

                #Since this is the final request, turn flag to false to move to the next time period.
                flag = False
                next_token = None
            time.sleep(5)
    print("Total number of results:", total_tweets)
    
    df = pd.read_csv(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv', keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "URL": "string"})
    
    # clean the tweet from meentions, hashtags, emojis
    df['tweet'].replace( { r"@[A-Za-z0-9_]+": '' }, inplace= True, regex = True)
    df['tweet'].replace( { r"#": '' }, inplace= True, regex = True)
    
    # remove tweets that are not in english, have empty URLs, or have duplicate URLs
    df = df[df['lang'].isin(['en'])]
    df = df[df.URL != '']
    df = df.drop_duplicates(['URL'], keep='last')

    #add a column for the search keyword
    df['Search KW'] = keyword

    #pickle df for reuse
    df.to_pickle(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.pkl')

def scrape_links(link_list, pred_df, filename):
    """ Scrape links from classified tweets, save scrapes, combine them w/ tweets and return a DF for description classification.
    
    Parameters
    ----------
    p_input: 
        dataframe of the prediction set
    p_feature: 
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    score: int
        which prediction score to filter the results by 1/0
    discard: variable
        a list of terms to check against to remove tweets
    filter: str 
        a string against which to further filter predictions 
    """
    links = pd.DataFrame(columns=['Title', 'Description', 'URL'])
    summarizer = pipeline("summarization", model='sshleifer/distilbart-cnn-12-6')
    
    for link in link_list:
        URL = link
        page = None
        ARTICLE = ''
        try:
            x = requests.head(URL, timeout=15)
            content_type = x.headers["Content-Type"] if "Content-Type" in x.headers else "None"
            if ("text/html" in content_type.lower()):
                page = requests.get(URL, timeout=15)
        except Exception:
            pass
        
        if page:
            soup = BeautifulSoup(page.content, "html.parser")
            title = ' '.join([t.text for t in soup.find('head').find_all('title')]).strip() \
                if soup and soup.find('head') and soup.find('body') is not None \
                else URL
            
            try:
                downloaded = trafilatura.fetch_url(URL)
                ARTICLE = trafilatura.extract(downloaded, include_comments=False, include_tables=True, target_language='en', deduplicate=True)
            except Exception:
                results = soup.find_all(['h1', 'p'])
                text = [result.text for result in results]
                ARTICLE = ' '.join(text)
            
            if ARTICLE is not None and len(ARTICLE) > 200:
                # text summarisation
                max_chunk = 500
                #removing special characters and replacing with end of sentence
                ARTICLE = ARTICLE.replace('.', '.<eos>')
                ARTICLE = ARTICLE.replace('?', '?<eos>')
                ARTICLE = ARTICLE.replace('!', '!<eos>')
                sentences = ARTICLE.split('<eos>')
                current_chunk = 0 
                chunks = []

                # split text to process
                for sentence in sentences:
                    if len(chunks) == current_chunk + 1: 
                        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                            chunks[current_chunk].extend(sentence.split(' '))
                        else:
                            current_chunk += 1
                            chunks.append(sentence.split(' '))
                    else:
                        chunks.append(sentence.split(' '))

                for chunk_id in range(len(chunks)):
                    chunks[chunk_id] = ' '.join(chunks[chunk_id])
                try:
                    res = summarizer(chunks, min_length = int(0.2 * len(text)), max_length = int(0.5 * len(text)), do_sample=False)
                    # summary
                    text = ' '.join([summ['summary_text'] for summ in res])
                except Exception:
                    text = ARTICLE
                    continue
            else:
                text = ARTICLE
            print(URL)
            new_row = {'Title': title, 'Description': text, 'URL': URL.strip()}
            new_df = pd.DataFrame(data=new_row, index=[0])
            links = pd.concat([links, new_df], ignore_index=True)
    discard = ['None', '! D O C T Y P E h t m l >', '! d o c t y p e h t m l >', '! D O C T Y P E H T M L >']
    links = links.fillna('None')
    links = links[~links.Description.str.contains('|'.join(discard))]
    twitter_scrapes_preds = pd.merge(pred_df, links, on='URL')
    twitter_scrapes_preds.to_pickle(f'{path}LOGREG_RELEVANCE/SCRAPES/{filename}.pkl')
    print(len(twitter_scrapes_preds))
    return twitter_scrapes_preds

def twitter_predictions(path, filename, p_input, p_feature, score, discard, filter):
    """ Predict relevant tweets using a pickled model based on Logistic regression and TF-IDF.
    
    Parameters
    ----------
    p_input: 
        dataframe of the prediction set
    p_feature: 
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    score: int
        which prediction score to filter the results by 1/0
    discard: variable
        a list of terms to check against to remove tweets
    filter: str 
        a string against which to further filter predictions 
    """
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['tweet'], keep='last')
    preds = preds.loc[preds['Prediction'] == score]
    preds = preds[~preds.URL.str.contains('|'.join(discard))]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds = preds[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'URL', 'Search KW']]
    if filter != '':
        preds = preds[preds['tweet'].str.contains(filter)]
        preds = preds.reset_index(drop=True)
    twitter_link_list = [link for link in preds['URL']]
    print('Total tweets classified:', len(preds))
    return preds, twitter_link_list

def resource_predictions(path, filename, p_input, p_feature, score, discard, savefile):
    """ Predict relevant URL descriptions using a pickled model based on Logistic regression and TF-IDF.
    
    Parameters
    ----------
    p_input: 
        dataframe of the prediction set
    p_feature: 
        df column, text of tweet or description of the resource
    filename: str
        model file name
    path: str
        parent folder
    score: int
        which prediction score to filter the results by 1/0
    discard: variable
        a list of terms to check against to remove tweets
    savefile: str
        name for the final csv to be saved under 
    """
    if len(filename) == 0:
        return 'Sorry no URLs to classify!'
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['Description'], keep='last')
    preds = preds.loc[preds['Description'] != '']
    preds = preds.loc[preds['Prediction'] == score]
    preds = preds[~preds.URL.str.contains('|'.join(discard))]
    preds = preds[~preds.Title.str.contains('|'.join(discard))]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds.to_csv(f'{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv')
    print(preds)
    return preds

def tweets_to_classify(path, filetype):   
    """ Merge all tweet searches together.
    
    Parameters
    ----------
    path: 
        for raw searches folder
    filetype: 
        the ending of the files to load, you can call just .pkl or also the date tag from file names
    """  
    raw_searches = path+'TWITTER_SEARCHES/RAW_SEARCHES/'
    result = pd.DataFrame()
    tweets_to_classify = pd.DataFrame()
    for file in os.listdir(raw_searches):
        if file.endswith(filetype):
            result = pd.read_pickle(raw_searches+file)
        tweets_to_classify = pd.concat([tweets_to_classify, result])
        tweets_to_classify = tweets_to_classify.reset_index(drop=True)
    print('Total tweets to classify:', len(tweets_to_classify))
    return tweets_to_classify

## Training twitter and descriptions classifiers

This is a ONE TIME operation. The models are pickled and loaded later to predict new results

In [None]:
# one time training on twitter
#twitter_training_model = lr_training(twitter_training_set_v1, 'tweet', 'Target', 10, 'precision', 'twitter', path)

# one time training on resources
#resource_training_model = lr_training(archive_desc_training_v2, 'Description', 'Target', 10, 'f1','resources_v2',path)

## Query Twitter

Calls Twitter API with the list of keywords and returns the table `prediction_twitter`

In [None]:
def twitter_search_weekly (token, keyword_list, max_results, max_counts):
    today = date.today()
    week_ago = today - timedelta(days=7)
    start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
    end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
    #input max results / counts, and path 
    mresults = max_results # max tweets per json response (100-500)
    mcount = max_counts # max tweets per search period 
    #format keywords for search
    input_keywords = [f'\"{k}\" -is:retweet' for k in keyword_list] 
    #send to search 
    for k in input_keywords:
        filename = re.sub(r"([^A-Za-z0-9]+)", '', k) + f'_{start[0][0:10]}' + f'_{end[-1][6:10]}'
        filename = re.sub(r"isretweet", '', filename)
        prediction_twitter = LogRegSearches.search_twitter(token, k, start, end, mresults, mcount, filename)

def twitter_search_custom (token, keyword_list, start_list, end_list, max_results, max_counts):
    start = start_list
    end = end_list
    #input max results / counts, and path 
    mresults = max_results # max tweets per json response (100-500)
    mcount = max_counts # max tweets per search period 
    #format keywords for search
    input_keywords = [f'\"{k}\" -is:retweet' for k in keyword_list] 
    #send to search 
    for k in input_keywords:
        filename = re.sub(r"([^A-Za-z0-9]+)", '', k) + f'_{start[0][0:10]}' + f'_{end[-1][6:10]}'
        filename = re.sub(r"isretweet", '', filename)
        prediction_twitter = LogRegSearches.search_twitter(token, k, start, end, mresults, mcount, filename)


In [None]:
#load token
token = 'AAAAAAAAAAAAAAAAAAAAAJgsNAEAAAAAQcsgbUnOJJmqmU483%2F8x6n9V1i8%3Df0qaEo9cV1sWP4eyNQ6E9s8BiRjvFTSN9mSqithe8uIXSNP68x'
#a selection of keywords based on MJI and musoW datasets

#choose KW list 
##OG keyword list
keywords = ['oral history', 'music magazine', 'sound archive', 'music history', 'music culture', 'music research', 'sheet music', 'music library', 'digital library', 'music collection', 'digital collection', 'sound recording', 'midi file', 'audio file', 'music information', 'musical score', 'digital score', 'song dataset', 'digital edition', 'digital archive', 'digital library', 'music archive', 'music library', 'archive collection']

##best performing list based on v1 tests
better_keywords = ['audio file', 'music archive', 'music collection', 'music library', 'sheet music', 'sound archive', 'sound recording']

#choose search option 

In [9]:
#load token
token = 'AAAAAAAAAAAAAAAAAAAAAJgsNAEAAAAAQcsgbUnOJJmqmU483%2F8x6n9V1i8%3Df0qaEo9cV1sWP4eyNQ6E9s8BiRjvFTSN9mSqithe8uIXSNP68x'
#a selection of keywords based on MJI and musoW datasets

#OG keyword list
keywords = ['oral history', 'music magazine', 'sound archive', 'music history', 'music culture', 'music research', 'sheet music', 'music library', 'digital library', 'music collection', 'digital collection', 'sound recording', 'midi file', 'audio file', 'music information', 'musical score', 'digital score', 'song dataset', 'digital edition', 'digital archive', 'digital library', 'music archive', 'music library', 'archive collection']

#best performing list based on v1 tests
better_keywords = ['audio file', 'music archive', 'music collection', 'music library', 'sheet music', 'sound archive', 'sound recording']

#transfer keywords to input list w/ additional twitter parameters, e.g. no retweets
input_keywords = [f'\"{k}\" -is:retweet' for k in keywords] 
#input time periods for search as a comma separated list, searches will be conducted for each time pair e.g. first of week/month, last of week/month 
start = ['2022-05-01T00:00:00.000Z', '2022-05-02T00:00:00.000Z', '2022-05-03T00:00:00.000Z', '2022-05-04T00:00:00.000Z', '2022-05-05T00:00:00.000Z', '2022-05-06T00:00:00.000Z', '2022-05-07T00:00:00.000Z']
end = ['2022-05-01T23:59:59.000Z', '2022-05-02T23:59:59.000Z', '2022-05-03T23:59:59.000Z', '2022-05-04T23:59:59.000Z', '2022-05-05T23:59:59.000Z', '2022-05-06T23:59:59.000Z', '2022-05-07T23:59:59.000Z']


#input max results / counts, and path 
mresults = 500 # max tweets per json response (100-500)
mcount = 500 # max tweets per search period 
#run the search! 
for k in input_keywords:
    filename = re.sub(r"([^A-Za-z0-9]+)", '', k) + f'_{start[0][0:10]}' + f'_{end[-1][6:10]}'
    filename = re.sub(r"isretweet", '', filename)
    prediction_twitter = LogRegSearches.search_twitter(token, k, start, end, mresults, mcount, filename)

#today = date.today()
#week_ago = today - timedelta(days=7)
#start = [week_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z")]
#end = [today.strftime("%Y-%m-%dT%H:%M:%S.000Z")]

-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-05-01T00:00:00.000Z
# of Tweets added from this response:  96
Total # of Tweets added for '"oral history" -is:retweet': 96
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-05-02T00:00:00.000Z
# of Tweets added from this response:  138
Total # of Tweets added for '"oral history" -is:retweet': 234
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-05-03T00:00:00.000Z
# of Tweets added from this response:  137
Total # of Tweets added for '"oral history" -is:retweet': 371
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-05-04T00:00:00.000Z
# of Tweets added from this response:  149
Total # of Tweets added for '"oral history" -is:retweet': 520
-------------------
-------------------
Token:

## Classify tweets

In [6]:
#load all search results into a single dataframe 
tweets_to_classify = LogRegSearches.tweets_to_classify(path, '2022-05-01_5-07.pkl')

In [7]:
tweets_to_classify

Unnamed: 0,user,created_at,lang,like_count,quote_count,reply_count,retweet_count,tweet,URL,Search KW
0,tuomas_ee,2022-01-30 16:27:08+00:00,en,25,3,0,8,Interested in high-quality musicology research...,https://www.durham.ac.uk/departments/academic/...,"""music research"" -is:retweet"
1,ancientlyric,2022-01-30 02:03:04+00:00,en,2,0,0,1,Each instrument in a mix costs $40-60/hour for...,http://Patreon.com/BettinaJoyDeGuzman,"""music research"" -is:retweet"
2,melissachemam,2022-01-29 18:58:32+00:00,en,0,0,0,0,The British Library launched a major research ...,https://www.rollingstone.co.uk/music/british-l...,"""music research"" -is:retweet"
3,CRHCIUSB,2022-01-29 12:50:00+00:00,en,0,0,0,0,One person finds a tucked away college archive...,https://losscaptureproject.cargo.site/More-tha...,"""music research"" -is:retweet"
4,KeithJonesJr,2022-01-29 05:14:58+00:00,en,0,1,0,1,Did not know Marsha Ambrosius wrote this. I’m ...,https://hellobeautiful.com/2741812/marsha-ambr...,"""music research"" -is:retweet"
...,...,...,...,...,...,...,...,...,...,...
7347,saareman,2022-03-10 13:45:46+00:00,en,1,0,0,0,Download the score at the Estonian Music Infor...,https://www.emic.ee/?sisu=uudis_edasi&mid=27&l...,"""music information"" -is:retweet"
7348,tt_edugraph,2022-03-09 16:15:00+00:00,en,0,0,0,0,Kolkata-based educational institutions and ass...,https://www.telegraphindia.com/edugraph/campus...,"""music information"" -is:retweet"
7349,CroatianMusic,2022-03-09 10:04:08+00:00,en,0,0,0,0,We are looking forward to another performance ...,https://mic.hr/en/performance-of-detonis-the-w...,"""music information"" -is:retweet"
7350,bemoreloyal,2022-03-08 14:09:13+00:00,en,3,0,0,1,InternationalWomensDay Nadine is a research ...,https://bit.ly/3MA6Bhe,"""music information"" -is:retweet"


In [19]:
#load the search you want to classify 
tweets_to_predict = pd.read_pickle(path+'TWITTER_SEARCHES/RAW_SEARCHES/musicresearch_2022-01-01_3-31.pkl')
tweets_to_predict

Unnamed: 0,user,created_at,lang,like_count,quote_count,reply_count,retweet_count,tweet,URL,Search KW
0,tuomas_ee,2022-01-30 16:27:08+00:00,en,25,3,0,8,Interested in high-quality musicology research...,https://www.durham.ac.uk/departments/academic/...,"""music research"" -is:retweet"
1,ancientlyric,2022-01-30 02:03:04+00:00,en,2,0,0,1,Each instrument in a mix costs $40-60/hour for...,http://Patreon.com/BettinaJoyDeGuzman,"""music research"" -is:retweet"
4,melissachemam,2022-01-29 18:58:32+00:00,en,0,0,0,0,The British Library launched a major research ...,https://www.rollingstone.co.uk/music/british-l...,"""music research"" -is:retweet"
7,CRHCIUSB,2022-01-29 12:50:00+00:00,en,0,0,0,0,One person finds a tucked away college archive...,https://losscaptureproject.cargo.site/More-tha...,"""music research"" -is:retweet"
8,KeithJonesJr,2022-01-29 05:14:58+00:00,en,0,1,0,1,Did not know Marsha Ambrosius wrote this. I’m ...,https://hellobeautiful.com/2741812/marsha-ambr...,"""music research"" -is:retweet"
...,...,...,...,...,...,...,...,...,...,...
205,EshitePeter,2022-03-22 23:22:37+00:00,en,3,0,0,0,"Check out this review of Peedoet:writers, musi...",https://goo.gl/maps/pfWC6mt7byfk2pwbA,"""music research"" -is:retweet"
208,HomesAtMetacoda,2022-03-22 21:25:00+00:00,en,0,0,0,0,An insight into how our brains respond to musi...,https://www.forbes.com/sites/evaamsen/2022/01/...,"""music research"" -is:retweet"
211,tuosmusic,2022-03-22 11:11:22+00:00,en,1,1,0,0,Today's research seminar! Exploring Women’s ...,https://www.sheffield.ac.uk/music/research/res...,"""music research"" -is:retweet"
214,AusMusician,2022-03-21 21:45:49+00:00,en,0,0,0,1,Regional Music Research Group is kicking off i...,https://australianmusician.com.au/youre-invite...,"""music research"" -is:retweet"


In [None]:
#classify tweets
predicted = twitter_predictions(path, 'twitter', tweets_to_predict, 'tweet', 1, discard, '')
predicted 

## Scrape URLS

In [176]:
#get links from positive tweets results
twitter_link_list = [link for link in predicted['URL']]

#scrape URL list
scraped_links = scrape_links(twitter_link_list)

#merge w/ predictions and save for reuse
twitter_scrapes_preds = pd.merge(predicted, scraped_links, on='URL')
twitter_scrapes_preds.to_pickle(path+'LOGREG_RELEVANCE/SCRAPES/soundarchive_2022-04-01_4-30_scrapes.pkl')


https://archive.org/details/stgigaarchive
https://tickets.nfsa.gov.au/Events/WHEN-THE-CAMERA-STOPPED-ROLLING-Q-A
https://fb.watch/cmoHcGeDdv/
https://spoti.fi/3MDqpzA
https://www.base.at/dorninger/soundmixer-2
http://store.rocksound.tv/simpleplan-tw
http://www.noise11.com/news/national-film-and-sound-archive-of-australia-restore-the-original-helen-reddy-i-am-woman-video-20220322
https://freesound.org/
https://tickets.nfsa.gov.au/Events/BLACK-ANZAC
https://video.alexanderstreet.com/watch/operation-babylift
https://fb.watch/cqYnrBpMUY/
https://rsa.fau.edu/album/42327
https://tinyurl.com/3ys8hsc6


Your max_length is set to 59, but you input_length is only 24. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


https://pastdaily.com/2019/05/15/sir-simon-rattle-with-the-berlin-philharmonic-in-music-of-schoenberg-and-mahler-2010-past-daily-mid-week-concert/
http://bit.ly/3IPdLLi
https://pastdaily.com/2021/01/13/january-13-1982-big-freeze-of-82-on-going-situation-in-poland-a-sit-down-with-hosni-mubarak/
https://bit.ly/3ODwhdT
https://www.nfsa.gov.au/collection/curated/chris-bailey-interviewed-iain-shedden
https://www.bl.uk/events/late-at-the-library-the-will-gregory-moog-ensemble
http://www.divfuse.com
https://www.bbc.co.uk/sounds/play/m0016h9r
https://richmix.org.uk/events/mwalimu-express/
https://www.eventbrite.co.uk/e/ya-lalla-jewish-saharans-singing-to-birth-tickets-277570158817
https://www.realgonerocks.com/2020/10/the-fall-fall-sound-archive-vol-5-imperial-wax-solvent/
https://www.afr.com/rear-window/national-screen-and-sound-archive-hunts-for-scott-love-rub-morrisson-20150813-giynw7


In [177]:
len(twitter_scrapes_preds)

22

## Classify URLS

In [178]:
resources_predictions = resource_predictions(path, 'resources_v2', twitter_scrapes_preds, 'Description', 1, discard, 'soundarchive_2022-04-01_4-30')
resources_predictions

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,Title,Description
0,Our Screen and Sound Archive is available in o...,1,3.880747,0.979782,255,https://tinyurl.com/3ys8hsc6,"""sound archive"" -is:retweet",West Glamorgan Screen and Sound Archive - Swansea,West Glamorgan Screen and Sound Archive is a ...
1,NFSA - National Film and Sound Archive of Aust...,1,2.624148,0.9324,174,https://fb.watch/cmoHcGeDdv/,"""sound archive"" -is:retweet",NFSA - National Film and Sound Archive of Aust...,Collection of General Motors Holden cinema an...
2,Feel free to jam with my new Sound Mixer 2 wit...,1,2.310839,0.909771,196,https://www.base.at/dorninger/soundmixer-2,"""sound archive"" -is:retweet",Sound Mixer #2 | Dorninger,"Wolfgang Dorninger mixes field-recordings, be..."
3,"This Sun 3 Apr, our family-friendly afternoon ...",1,1.946392,0.875053,146,https://richmix.org.uk/events/mwalimu-express/,"""sound archive"" -is:retweet",Mwalimu Express - Rich Mix,Live African Music from the cream of London’s...
4,Supposedly this was a Vick's Vaporub ad starri...,1,1.733931,0.849915,260,https://www.afr.com/rear-window/national-scree...,"""sound archive"" -is:retweet",National Film and Sound Archive hunts for Scot...,National Film and Sound Archive hunts for Sco...
5,National Film and Sound Archive of Australia R...,1,1.29511,0.785011,282,http://www.noise11.com/news/national-film-and-...,"""sound archive"" -is:retweet",http://www.noise11.com/news/national-film-and-...,The Australian Film and Sound Archive in Canb...
6,"Still rock in these smooth sounds, perfect wor...",1,1.112723,0.752636,129,https://archive.org/details/stgigaarchive,"""sound archive"" -is:retweet",St. GIGA - Tide of Sound Archive : St. GIGA : ...,"Reviewer: ""Thank you so much for archiving th..."
7,"Not sure, but you can learn a little more her...",1,0.467541,0.614802,182,https://bit.ly/3ODwhdT,"""sound archive"" -is:retweet",Vivien Mepham on Mad Max | NFSA,Vivien Mepham worked on the original Mad Max ...
8,"January 13, 1982 - The Big Freeze Of '82 - The...",1,0.463558,0.613858,269,https://pastdaily.com/2021/01/13/january-13-19...,"""sound archive"" -is:retweet","January 13, 1982 - The Big Freeze Of '82 - The...",The Big Freeze of 1982 dumped record amounts ...
9,"Imperial Wax Solvent, one of *the* great late-...",1,0.457169,0.612342,1400,https://www.realgonerocks.com/2020/10/the-fall...,"""sound archive"" -is:retweet",THE FALL – Fall Sound Archive Vol 5: Imperial ...,'Imperial Wax Solvent' is an album that has m...
