# Pipeline

- This notebook is used to test the stages of the Twitter pipeline: querying Twitter, classifying tweets, scraping the linked URLs, and classifying the scraped resources.

## Imports

In [1]:
path = '../'
import csv , dateutil.parser , time
from datetime import date , timedelta 
import os
# classifier
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
# web scraping
import requests
from bs4 import BeautifulSoup
#!pip3 install trafilatura
import trafilatura
from transformers import pipeline
#cleaning 
import emoji
import re
#functions
from PYTHON_FILES.LogReg_Searches import LogRegSearches

## Variables

In [2]:
# Descriptions training set -> v2 = musow+mji descriptions vs summarized scrapes from twitter searches.
# NOTE: read_pickle executes whatever the pickle contains; only load files from a trusted source.
archive_desc_training_v2 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

# Twitter training set -> v1 = tweets from bigrams vs tweets for digital humanities and music company.
twitter_training_set_v1 = pd.read_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

In [5]:
# Load a previously pickled table of "sheet music" tweets and display it.
# NOTE(review): the file lives under OLD_DO_NOT_USE -- confirm it is still meant to be read here.
music_collection = pd.read_pickle(path+'OLD_DO_NOT_USE/MUSOW BIGRAMS/twitter_sheet_music.pkl')
# Disabled: strip emojis from the tweet text before export.
#music_collection['tweet'] = music_collection['tweet'].apply(lambda x: emoji.replace_emoji(x, replace=''))
music_collection

Unnamed: 0,user,created_at,lang,like_count,quote_count,reply_count,retweet_count,tweet,url
3,LevittDF,2021-01-30 23:23:32+00:00,en,0,0,0,0,"Check out ""Didn't We"" BY JiMMY WEBB sheet musi...",https://www.ebay.com/itm/284166722848
5,DXSteveX,2021-01-30 23:01:58+00:00,en,2,0,0,1,"Memories - Xenoblade Chronicles (Piano Cover, ...",https://youtu.be/unX1F7D2OBM
7,CathyGriffindor,2021-01-30 22:44:34+00:00,en,0,0,0,0,EPUB Free The Beatles Sheet Music Collection =...,https://redjourneylibrary.blogspot.com/book55....
11,HarmonyTabs,2021-01-30 22:28:54+00:00,en,0,0,0,0,I Want A Hippopotamus For Christmas (Hippo The...,https://is.gd/YdFViV
12,LevittDF,2021-01-30 22:22:01+00:00,en,0,0,0,0,Check out Old Days Sheet Music https://t.co/IW...,https://www.ebay.com/itm/284166732838
...,...,...,...,...,...,...,...,...,...
11654,LibrarySheet,2021-12-26 18:34:59+00:00,en,3,0,0,0,Teachers and Mentors. Students and Teachers . ...,https://sheetmusiclibrary.website/2021/01/16/t...
11663,reading_your,2021-12-26 17:03:56+00:00,en,0,0,0,0,list book online new all book free | DANDY : 2...,http://goo.gl/RpFbdl
11665,decacorde,2021-12-26 16:53:40+00:00,en,8,0,1,0,“Ghrostmas Ghosts” my new 10 string guitar ori...,https://www.patreon.com/posts/60346043
11669,djtbird1,2021-12-26 16:49:53+00:00,en,1,0,0,0,"#CulturalLesson 483: ""Letto"" is #Italian for b...","https://en.wikipedia.org/wiki/Ghetto/, https:/..."


In [24]:
# Export the loaded tweet table as CSV next to the project root (`path`).
music_collection.to_csv(path+'twitter_music_collection.csv')

In [4]:
# Export the same table as an Excel workbook.
music_collection.to_excel(path+'twitter_sheet_music.xlsx')

## Functions

In [3]:
def lr_training(t_input, t_feature, target, cv_int, score_type, filename, path):
    """ Create a text classifier based on Logistic regression and TF-IDF. Use cross validation

    The fitted model and the fitted vectorizer are pickled to
    `{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl` and
    `{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl`, and a
    cross-validated classification report is printed.

    Parameters
    ----------
    t_input:
        dataframe of the training set
    t_feature:
        df column, text of tweet or description of the resource
    target:
        df column, [0,1] values
    cv_int: int
        the number of cross validation folding
    score_type: str
        precision or recall
    filename: str
        model file name
    path: str
        parent folder

    Returns
    -------
    the fitted LogisticRegressionCV model
    """
    # TODO eda to define max_features=1000

    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000)
    x_train = tfidf_transformer.fit_transform(t_input[t_feature])
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    model.fit(x_train, y_train)

    # export -- context managers guarantee the pickle files are flushed and closed
    export_model = f'LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    with open(path+export_model, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(path+export_vectorizer, 'wb') as vectorizer_file:
        pickle.dump(tfidf_transformer, vectorizer_file)

    # report
    # NOTE: x_train was vectorized with a TF-IDF vocabulary fit on every row, so the
    # held-out folds below are not fully unseen by the vectorizer.
    y_pred = cross_val_predict(model, x_train, y_train, cv=cv_int)
    report = classification_report(y_train, y_pred)
    print('report:', report, sep='\n')
    return model
    
def lr_predict(path, filename, p_input, p_feature):
    """ Classify text using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource

    Returns
    -------
    a copy of `p_input` with the extra columns 'Prediction', 'Score',
    'Probability' (second column of predict_proba) and 'Input Length'
    """
    export_model = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_model.pkl'
    export_vectorizer = f'{path}LOGREG_RELEVANCE/MODELS/{filename}_vectorizer.pkl'
    # NOTE: unpickling runs arbitrary code -- only load model files you produced yourself.
    with open(export_model, 'rb') as model_file:
        model = pickle.load(model_file)
    with open(export_vectorizer, 'rb') as vectorizer_file:
        tfidf_transformer = pickle.load(vectorizer_file)

    x_predict = tfidf_transformer.transform(p_input[p_feature])
    y_predict = model.predict(x_predict)
    scores = model.decision_function(x_predict)
    probability = model.predict_proba(x_predict)

    # work on a copy so the caller's dataframe is left untouched
    result = p_input.copy()
    result['Prediction'] = y_predict
    result['Score'] = scores
    result['Probability'] = probability[:,1]
    result['Input Length'] = result[p_feature].str.len()
    return result

def create_url(keyword, start_date, end_date, max_results):
    """Build the endpoint URL and query parameters for a full-archive tweet search.

    Returns a `(search_url, query_params)` tuple; `query_params['next_token']`
    is a placeholder the caller is expected to overwrite.
    """
    # Change to the endpoint you want to collect data from
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # change params based on the endpoint you are using
    search_window = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
    }
    field_selection = {
        'expansions': ','.join(('author_id', 'in_reply_to_user_id', 'geo.place_id')),
        'tweet.fields': ','.join((
            'id', 'text', 'author_id', 'in_reply_to_user_id', 'geo', 'conversation_id',
            'created_at', 'lang', 'public_metrics', 'referenced_tweets', 'reply_settings',
            'source', 'entities',
        )),
        'user.fields': ','.join((
            'id', 'name', 'username', 'created_at', 'description', 'public_metrics', 'verified',
        )),
        'place.fields': ','.join((
            'full_name', 'id', 'country', 'country_code', 'geo', 'name', 'place_type',
        )),
    }
    pagination = {'next_token': {}}
    return (endpoint, {**search_window, **field_selection, **pagination})
    
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the Twitter API and return the decoded JSON body.

    Parameters
    ----------
    url: str
        endpoint URL
    headers: dict
        request headers (carries the bearer token)
    params: dict
        params object received from create_url; its 'next_token' entry is overwritten in place
    next_token: str or None
        pagination token of the page to fetch
    timeout: float
        seconds to wait for the server before giving up, so a stalled
        connection cannot block the notebook forever

    Raises
    ------
    Exception
        with (status_code, response text) when the response code is not 200
    """
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def give_emoji_free_text(text):
    """Return `text` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def append_to_csv(json_response, fileName):
    """Append one row per tweet of a search response to a CSV file.

    Row layout: user, created_at, lang, like_count, quote_count, reply_count,
    retweet_count, tweet text (emojis removed), comma-joined expanded URLs.

    Parameters
    ----------
    json_response: dict
        decoded API response; reads json_response['includes']['users'] and json_response['data']
    fileName: str
        path of the CSV file to open in append mode (created if missing)
    """
    #setup usernames via includes
    username = {user['id']: user['username'] for user in json_response['includes']['users']}

    #A counter variable
    counter = 0

    #Open OR create the target CSV file; `with` closes it even if a tweet is malformed
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        #Loop through each tweet
        for tweet in json_response['data']:

            # 1. Username
            user = username[tweet['author_id']]

            # 2. Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # 3. Language
            lang = tweet['lang']

            # 4. Tweet metrics
            metrics = tweet['public_metrics']

            # 5. URLs: every expanded link that does not point back to twitter.com.
            # Computed fresh for each tweet so a tweet without links gets "" rather
            # than the links of the previous tweet.
            expanded_urls = [
                entry['expanded_url']
                for entry in tweet.get('entities', {}).get('urls', [])
                if 'twitter.com' not in entry['expanded_url']
            ]
            url = ', '.join(expanded_urls)

            # 6. Tweet text
            text = give_emoji_free_text(tweet['text'])

            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([
                user, created_at, lang,
                metrics['like_count'], metrics['quote_count'],
                metrics['reply_count'], metrics['retweet_count'],
                text, url,
            ])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

def twitter_search(token, keyword, start, end, mresults, mcount, file_name):
    """Search Twitter per time window, store the raw rows as CSV and a cleaned table as pickle.

    Files are written to `{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv|.pkl`,
    where `path` is the notebook-level variable (it is not a parameter).

    Parameters
    ----------
    token: str
        bearer token for the Twitter API
    keyword: str
        search query
    start: list of str
        start timestamp of every time window
    end: list of str
        end timestamp of every time window, aligned with `start`
    mresults: int
        `max_results` sent with every request
    mcount: int
        stop paging a time window once at least this many tweets were collected
    file_name: str
        base name (without extension) of the output files
    """
    # TODO filter tweets in english only OR tweak TF-IDF stopwords (lang detection)
    headers = {"Authorization": "Bearer {}".format(token)}
    csv_path = f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.csv'
    total_tweets = 0

    # Create file. The header is written only when the file is new or empty, so
    # re-running a search does not insert a second header line into the data rows.
    if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
        with open(csv_path, "a", newline="", encoding='utf-8') as csvFile:
            csv.writer(csvFile).writerow(['user', 'created_at', 'lang', 'like_count', 'quote_count', 'reply_count','retweet_count','tweet', 'URL'])

    for i in range(0, len(start)):
        count = 0 # Counting tweets per time period
        more_pages = True
        next_token = None

        # Stop when the last page was read or the per-period maximum is reached
        while more_pages and count < mcount:
            print("-------------------")
            print("Token: ", next_token)
            url = create_url(keyword, start[i], end[i], mresults)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
            result_count = json_response['meta']['result_count']
            has_results = result_count is not None and result_count > 0

            if 'next_token' in json_response['meta']:
                # Save the token to use for next call
                next_token = json_response['meta']['next_token']
                print("Next Token: ", next_token)
                if has_results and next_token is not None:
                    print("Start Date: ", start[i])
                    append_to_csv(json_response, csv_path)
                    count += result_count
                    total_tweets += result_count
                    print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                    print("-------------------")
                    time.sleep(5)
            # If no next token exists
            else:
                if has_results:
                    print("-------------------")
                    print("Start Date: ", start[i])
                    append_to_csv(json_response, csv_path)
                    count += result_count
                    total_tweets += result_count
                    print(f"Total # of Tweets added for '{keyword}':", total_tweets)
                    print("-------------------")
                    time.sleep(5)

                # Since this is the final request, move on to the next time period.
                more_pages = False
                next_token = None
            # pause between requests
            time.sleep(5)
    print("Total number of results:", total_tweets)

    df = pd.read_csv(csv_path, keep_default_na=False, dtype={"user": "string", "lang": "string", "tweet": "string", "URL": "string"})

    # clean the tweet from mentions and hashtag signs; assign the result back
    # instead of relying on an in-place replace of a column selection
    df['tweet'] = df['tweet'].replace( { r"@[A-Za-z0-9_]+": '' }, regex = True)
    df['tweet'] = df['tweet'].replace( { r"#": '' }, regex = True)

    # remove tweets that are not in english, have empty URLs, or have duplicate URLs
    df = df[df['lang'].isin(['en'])]
    df = df[df.URL != '']
    # .copy() so the column added below is written to an independent frame
    df = df.drop_duplicates(['URL'], keep='last').copy()

    #add a column for the search keyword
    df['Search KW'] = keyword

    #pickle df for reuse
    df.to_pickle(f'{path}TWITTER_SEARCHES/RAW_SEARCHES/{file_name}.pkl')

def _split_into_chunks(article, max_chunk=500):
    """Group the sentences of `article` into chunks of at most `max_chunk` space-separated words."""
    # mark sentence ends so the text can be split without losing the punctuation
    for mark in ('.', '?', '!'):
        article = article.replace(mark, mark + '<eos>')
    chunks = []
    for sentence in article.split('<eos>'):
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]


def scrape_links(link_list, pred_df, filename):
    """ Scrape links from classified tweets, save scrapes, combine them w/ tweets and return a DF for description classification.

    The merged table is pickled to `{path}LOGREG_RELEVANCE/SCRAPES/{filename}.pkl`,
    where `path` is the notebook-level variable (it is not a parameter).

    Parameters
    ----------
    link_list: list of str
        URLs to scrape
    pred_df:
        dataframe of classified tweets; must have a 'URL' column to merge on
    filename: str
        base name (without extension) of the pickle to write

    Returns
    -------
    `pred_df` inner-merged on 'URL' with the scraped 'Title' and 'Description'
    """
    records = []
    summarizer = pipeline("summarization", model='sshleifer/distilbart-cnn-12-6')

    for link in link_list:
        URL = link
        page = None
        try:
            # HEAD first so only HTML pages are downloaded
            head = requests.head(URL, timeout=15)
            content_type = head.headers.get("Content-Type", "None")
            if "text/html" in content_type.lower():
                page = requests.get(URL, timeout=15)
        except Exception:
            # best effort: unreachable or malformed links are skipped
            pass

        if not page:
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        title = ' '.join([t.text for t in soup.find('head').find_all('title')]).strip() \
            if soup and soup.find('head') and soup.find('body') is not None \
            else URL

        try:
            downloaded = trafilatura.fetch_url(URL)
            article = trafilatura.extract(downloaded, include_comments=False, include_tables=True, target_language='en', deduplicate=True)
        except Exception:
            # fall back to the headings and paragraphs found by BeautifulSoup
            article = ' '.join(node.text for node in soup.find_all(['h1', 'p']))

        text = article
        if article is not None and len(article) > 200:
            # text summarisation
            try:
                summaries = []
                for chunk in _split_into_chunks(article):
                    n_words = len(chunk.split())
                    if n_words == 0:
                        continue
                    # Length bounds are derived from the chunk being summarised. The
                    # previous expression read `text` before it was assigned for the
                    # current page. NOTE(review): 20%-50% of the chunk's word count is
                    # the assumed intent -- confirm.
                    min_len = max(1, int(0.2 * n_words))
                    max_len = max(min_len + 1, int(0.5 * n_words))
                    res = summarizer(chunk, min_length=min_len, max_length=max_len, do_sample=False)
                    summaries.append(res[0]['summary_text'])
                text = ' '.join(summaries)
            except Exception:
                # keep the link with its unsummarised text instead of dropping it
                text = article

        print(URL)
        records.append({'Title': title, 'Description': text, 'URL': URL.strip()})

    # build the table once instead of concatenating inside the loop
    links = pd.DataFrame(records, columns=['Title', 'Description', 'URL'])
    discard = ['None', '! D O C T Y P E h t m l >', '! d o c t y p e h t m l >', '! D O C T Y P E H T M L >']
    links = links.fillna('None')
    links = links[~links.Description.str.contains('|'.join(discard))]
    twitter_scrapes_preds = pd.merge(pred_df, links, on='URL')
    twitter_scrapes_preds.to_pickle(f'{path}LOGREG_RELEVANCE/SCRAPES/{filename}.pkl')
    print(len(twitter_scrapes_preds))
    return twitter_scrapes_preds

def twitter_predictions(path, filename, p_input, p_feature, score, discard, filter):
    """ Predict relevant tweets using a pickled model based on Logistic regression and TF-IDF.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: list of str
        terms (joined into one regex) that remove a tweet when found in its URL;
        an empty list disables this filter
    filter: str
        a string against which to further filter predictions; '' disables it

    Returns
    -------
    (preds, twitter_link_list): the filtered predictions sorted by 'Score'
    descending, and the list of their URLs
    """
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['tweet'], keep='last')
    preds = preds.loc[preds['Prediction'] == score]
    # An empty `discard` would join to the pattern '' which matches every URL and
    # would therefore remove all rows, so the filter is skipped in that case.
    if discard:
        preds = preds[~preds.URL.str.contains('|'.join(discard), na=False)]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds = preds[['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'URL', 'Search KW']]
    if filter != '':
        preds = preds[preds['tweet'].str.contains(filter, na=False)]
        preds = preds.reset_index(drop=True)
    twitter_link_list = preds['URL'].tolist()
    print('Total tweets classified:', len(preds))
    return preds, twitter_link_list

def resource_predictions(path, filename, p_input, p_feature, score, discard, savefile):
    """ Predict relevant URL descriptions using a pickled model based on Logistic regression and TF-IDF.

    The result is also written to `{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv`.

    Parameters
    ----------
    path: str
        parent folder
    filename: str
        model file name
    p_input:
        dataframe of the prediction set
    p_feature:
        df column, text of tweet or description of the resource
    score: int
        which prediction score to filter the results by 1/0
    discard: list of str
        terms (joined into one regex) that remove a row when found in its URL
        or Title; an empty list disables this filter
    savefile: str
        name for the final csv to be saved under

    Returns
    -------
    the filtered predictions sorted by 'Score' descending, or the string
    'Sorry no URLs to classify!' when there is nothing to classify
    """
    # Nothing to classify: besides the original empty-`filename` check, an empty
    # input frame now also short-circuits instead of reaching the vectorizer.
    if len(filename) == 0 or len(p_input) == 0:
        return 'Sorry no URLs to classify!'
    preds = lr_predict(path, filename, p_input, p_feature)
    preds = preds.drop_duplicates(['Description'], keep='last')
    preds = preds.loc[preds['Description'] != '']
    preds = preds.loc[preds['Prediction'] == score]
    # An empty `discard` would join to the pattern '' which matches everything and
    # would therefore remove all rows, so the filter is skipped in that case.
    if discard:
        discard_pattern = '|'.join(discard)
        preds = preds[~preds.URL.str.contains(discard_pattern, na=False)]
        preds = preds[~preds.Title.str.contains(discard_pattern, na=False)]
    preds = preds.sort_values(by='Score', ascending=False).reset_index(drop=True)
    preds.to_csv(f'{path}LOGREG_RELEVANCE/PREDICTIONS/{savefile}.csv')
    print(preds)
    return preds

def tweets_to_classify(path, filetype):
    """ Merge all tweet searches together.

    Parameters
    ----------
    path:
        parent folder; files are read from `{path}TWITTER_SEARCHES/RAW_SEARCHES/`
    filetype:
        the ending of the files to load, you can call just .pkl or also the date tag from file names

    Returns
    -------
    one dataframe with the rows of every matching pickle and a fresh 0..n-1
    index; an empty dataframe when no file matches
    """
    raw_searches = path+'TWITTER_SEARCHES/RAW_SEARCHES/'
    # Only matching files contribute a frame (non-matching files used to re-append
    # the previously loaded frame). Sorted so the row order is reproducible.
    # NOTE: read_pickle executes whatever the pickle contains; only load trusted files.
    frames = [
        pd.read_pickle(raw_searches+file)
        for file in sorted(os.listdir(raw_searches))
        if file.endswith(filetype)
    ]
    merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print('Total tweets to classify:', len(merged))
    return merged

## Training twitter and descriptions classifiers

This is a ONE TIME operation. The models are pickled and loaded later to predict new results

In [9]:
# one time training on twitter (disabled)
# NOTE(review): this disabled call passes one more positional argument (the `None`) than the
# active call below -- check it against the current LogRegSearches.train signature before re-enabling.
#twitter_training_model = LogRegSearches.train(twitter_training_set_v1, 'tweet', 'Target', 10, 'precision', 250, None, 'twitter_v1_250maxfeats', path)

# one time training on resources; the classification report below is printed by this call
resource_training_model = LogRegSearches.train(archive_desc_training_v2, 'Description', 'Target', 10, 'precision', 1000, 'resources_v2_1Kmaxfeats', path)

report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       538
           1       0.94      0.95      0.95       786

    accuracy                           0.94      1324
   macro avg       0.93      0.93      0.93      1324
weighted avg       0.93      0.94      0.93      1324



In [7]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Experiment: plain LogisticRegression on the same TF-IDF features, for comparison with the
# LogisticRegressionCV-based training above.
# NOTE(review): the vectorizer is fit on every description before cross-validation, so each
# held-out fold has already influenced the vocabulary/IDF weights. Putting the vectorizer
# inside the pipeline would give an unbiased estimate.
tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=1000) 
x_train = tfidf_transformer.fit_transform(archive_desc_training_v2['Description'])
y_train = archive_desc_training_v2['Target'].values
# single-step pipeline wrapping the classifier
model = make_pipeline(LogisticRegression(solver='liblinear', random_state=44))
# per-fold precision; `scores` is not used again in this cell
scores = cross_validate(model, x_train, y_train, scoring='precision', cv=10)
model.fit(x_train, y_train)
# out-of-fold predictions feed the report printed below
y_pred = cross_val_predict(model, x_train, y_train, cv=10)
report = classification_report(y_train, y_pred)
print('report:', report, sep='\n')

report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       538
           1       0.92      0.96      0.94       786

    accuracy                           0.93      1324
   macro avg       0.93      0.92      0.93      1324
weighted avg       0.93      0.93      0.93      1324



## Query Twitter

Calls Twitter API with the list of keywords and returns the table `prediction_twitter`

In [None]:
#load token from the environment -- credentials must never be stored in the notebook.
#The bearer token that used to be written here should be treated as leaked and revoked.
token = os.environ['TWITTER_BEARER_TOKEN']
#a selection of keywords based on MJI and musoW datasets

#choose KW list 
##best performing list based on v1 tests
better_keywords = ['audio file', 'music archive', 'music collection', 'music library', 'sheet music', 'sound archive', 'sound recording']

#custom timeframe for searching
start = ['2022-05-01T00:00:00.000Z', '2022-05-02T00:00:00.000Z', '2022-05-03T00:00:00.000Z', '2022-05-04T00:00:00.000Z', '2022-05-05T00:00:00.000Z', '2022-05-06T00:00:00.000Z', '2022-05-07T00:00:00.000Z']
end = ['2022-05-01T23:59:59.000Z', '2022-05-02T23:59:59.000Z', '2022-05-03T23:59:59.000Z', '2022-05-04T23:59:59.000Z', '2022-05-05T23:59:59.000Z', '2022-05-06T23:59:59.000Z', '2022-05-07T23:59:59.000Z']

#choose search option (both calls run as written; comment out the one you do not need)
## search last week
LogRegSearches.search_twitter_weekly(token, better_keywords, 500, 500)
## search custom timeframe
LogRegSearches.search_twitter_custom(token, better_keywords, start, end, 500, 500)

## Classify tweets

In [47]:
#load all search results into a single dataframe 
# Two archived search batches are loaded separately (April 2022, then Jan-March 2022);
# each call prints the number of tweets it loaded.
classified_tweets = LogRegSearches.tweets_to_classify(f'{path}TWITTER_SEARCHES/RAW_SEARCHES_ARCHIVE/April 2022 searches v1 (500)/', '2022-04-01_4-30.pkl')
classified_tweets_2 = LogRegSearches.tweets_to_classify(f'{path}TWITTER_SEARCHES/RAW_SEARCHES_ARCHIVE/Jan - March 2022 searches v1 (50)/', '2022-01-01_3-31.pkl')

Total tweets to classify: 3679
Total tweets to classify: 1310


In [None]:
#load all search results into a single dataframe
#(stored under its own name so it does not overwrite the `tweets_to_classify` function defined in this notebook)
may_tweets_to_classify = LogRegSearches.tweets_to_classify(path, '2022-05-01_5-05.pkl')

In [48]:
# Combine both archived batches into one table.
# NOTE: this cell is not idempotent -- running it twice appends the second batch twice,
# because it overwrites its own input.
classified_tweets = pd.concat([classified_tweets, classified_tweets_2])

In [54]:
#run classification and get links from results
# Returns the predicted tweets and the list of their URLs; the tweet count is printed by the call.
predicted_tweets, twitter_link_list = LogRegSearches.predict_twitter(path, 'twitter_v1_100maxfeats', classified_tweets, 'tweet', 1, '')

Total tweets classified: 1507


In [12]:
# Display the predicted tweets.
predicted_tweets

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,tweet date
0,[BGM素材] Japanese-Style Music Collection　https:...,1,0.026723,0.506680,63,https://www.dlsite.com/home/dlaf/=/t/s/link/wo...,"""music collection"" -is:retweet",2022-02-27
1,Yo check out Ade's music collection https://t...,1,0.023895,0.505973,60,https://music.sleevenote.com/@ade,"""music collection"" -is:retweet",2022-03-30
2,"Relaxing With Chinese Bamboo Flute, Guzheng, E...",1,0.023468,0.505867,106,https://apps.gtarcade.com/v1/#/post/53386?anch...,"""music collection"" -is:retweet",2022-03-30
3,"Relaxing With Chinese Bamboo Flute, Guzheng, E...",1,0.023468,0.505867,106,https://apps.gtarcade.com/v1/#/post/53386?anchor=,"""music collection"" -is:retweet",2022-03-30
4,IUMA (Internet Underground Music Archive) Coll...,1,0.023168,0.505792,76,https://ift.tt/33IiRug,"""music archive"" -is:retweet",2022-01-24
...,...,...,...,...,...,...,...,...
430,Everyone can move to the music! This poster h...,1,0.000050,0.500012,218,https://www.twinkl.co.uk/l/gatw9,"""music information"" -is:retweet",2022-03-15
431,Royalty Free Music library that will be with y...,1,0.000046,0.500012,241,https://music-for-video.com,"""music library"" -is:retweet",2022-03-30
432,"did u read this. if not, its prob helpful htt...",1,0.000040,0.500010,66,https://www.vulture.com/article/clifford-marti...,"""oral history"" -is:retweet",2022-02-27
433,Julian Urbano did some good work on this sam...,1,0.000013,0.500003,171,https://www.slideshare.net/caerolus/statistica...,"""music information"" -is:retweet",2022-02-10


## Scrape URLS

In [None]:
#scrape URL list and return a DF for resource classification
# The last argument is the name under which the scrape results are saved.
scraped_links = LogRegSearches.scrape_links(twitter_link_list, predicted_tweets, 'all_keywords_2022-01-01_04-31_scrapes_maxfeats200')

In [168]:
# Number of scraped rows merged with their tweets.
len(scraped_links) 

924

## Classify URLS

In [9]:
import os
# Reload every saved scrape of the Jan-April 2022 baseline into one table.
raw_searches = path+'LOGREG_RELEVANCE/SCRAPES/Jan-April 2022 baseline/'
filetype = 'scrapes.pkl'
# Read the matching pickles in sorted (reproducible) order and concatenate once,
# instead of growing the dataframe inside the loop.
scrape_frames = [
    pd.read_pickle(raw_searches+file)
    for file in sorted(os.listdir(raw_searches))
    if file.endswith(filetype)
]
scraped_links = pd.concat(scrape_frames, ignore_index=True) if scrape_frames else pd.DataFrame()
len(scraped_links)

1371

In [34]:
# Classify the scraped descriptions with the resources model, keeping rows predicted as 1;
# the last argument is the name under which the predictions are saved.
predicted_resources = LogRegSearches.predict_resource(path, 'resources_v2_100maxfeats', scraped_links, 'Description', 1, 'all_keywords_2022-01-01_04-30_desc_v2_100maxfeats')

In [35]:
# Display the predicted resources.
predicted_resources

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,Title,Description
0,https://t.co/OkEHI6XQu3 Music archive Wolfgang...,1,7.489973,0.999442,189,http://www.opusip.co.uk/2022/01/25/music-archi...,"""music archive"" -is:retweet",Music archive Wolfgang's Vault resolves copyri...,25th January 2022\nMusic archive Wolfgang’s Va...
1,Online concert archive Wolfgang's Vault has re...,1,6.473366,0.998458,308,https://reut.rs/3rTJWTx,"""music archive"" -is:retweet",Music archive Wolfgang's Vault resolves copyri...,Rocker Greg Kihn sued online archive over all...
2,Video Interview: Florida Sound Archive With Sa...,1,5.960977,0.997429,113,https://www.musiceternal.com/News/2022/Florida...,"""sound archive"" -is:retweet",Florida Sound Archive With Sam Rosenthal,Interview with Sam Rosenthal of Projekt Record...
3,Inmates at HMP Perth engage with archive birds...,1,5.949058,0.997398,216,https://www.listentosteve.com/doingbird,"""oral history"" -is:retweet",Steve Urquhart // doing bird,Inmates at HMP Perth engage with archive bird...
4,With first-hand access to the original film ar...,1,5.705338,0.996683,285,https://buff.ly/38mYNQm,"""archive collection"" -is:retweet",Introducing… THE ARCHIVE COLLECTION – Regal Robot,The Archive Collection – by Regal Robot™Intro...
...,...,...,...,...,...,...,...,...,...
480,"Hard to believe there isn't, although it appe...",1,0.021968,0.505492,298,https://digital.library.unt.edu/ark:/67531/met...,"""digital library"" -is:retweet",Norma and Mel Gabler: The Development and Caus...,The problem of this study was to trace throug...
481,Join Delyna Baxter as she discusses the true h...,1,0.009815,0.502454,193,http://bit.ly/register-delyna22,"""oral history"" -is:retweet",Webinar Registration - Zoom,"Zoom is a full-featured, easy-to-use, engagin..."
482,NowPlaying blues : Realize (Digital Edition Bo...,1,0.008761,0.502190,238,https://cowboysjukejoint.com/,"""digital edition"" -is:retweet",Cowboy's Juke Joint - Playing The Harder side ...,Cowboy’s Juke Joint is an independent non-com...
483,Concerts of the ensemble Bis-Quit band from St...,1,0.000688,0.500172,239,https://en.rwp.agency/news/953/,"""music culture"" -is:retweet",Concerts of the ensemble Bis-Quit band from St...,Kyrgyzstan will hold concerts of the Bis-Quit...


***

Compare final results for evaluation

In [36]:
# Manually assessed baseline: every row whose 'Add to musoW' value is not 'no' counts as relevant.
# NOTE: rows with a missing 'Add to musoW' value also compare unequal to 'no' and therefore
# land in `baseline_true`.
baseline = pd.read_csv(path+'LOGREG_RELEVANCE/FINAL_RESULTS/Jan-April_2022_assessed_by_Marilena.csv')
baseline['source'] = 'baseline'
baseline_true = baseline.loc[baseline['Add to musoW'] != 'no']
baseline_false = baseline.loc[baseline['Add to musoW'] == 'no']

# Predictions of the new model, to be compared against the baseline by URL.
evaluate = pd.read_csv(path+'LOGREG_RELEVANCE/PREDICTIONS/all_keywords_2022-01-01_04-30_desc_v2_100maxfeats.csv')
evaluate['source'] = 'new'

In [37]:
# Turn each baseline subset into a URL -> label lookup for Series.map.
# NOTE: the names are rebound from dataframes to dicts, so this cell can only run once
# per load of the baseline.
baseline_true = dict.fromkeys(baseline_true['URL'], 'true positive')
baseline_false = dict.fromkeys(baseline_false['URL'], 'false positive')

In [38]:
# Label every prediction by its URL: 'true positive' wins, then 'false positive',
# and URLs absent from the baseline are marked 'new'.
true_hits = evaluate['URL'].map(baseline_true)
false_hits = evaluate['URL'].map(baseline_false)
evaluate['sort'] = true_hits.where(true_hits == 'true positive', false_hits).fillna('new')

In [39]:
# Display the labelled predictions.
evaluate

Unnamed: 0.1,Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,URL,Search KW,Title,Description,source,sort
0,0,https://t.co/OkEHI6XQu3 Music archive Wolfgang...,1,7.489973,0.999442,189,http://www.opusip.co.uk/2022/01/25/music-archi...,"""music archive"" -is:retweet",Music archive Wolfgang's Vault resolves copyri...,25th January 2022\nMusic archive Wolfgang’s Va...,new,false positive
1,1,Online concert archive Wolfgang's Vault has re...,1,6.473366,0.998458,308,https://reut.rs/3rTJWTx,"""music archive"" -is:retweet",Music archive Wolfgang's Vault resolves copyri...,Rocker Greg Kihn sued online archive over all...,new,false positive
2,2,Video Interview:\nFlorida Sound Archive With S...,1,5.960977,0.997429,113,https://www.musiceternal.com/News/2022/Florida...,"""sound archive"" -is:retweet",Florida Sound Archive With Sam Rosenthal,Interview with Sam Rosenthal of Projekt Record...,new,true positive
3,3,Inmates at HMP Perth engage with archive birds...,1,5.949058,0.997398,216,https://www.listentosteve.com/doingbird,"""oral history"" -is:retweet",Steve Urquhart // doing bird,Inmates at HMP Perth engage with archive bird...,new,true positive
4,4,With first-hand access to the original film ar...,1,5.705338,0.996683,285,https://buff.ly/38mYNQm,"""archive collection"" -is:retweet",Introducing… THE ARCHIVE COLLECTION – Regal Robot,The Archive Collection – by Regal Robot™Intro...,new,false positive
...,...,...,...,...,...,...,...,...,...,...,...,...
480,480,"Hard to believe there isn't, although it appe...",1,0.021968,0.505492,298,https://digital.library.unt.edu/ark:/67531/met...,"""digital library"" -is:retweet",Norma and Mel Gabler: The Development and Caus...,The problem of this study was to trace throug...,new,new
481,481,Join Delyna Baxter as she discusses the true h...,1,0.009815,0.502454,193,http://bit.ly/register-delyna22,"""oral history"" -is:retweet",Webinar Registration - Zoom,"Zoom is a full-featured, easy-to-use, engagin...",new,new
482,482,NowPlaying blues : Realize (Digital Edition Bo...,1,0.008761,0.502190,238,https://cowboysjukejoint.com/,"""digital edition"" -is:retweet",Cowboy's Juke Joint - Playing The Harder side ...,Cowboy’s Juke Joint is an independent non-com...,new,false positive
483,483,Concerts of the ensemble Bis-Quit band from St...,1,0.000688,0.500172,239,https://en.rwp.agency/news/953/,"""music culture"" -is:retweet",Concerts of the ensemble Bis-Quit band from St...,Kyrgyzstan will hold concerts of the Bis-Quit...,new,false positive


In [33]:
# Save the labelled predictions; the dataframe index is written as an extra first column.
evaluate.to_csv(path+'LOGREG_RELEVANCE/FINAL_RESULTS/Jan-April_2022_desc_maxfeats_100.csv')