TWITTER

In [None]:
#dataframe function

def append_to_pd(json):
    """Flatten a Twitter search response into a DataFrame with one row per tweet.

    Parameters
    ----------
    json : dict
        Decoded response payload. Tweets are read from ``json['data']`` and
        author records from ``json['includes']['users']``. Either key may be
        absent (e.g. a page with no results), in which case an empty frame
        with the expected columns is returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``User, Created, Language, Likes, Quotes, Replies, RTs, Tweet,
        URL``. ``URL`` holds the first expanded URL attached to the tweet, or
        ``''`` when the tweet carries none, so every column has exactly one
        entry per tweet.
    """
    # Local import: `dateutil` is not imported by the notebook's import cell,
    # so relying on a global name fails on a fresh kernel.
    import dateutil.parser

    # Read everything from the argument (the original mixed in a global
    # `json_response`, silently combining data from two different responses).
    tweets = json.get('data', [])
    users = json.get('includes', {}).get('users', [])

    # username setup: author id -> handle
    username = {user['id']: user['username'] for user in users}

    # 1. username ('' when the author record was not included in the payload)
    user = [username.get(tweet['author_id'], '') for tweet in tweets]

    # 2. Time created
    created_at = [dateutil.parser.parse(tweet['created_at']) for tweet in tweets]

    # 3. Language
    lang = [tweet['lang'] for tweet in tweets]

    # 4. Tweet metrics
    metrics = [tweet['public_metrics'] for tweet in tweets]
    retweet_count = [m['retweet_count'] for m in metrics]
    reply_count = [m['reply_count'] for m in metrics]
    like_count = [m['like_count'] for m in metrics]
    quote_count = [m['quote_count'] for m in metrics]

    # 5. Tweet text
    text = [tweet['text'] for tweet in tweets]

    # 6. URL: exactly one entry per tweet. Appending one entry per link (or
    # none for an empty 'urls' list) would shift every later URL onto the
    # wrong row.
    url = []
    for tweet in tweets:
        links = tweet.get('entities', {}).get('urls') or []
        url.append(links[0]['expanded_url'] if links else '')

    # Create df with everything; dict order fixes the column order.
    dataframe = pd.DataFrame({
        'User': pd.Series(user, dtype='string'),
        'Created': pd.Series(created_at),
        'Language': pd.Series(lang, dtype='string'),
        'Likes': pd.Series(like_count),
        'Quotes': pd.Series(quote_count),
        'Replies': pd.Series(reply_count),
        'RTs': pd.Series(retweet_count),
        'Tweet': pd.Series(text, dtype='string'),
        'URL': pd.Series(url, dtype='string'),
    })

    return dataframe

SCRAPING

In [1]:
#imports and path
from __future__ import print_function
from bs4 import BeautifulSoup
import json
import urllib
import requests
import pandas as pd
# Base directory for every file read or written below. Paths are built by plain
# string concatenation (path + 'sub/file'), so the trailing slash is required.
# NOTE(review): absolute, user-specific location; the notebook will not run on
# another machine without editing this line.
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/'

In [2]:
#scrape URLs for title and text 

def scrape_links(link_list, timeout=30):
    """Fetch each URL and collect its page title and paragraph text.

    Parameters
    ----------
    link_list : iterable
        URLs to fetch.
    timeout : float, optional
        Seconds to wait for each request before giving up on that URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title, Description, URL`` with one row per page that was
        fetched and contained both a ``<head>`` and a ``<body>``. URLs that
        fail to download or parse are skipped.
    """
    rows = []
    for link in link_list:
        URL = link
        try:
            page = requests.get(URL, timeout=timeout)
        except Exception:
            # Any fetch failure (including connection errors) skips this URL.
            # Falling through here would parse the previous iteration's `page`
            # (or hit a NameError on the first URL).
            continue
        try:
            soup = BeautifulSoup(page.content, "html.parser")
            head = soup.find('head')
            body = soup.find('body')
            if head is None or body is None:
                continue
            title = ' '.join([t.text for t in head.find_all('title')]).strip()
            text = ' '.join([p.text for p in body.find_all('p')]).strip()
            rows.append({'Title': title, 'Description': text, 'URL': URL.strip()})
        except AssertionError:
            # Best-effort scrape: pages the parser rejects are skipped.
            continue
    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=['Title', 'Description', 'URL'])

In [12]:
# Load the pickled "music research" Twitter search results stored under TWITTER_SEARCHES/NEGATIVE.
music_research = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/music_research_21.pkl')

In [13]:
# Pull the `url` column out as a plain Python list, ready to be sliced into scraping batches.
links_to_add = music_research.url.to_list()

In [14]:
# Number of links available to scrape.
len(links_to_add)

613

In [None]:
# Scrape one batch of links: list positions 550 through 612 (slice end is exclusive).
links_to_add_9= scrape_links(links_to_add[550:613])

In [35]:
# Save this batch's scrape results as an Excel workbook under the project path.
links_to_add_9.to_excel(path+'scrape9.xlsx')

In [None]:
# Scrape another batch and save it as CSV.
# NOTE(review): `link_list` is only the parameter name inside scrape_links; no
# top-level variable of that name is defined in the visible cells, so this line
# presumably raises NameError on a fresh kernel -- confirm the intended source list.
links_to_add_18 = scrape_links(link_list[1800:1850])
links_to_add_18.to_csv(path+'scrape18.csv')
# Drop rows whose scraped Description is empty and renumber the index.
# NOTE(review): `links_to_add` is assigned a plain list (via `.to_list()`) in an
# earlier cell, and a list has no `.Description`; this filter looks like it was
# meant for a scraped DataFrame -- confirm which one.
links_to_add = links_to_add[links_to_add.Description != ''].reset_index(drop=True)
#links_to_add.to_csv(path+'test_2.csv')
#links_to_add.to_pickle('/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/DATA/TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_oral_history_link_scrape.pkl')

SPLIT PREDICTION FUNCTION

In [None]:
def lr_model(t_input, t_feature, target, score_type, filename, path):
    """Train a TF-IDF + logistic-regression text classifier and pickle it.

    Parameters
    ----------
    t_input : pandas.DataFrame
        Training data.
    t_feature : str
        Name of the text column used as the feature.
    target : str
        Name of the label column.
    score_type : str
        Scoring metric passed to ``LogisticRegressionCV(scoring=...)``.
    filename : str
        Stem used for the two output files.
    path : str
        Base directory (with trailing separator); files are written to
        ``LOGREG_RELEVANCE/{filename}_model.pkl`` and
        ``LOGREG_RELEVANCE/{filename}_vectorizer.pkl`` beneath it.

    Returns
    -------
    None
        The fitted model and the fitted text transformers are written to disk.
    """
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_count = count_vect.fit_transform(t_input[t_feature])
    x_train = tfidf_transformer.fit_transform(x_count)
    y_train = t_input[target].values
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=2, scoring=score_type)
    model.fit(x_train, y_train)

    model_file = path + f'LOGREG_RELEVANCE/{filename}_model.pkl'
    vectorizer_file = path + f'LOGREG_RELEVANCE/{filename}_vectorizer.pkl'
    with open(model_file, 'wb') as fh:
        pickle.dump(model, fh)
    # Persist BOTH fitted transformers. Previously the file-name string itself
    # was pickled, so nothing usable for prediction was ever saved.
    with open(vectorizer_file, 'wb') as fh:
        pickle.dump((count_vect, tfidf_transformer), fh)


def lr_predict(p_input, p_feature, filename, path):
    """Score new text with a model previously saved by ``lr_model``.

    Parameters
    ----------
    p_input : pandas.DataFrame
        Rows to classify.
    p_feature : str
        Name of the text column to vectorize.
    filename : str
        Stem of the pickles written by ``lr_model``.
    path : str
        Base directory (with trailing separator) containing ``LOGREG_RELEVANCE/``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``p_input`` with three added columns: ``Prediction`` (predicted
        label), ``Score`` (``decision_function`` value) and ``Probability``
        (per-class values from ``predict_log_proba``, i.e. log-probabilities).

    Raises
    ------
    ValueError
        If the vectorizer pickle does not hold the
        ``(CountVectorizer, TfidfTransformer)`` pair written by ``lr_model``
        (for example a file produced before that pair was saved).
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles that this project wrote itself.
    with open(path+f'LOGREG_RELEVANCE/{filename}_model.pkl', 'rb') as fh:
        model = pickle.load(fh)
    with open(path+f'LOGREG_RELEVANCE/{filename}_vectorizer.pkl', 'rb') as fh:
        saved = pickle.load(fh)
    if not (isinstance(saved, tuple) and len(saved) == 2):
        raise ValueError(
            f'{filename}_vectorizer.pkl does not contain fitted transformers; '
            're-run lr_model to regenerate it'
        )
    count_vect, tfidf_transformer = saved

    x_new_count = count_vect.transform(p_input[p_feature])
    x_new_train = tfidf_transformer.transform(x_new_count)
    y_predict = model.predict(x_new_train)
    # decision_function takes only the feature matrix.
    scores = model.decision_function(x_new_train)
    probability = model.predict_log_proba(x_new_train)

    result = p_input.copy()
    result['Prediction'] = list(y_predict)
    result['Score'] = list(scores)
    result['Probability'] = list(probability)
    return result

NEW TRAINING SET

In [None]:
# Positive examples are labelled '1', negatives '0' (as strings, cast below).
training_set_even = pd.read_pickle(path+'LOGREG_RELEVANCE/trainingset_even.pkl').assign(Target='1')
negative_set = pd.read_csv(path+'LOGREG_RELEVANCE/negative_set.csv').assign(Target='0')

# Stack both sets, turn the label into an integer and renumber the rows.
new_training_set = (
    pd.concat([training_set_even, negative_set])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
new_training_set.to_pickle(path+'new_training_set.pkl')

In [27]:
# Load the non-archive negatives, keep one row per Title, and label them '0'.
new_neg_set = (
    pd.read_excel(path+'LOGREG_RELEVANCE/non_archive_negative_set.xlsx')
    .drop_duplicates(subset=['Title'])
    .assign(Target='0')
)


In [28]:
# Load the extended positive set and label every row '1'.
training_set_even = (
    pd.read_pickle(path+'LOGREG_RELEVANCE/trainingset_even_extended.pkl')
    .assign(Target='1')
)

In [31]:
# Combine positives and the new negatives, cast the label to int, renumber rows.
new_training_set = (
    pd.concat([training_set_even, new_neg_set])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

In [16]:
# Persist the combined training set under the project path.
# NOTE(review): this cell's execution count (16) is lower than the cell that
# builds new_training_set above it (31), so the file on disk may hold an older
# frame -- re-run in order to confirm.
new_training_set.to_pickle(path+'new_training_set.pkl')

In [33]:
# Show only the rows labelled as negatives (Target == 0).
is_negative = new_training_set['Target'].eq(0)
new_training_set[is_negative]

Unnamed: 0,Title,Description,URL,Target
544,Postdoctoral Researcher - Digital Humanities (...,All jobs You cannot apply for this job anymore...,https://www.academictransfer.com/297340/,0
545,Nazmus Saquib - Digital Humanities: Reflection,"January 25, 2021 On January 27th, the Islamica...",https://www.nsaquib.org/blog/digital-humanitie...,0
546,Grants-in-Aid - Arte Publico Press,The University of Houston US Latino Digital Hu...,http://ow.ly/NJs050Dg1eA,0
547,Winner of the DHMS Prize - The Medieval Academ...,Remember Me \n 4/21/2022 » 4/24/2022Medieval A...,https://zurl.co/bGFb,0
548,2020 USLDH Mellon-Funded Grants-in-Aid Project...,"September 8, 2020 The University of Houston US...",https://l8r.it/JPWw,0
...,...,...,...,...
1065,University of Amsterdam: Henkjan Honing and Sa...,The funding was granted by the Platform Digita...,https://indiaeducationdiary.in/university-of-a...,0
1066,New Tung Auditorium Affilliate Organisations a...,The University of Liverpool is delighted to an...,https://www.miragenews.com/new-tung-auditorium...,0
1067,25 Surprising Remote Jobs You Can Do From Home...,Find a job faster! 50+ job categories Hand-scr...,https://buff.ly/2OCw3WR,0
1068,Digitizing the Grauman Collection – Silent Fil...,Silent Film Sound & Music Archive a digital re...,https://www.sfsma.org/ARK/22915/digitizing-the...,0
