# TRAINING SET TRACKING

- This notebook tracks the evolution of the training sets for the musoW discovery pipeline for easy reference and (re)construction

***

In [1]:
path = '../'
import json

import pandas as pd
import emoji


## Description training set v1 
- musoW descriptions as positives
- MJI descriptions as negatives 
- base version + extended version (additions to negatives) + even version (for balancing)

DEPRECATED - DO NOT USE 

In [None]:
# Load the MJI export. Every column is read as pandas 'string' dtype, and
# keep_default_na=False keeps blank cells as text instead of missing values.
df_mji = pd.read_csv(path+'MJI/MJI_data.csv', keep_default_na=False, dtype='string')

# Keep only the three columns used below, lower-cased and stripped of
# surrounding whitespace so that titles can be compared across sources.
df_mji_small = pd.DataFrame({
    column: df_mji[column].str.lower().str.strip()
    for column in ['Title', 'Description', 'URL']
})

# Read the musoW JSON dump and grab the needed fields.
# The code expects data['results']['bindings'] to be a list of records in which
# 'name', 'description' and 'url' each hold a dict with a 'value' entry.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default. `json` is a standard-library module and must be
# imported in the notebook's import cell.
with open(path+'MUSOW/musow_name_desc_url_cat.json', encoding='utf-8') as file:
    data = json.load(file)

bindings = data['results']['bindings']

# Same normalisation as the MJI columns: trim whitespace, then lower-case.
musow_names = [binding['name']['value'].strip().lower() for binding in bindings]
musow_desc = [binding['description']['value'].strip().lower() for binding in bindings]
musow_url = [binding['url']['value'].strip().lower() for binding in bindings]

# One row per binding, with the same column names and dtype as df_mji_small.
df_musow = pd.DataFrame(
    {'Title': musow_names, 'Description': musow_desc, 'URL': musow_url}
).astype('string')

# Drop every MJI row whose title also appears in musoW, so that the same
# resource cannot end up labelled both positive and negative.
# NOTE(review): df_mji is read with keep_default_na=False, so dropna() probably
# removes nothing here - confirm.
title_in_musow = df_mji_small['Title'].isin(df_musow['Title'])
mji_training_set = df_mji_small[~title_in_musow].dropna()

# Base sets: musoW rows are the positives ('1'), remaining MJI rows the
# negatives ('0'). Target is a string here and is cast to int further down.
positive_df = df_musow.assign(Target='1')
negative_df = mji_training_set.assign(Target='0')

# Extended positives: musoW plus the ISMIR rows whose title is not in musoW.
# NOTE(review): ISMIR titles are not lower-cased or stripped in this cell;
# presumably the pickle is already normalised - confirm.
ismir_df = pd.read_pickle(path+'GH_PICKLES/ismir.pkl')
ismir_not_in_musow = ~ismir_df['Title'].isin(df_musow['Title'])
ismir_df = ismir_df[ismir_not_in_musow].dropna()
positive_df_adds = (
    pd.concat([df_musow, ismir_df])
    .reset_index(drop=True)
    .drop_duplicates(['Title'], keep='last')
    .assign(Target='1')
)

# Extended negatives: the base MJI negatives plus the additional MJI rows.
mji_additions_1 = pd.read_csv(path+'MJI/MJI_additions_for_LR.csv')
for text_column in ('Title', 'Description', 'URL'):
    mji_additions_1[text_column] = mji_additions_1[text_column].str.lower().str.strip()

# Discard additions already present in musoW, then those already present in
# the base MJI negatives; rows with missing values are dropped after each pass.
for known_set in (df_musow, mji_training_set):
    already_known = mji_additions_1['Title'].isin(known_set['Title'])
    mji_additions_1 = mji_additions_1[~already_known].dropna()

negative_df_adds = (
    pd.concat([mji_training_set, mji_additions_1])
    .reset_index(drop=True)
    .drop_duplicates(['Title'], keep='last')
    .assign(Target='0')
)

# Base training set: base positives followed by base negatives, with Target
# cast from string to integer and a fresh 0..n-1 index.
training_set = (
    pd.concat([positive_df, negative_df])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

# Extended training set: the same recipe applied to the extended sets.
training_set_adds = (
    pd.concat([positive_df_adds, negative_df_adds])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

# Pickle both for reuse by later cells and notebooks.
training_set.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset.pkl')
training_set_adds.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_extended.pkl')

# Even (balanced) base set: a seeded sample of the base positives combined
# with all base negatives.
# NOTE(review): n=128 is presumably the number of base negatives - confirm.
positive_df_2 = positive_df.sample(n=128, random_state=1)
training_set_even = (
    pd.concat([positive_df_2, negative_df])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
training_set_even.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_even.pkl')

# Even extended set: a seeded sample of positives combined with the extended
# negatives.
# NOTE(review): the sample is drawn from positive_df (base positives), not from
# positive_df_adds; n=272 is presumably the number of extended negatives - confirm both.
positive_df_3 = positive_df.sample(n=272, random_state=1)
training_set_even_adds = (
    pd.concat([positive_df_3, negative_df_adds])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
training_set_even_adds.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_even_extended.pkl')

***

## Description training set v2 + 3
- musoW and MJI descriptions as positives, using even extended set from v1
- manual and automatic scrapes from twitter searches for digital humanities, music companies, music industry as negatives for v2
- summarized automatic scrapes of negatives for v3

v3 is the version on which tests for V1 of the pipeline on Twitter were conducted, results of which are included in Marilena's report. 

In [None]:
# DESCRIPTION TRAINING SET V2 (written to archive_desc_training_v1.pkl)

# Negatives: the non-archive spreadsheet, de-duplicated first by title and
# then by description, labelled '0'.
new_neg_set = (
    pd.read_excel(path+'LOGREG_RELEVANCE/TRAINING_SETS/non_archive_negative_set_v1.xlsx')
    .drop_duplicates(subset=['Title'])
    .drop_duplicates(subset=['Description'])
    .assign(Target='0')
)

# Positives: every row of the v1 even extended set is relabelled '1', so the
# MJI rows that were negatives in v1 count as positives here.
positive_set = pd.read_pickle(
    path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_even_extended.pkl'
).assign(Target='1')

# Stack positives then negatives, cast Target to integer, renumber the rows.
archive_desc_training_v1 = (
    pd.concat([positive_set, new_neg_set])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
archive_desc_training_v1.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v1.pkl')

# DESCRIPTION TRAINING SET V3 (written to archive_desc_training_v2.pkl)

# Negatives: second version of the non-archive spreadsheet, de-duplicated
# first by title and then by description, labelled '0'.
new_neg_set_2 = (
    pd.read_excel(path+'LOGREG_RELEVANCE/TRAINING_SETS/non_archive_negative_set_v2.xlsx')
    .drop_duplicates(subset=['Title'])
    .drop_duplicates(subset=['Description'])
    .assign(Target='0')
)

# Positives: every row of the v1 extended set, relabelled '1'.
# NOTE(review): this reads trainingset_extended.pkl, whereas the V2 set reads
# trainingset_even_extended.pkl - confirm that the difference is intended.
positive_set_2 = pd.read_pickle(
    path+'LOGREG_RELEVANCE/TRAINING_SETS/trainingset_extended.pkl'
).assign(Target='1')

# Stack positives then negatives, cast Target to integer, renumber the rows.
archive_desc_training_v2 = (
    pd.concat([positive_set_2, new_neg_set_2])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
archive_desc_training_v2.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v2.pkl')

***

## Tweets Training Set v1 
- tweets from bigram searches for positives (see the keywords notebook for details of the research)
- tweets from searches for digital humanities, music companies, music industry for negatives (tied to v2+3 of description training set)

v1 is the version on which tests for V1 of the pipeline on Twitter were conducted, results of which are included in Marilena's report.  

In [None]:
# Negative tweets: the digital-humanities and music-company searches combined.
dh = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/digital_humanities_2021.pkl')
music_company = pd.read_pickle(path+'TWITTER_SEARCHES/NEGATIVE/music_company_2021.pkl')
twitter_neg = pd.concat([dh, music_company])

# Keep English tweets only, label them '0', then draw a seeded sample.
# NOTE(review): n=4379 is presumably the number of positive tweets - confirm.
english_only = twitter_neg['lang'] == 'en'
twitter_neg = (
    twitter_neg.loc[english_only]
    .assign(Target='0')
    .sample(n=4379, random_state=56)
)

# Only the tweet text and its label go into the training set.
twitter_neg = twitter_neg[['tweet', 'Target']].reset_index(drop=True)

# Positive tweets: one pickle per bigram search, loaded here in the same order
# in which they are concatenated below.
sound_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_sound_archive.pkl')
music_collection = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_collection.pkl')
digital_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_archive.pkl')
music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_archive.pkl')
song_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_song_dataset.pkl')
digi_music_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_digital_music_archive.pkl')
midi_file = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_midi_file.pkl')
music_data = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_data.pkl')
music_research = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS/twitter_music_research.pkl')
music_dataset = pd.read_pickle(path+'TWITTER_SEARCHES/MUSOW BIGRAMS/twitter_music_dataset.pkl')

twitter_pos = pd.concat([
    sound_archive,
    music_collection,
    digital_archive,
    music_archive,
    song_dataset,
    digi_music_archive,
    midi_file,
    music_data,
    music_research,
    music_dataset,
])

# Keep English tweets only and label them '1'; unlike the negatives, the
# positives are not down-sampled.
twitter_pos = twitter_pos.loc[twitter_pos['lang'] == 'en'].assign(Target='1')
twitter_pos = twitter_pos[['tweet', 'Target']].reset_index(drop=True)

# Final Twitter training set v1: positives followed by negatives, with Target
# cast from string to integer and the rows renumbered.
twitter_set = (
    pd.concat([twitter_pos, twitter_neg])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)
twitter_set.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v1.pkl')

***

## Description Training Set v4 

- Manual work:
    - Lengthen musow descs where possible
    - Add some new MJI descs
    - Add pos/neg from baseline test 
- Lower case  

In [3]:
# DESCRIPTION TRAINING SET V4

def _load_desc_v4_sheet(csv_path):
    """Read one v4 description CSV and return it cleaned and de-duplicated.

    The 'Notes' and 'Source' columns are dropped, 'Title' and 'Description'
    are lower-cased, and duplicate rows are removed by 'URL', then by 'Title',
    then by 'Description'. De-duplication runs after lower-casing, so titles
    and descriptions that differ only in case count as duplicates. URLs are
    compared as written.
    """
    sheet = pd.read_csv(csv_path).drop(labels=['Notes', 'Source'], axis=1)
    for text_column in ('Title', 'Description'):
        sheet[text_column] = sheet[text_column].str.lower()
    for key_column in ('URL', 'Title', 'Description'):
        sheet = sheet.drop_duplicates(subset=[key_column])
    return sheet


desc_neg_v4 = _load_desc_v4_sheet(path+'LOGREG_RELEVANCE/TRAINING_SETS/Pipeline_v3_sets/Description V4/Training Set Desc v4 - Negatives.csv')
desc_pos_v4 = _load_desc_v4_sheet(path+'LOGREG_RELEVANCE/TRAINING_SETS/Pipeline_v3_sets/Description V4/Training Set Desc v4 - Positives.csv')

# Negatives first, then positives. The Target labels come from the CSV files
# themselves and are cast to integer here.
archive_desc_training_v4 = (
    pd.concat([desc_neg_v4, desc_pos_v4])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

archive_desc_training_v4.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/archive_desc_training_v4.pkl')

***

## Twitter Training Set v2

- Manual work:
    - Clean up positives from v1 set, remove all dupes, spam, non-English
    - Take negs from baseline test + max feats 100 test, clean up 
- Lower case
- Remove emojis   

In [3]:
# TWITTER TRAINING SET V2

def _load_twitter_v2_sheet(csv_path):
    """Read one Twitter v2 CSV and return it cleaned and de-duplicated.

    The 'Source' column is dropped, tweets are lower-cased and stripped of
    emojis, duplicate tweets are removed, and 'Target' is cast to int.
    NOTE(review): emoji.replace_emoji is applied to every cell, so this assumes
    the 'tweet' column has no missing values - confirm.
    """
    sheet = pd.read_csv(csv_path).drop(labels=['Source'], axis=1)
    lowered = sheet['tweet'].str.lower()
    sheet['tweet'] = lowered.apply(lambda text: emoji.replace_emoji(text, replace=''))
    sheet = sheet.drop_duplicates(subset=['tweet'])
    sheet['Target'] = sheet['Target'].astype(int)
    return sheet


twitter_neg_v2 = _load_twitter_v2_sheet(path+'LOGREG_RELEVANCE/TRAINING_SETS/Pipeline_v3_sets/Twitter V2/Twitter v2 - Negatives.csv')
twitter_pos_v2 = _load_twitter_v2_sheet(path+'LOGREG_RELEVANCE/TRAINING_SETS/Pipeline_v3_sets/Twitter V2/Twitter v2 - Positives.csv')

# Negatives first, then positives, with the rows renumbered.
twitter_training_v2 = pd.concat([twitter_neg_v2, twitter_pos_v2]).reset_index(drop=True)

# NOTE(review): the save below is disabled, so running this cell does not write
# twitter_training_v2.pkl - confirm whether that is intentional.
#twitter_training_v2.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v2.pkl')

In [5]:
# ALTERNATIVE NEGATIVES for Twitter training set v2

# Source: the digital-humanities search pickle under OLD_DO_NOT_USE, which is a
# different folder from the one read when building Twitter training set v1.
twitter_neg_v2_alt = pd.read_pickle(path+'OLD_DO_NOT_USE/NEGATIVE/digital_humanities_2021.pkl')

# English tweets only; same text clean-up as the v2 CSVs (lower-case, no emojis).
english_rows = twitter_neg_v2_alt['lang'] == 'en'
twitter_neg_v2_alt = twitter_neg_v2_alt.loc[english_rows]
lowered_tweets = twitter_neg_v2_alt['tweet'].str.lower()
twitter_neg_v2_alt = (
    twitter_neg_v2_alt
    .assign(tweet=lowered_tweets.apply(lambda text: emoji.replace_emoji(text, replace='')))
    .assign(Target='0')
    .astype({'Target': int})
    # NOTE(review): n=510 is presumably chosen to balance the v2 positives - confirm.
    .sample(n=510, random_state=33)
)
twitter_neg_v2_alt = twitter_neg_v2_alt[['tweet', 'Target']].reset_index(drop=True)

# Pair the alternative negatives with the v2 positives built in the cell above
# (twitter_pos_v2 must already exist in the kernel).
twitter_training_v2_alt = pd.concat([twitter_neg_v2_alt, twitter_pos_v2]).reset_index(drop=True)

twitter_training_v2_alt.to_pickle(path+'LOGREG_RELEVANCE/TRAINING_SETS/twitter_training_v2_alt.pkl')