TO DO:
- supervised ce? 
- replicate w/ twitter dfs? 
- try different types of scoring in logreg? different test sizes?
- export model and reuse w/ predictions (use additions to test against base set)  

In [1]:
#imports + path
import pandas as pd
import json
import re
import numpy as np
import category_encoders as ce
import pickle
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from pandas_profiling import ProfileReport
#root folder of the project data; file names are built by plain string concatenation (path + 'MJI/...'), so the trailing slash is required
#NOTE(review): absolute, machine-specific location - adjust before running on another machine
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/'

Part 1: 
- create DFs for the training set using the musoW and MJI spreadsheets
- two versions: a base one and one w/ additions from ismir dataset (for musoW) and from additional research (for MJI)
- the baseline difference we want to train the LogReg model on is that musoW is focused on music archives w/ datasets while MJI is focused on all sorts of music archives, regardless of dataset inclusion
- to avoid polluting the two sets we check for duplicates at all possible stages to ensure that the MJI set does not have anything that currently exists in musoW 

In [None]:
#load the MJI spreadsheet as strings, then keep a lower-cased, trimmed copy of the three text columns
df_mji = pd.read_csv(path+'MJI/MJI_data.csv', keep_default_na=False, dtype='string')
df_mji_small = pd.DataFrame({
    column: df_mji[column].str.lower().str.strip()
    for column in ['Title', 'Description', 'URL']
})

In [None]:
#read the musow json dump; each binding provides a name, a description and a url
with open(path+'MUSOW/musow_name_desc_url_cat.json') as file:
    data = json.load(file)

def _binding_values(field):
    """Return the trimmed, lower-cased 'value' of `field` for every binding in the dump."""
    return [binding[field]['value'].strip().lower() for binding in data['results']['bindings']]

musow_names = _binding_values('name')
musow_desc = _binding_values('description')
musow_url = _binding_values('url')
df_musow = pd.DataFrame(
    {'Title': musow_names, 'Description': musow_desc, 'URL': musow_url},
    columns=['Title', 'Description', 'URL'],
).astype('string')

In [None]:
#keep only the MJI rows whose title is not already in musoW, then drop rows with missing values
already_in_musow = df_mji_small['Title'].isin(df_musow['Title'])
mji_training_set = df_mji_small.loc[~already_in_musow].dropna()

In [None]:
#base sets w/o additions: musoW rows are labelled '1' (positive), the remaining MJI rows '0' (negative)
positive_df = df_musow.assign(Target='1')
negative_df = mji_training_set.assign(Target='0')

In [None]:
#extended sets w/ additions: musoW + ISMIR rows as positives, MJI + extra MJI rows as negatives

#positives: ISMIR rows not already in musoW are appended, duplicate titles keep the last occurrence
ismir_df = pd.read_pickle(path+'GH_PICKLES/ismir.pkl')
ismir_is_new = ~ismir_df['Title'].isin(df_musow['Title'])
ismir_df = ismir_df[ismir_is_new].dropna()
positive_df_adds = (
    pd.concat([df_musow, ismir_df])
    .reset_index(drop=True)
    .drop_duplicates(['Title'], keep='last')
)
positive_df_adds['Target'] = '1'

#negatives: normalise the additions like the main MJI sheet, then remove anything already known
mji_additions_1 = pd.read_csv(path+'MJI/MJI_additions_for_LR.csv')
for text_column in ['Title', 'Description', 'URL']:
    mji_additions_1[text_column] = mji_additions_1[text_column].str.lower().str.strip()
known_in_musow = mji_additions_1['Title'].isin(df_musow['Title'])
mji_additions_1 = mji_additions_1[~known_in_musow].dropna()
known_in_mji = mji_additions_1['Title'].isin(mji_training_set['Title'])
mji_additions_1 = mji_additions_1[~known_in_mji].dropna()
negative_df_adds = (
    pd.concat([mji_training_set, mji_additions_1])
    .reset_index(drop=True)
    .drop_duplicates(['Title'], keep='last')
)
negative_df_adds['Target'] = '0'

In [None]:
#prediction set for later: only the additions (ISMIR rows + extra MJI rows), pickled for reuse
#no Target column is added to the additions here
prediction_set = pd.concat([ismir_df, mji_additions_1], ignore_index=True)
prediction_set.to_pickle(path+'LOGREG_RELEVANCE/base_prediction_set.pkl')

In [None]:
#merge positives and negatives into three training sets (base, extended, even) and pickle them for reuse
def _build_training_set(positive_rows, negative_rows):
    """Stack positives on top of negatives, cast Target to int and renumber the index."""
    combined = pd.concat([positive_rows, negative_rows])
    combined['Target'] = combined['Target'].astype('int')
    return combined.reset_index(drop=True)

training_set = _build_training_set(positive_df, negative_df)
training_set_adds = _build_training_set(positive_df_adds, negative_df_adds)
training_set.to_pickle(path+'LOGREG_RELEVANCE/title_desc_url_trainingset.pkl')
training_set_adds.to_pickle(path+'LOGREG_RELEVANCE/title_desc_url_trainingset_extended.pkl')

#even set: randomly sample the extended positives down to the size of the extended negatives
#the sample size follows the data instead of a hard-coded row count, so the set stays balanced when rows are added
#(sample() raises a ValueError if there are fewer positives than negatives)
even_sample_size = len(negative_df_adds)
positive_df_adds_2 = positive_df_adds.sample(n=even_sample_size, random_state=1)
training_set_even = _build_training_set(positive_df_adds_2, negative_df_adds)
training_set_even.to_pickle(path+'LOGREG_RELEVANCE/title_desc_url_trainingset_even.pkl')

In [None]:
#create another version of base/extended/even sets with combined features, pickle for reuse
#start from copies so the sets with separate columns stay untouched
training_set_comb, training_set_adds_comb, training_set_comb_even = (
    frame.copy() for frame in (training_set, training_set_adds, training_set_even)
)

#create combined columns for desc+headline and desc+headline_url
def tokenize_url(url:str):
    """Turn a URL into a space-separated string of its word-like parts.

    Every occurrence of "https", "http" and "www" is removed (anywhere in the
    string, not only at the start), then each run of non-word characters
    and/or underscores is collapsed into a single space.

    Parameters
    ----------
    url : str
        The URL to tokenize.

    Returns
    -------
    str
        The tokenized URL; separators at the start or end leave a single
        leading/trailing space.
    """
    # "https" must go before "http", otherwise a stray "s" is left behind
    for noise in ("https", "http", "www"):
        url = url.replace(noise, "")
    # raw string: "\W" inside a plain literal is an invalid escape sequence
    return re.sub(r"(\W|_)+", " ", url)

#collapse Description, Title and the tokenized URL into one text feature, for base, extended and even sets
def _merge_text_columns(frame):
    """Replace Title/Description/URL in `frame` (in place) with one combined text column."""
    frame['tokenized_url'] = frame['URL'].apply(tokenize_url)
    #description + title + tokenized url
    frame['text_desc_headline_url'] = frame['Description'] + ' ' + frame['Title'] + " " + frame['tokenized_url']
    frame.drop(columns=['tokenized_url', 'Title', 'Description', 'URL'], inplace=True)

for combined_frame, pickle_name in [
    (training_set_comb, 'LOGREG_RELEVANCE/title_desc_url_trainingset_comb.pkl'),
    (training_set_adds_comb, 'LOGREG_RELEVANCE/title_desc_url_trainingset_ext_comb.pkl'),
    (training_set_comb_even, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb.pkl'),
]:
    _merge_text_columns(combined_frame)
    combined_frame.to_pickle(path+pickle_name)

In [None]:
#print some base stats to keep track of changes
def _mean_length(frame, column):
    """Average number of characters of the strings in `column` of `frame`."""
    return frame[column].str.len().mean()

#training set size and avg length of description (main feature)
print(
    'Size of base set:', len(training_set.index),
    '\nMean length of description in base set is:', _mean_length(training_set, 'Description'),
    '\nSize of extended set:', len(training_set_adds.index),
    '\nMean length of description in extended set is:', _mean_length(training_set_adds, 'Description'),
    '\nSize of even set:', len(training_set_even.index),
    '\nMean length of description in even set is:', _mean_length(training_set_even, 'Description'),
)

#avg length of combined desc+title+url (main feature)
print(
    '\nMean length of description in base set combined is:', _mean_length(training_set_comb, 'text_desc_headline_url'),
    '\nMean length of description in extended set combined is:', _mean_length(training_set_adds_comb, 'text_desc_headline_url'),
    '\nMean length of description in even set combined is:', _mean_length(training_set_comb_even, 'text_desc_headline_url'),
)

Part 2:
- Try different unsupervised encoding approaches for categorical variables including OrdinalEncoder, Label Encoding, Tf-Idf, BackwardDifferenceEncoder 
- Ordinal and Label encoding use title, desc, and url + combined title/desc and title/desc/url as features depending on source version of training set
- BackwardDifference uses title, desc, and url + separated title/desc and title/desc/url as features (in order to see if there's any diff btw the two different combination of strings approach)
- Tf-Idf uses only desc, title/desc, and title/desc/url as features 

In [None]:
#encode categorical variables w/ OrdinalEncoder, pickle
ord_enc = OrdinalEncoder()

def _ordinal_encode(source, encoded_names, pickle_name):
    """Return a copy of `source` with ordinal-encoded columns added, after pickling it.

    `encoded_names` maps each source column to the name of its encoded column.
    The shared `ord_enc` is refitted on every single column.
    """
    encoded = source.copy()
    for raw_column, encoded_column in encoded_names.items():
        encoded[encoded_column] = ord_enc.fit_transform(encoded[[raw_column]])
    encoded.to_pickle(path+pickle_name)
    return encoded

ordinal_separate = {'Title': 'Title_encoded', 'Description': 'Desc_encoded', 'URL': 'URL_encoded'}
ordinal_combined = {'text_desc_headline_url': 'text_desc_headline_url_encoded'}

#base set, then base set comb
training_set_ordinal = _ordinal_encode(
    training_set, ordinal_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_ordinal.pkl')
training_set_ordinal_comb = _ordinal_encode(
    training_set_comb, ordinal_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_comb_ordinal.pkl')

#extended set, then extended set comb
training_set_ordinal_adds = _ordinal_encode(
    training_set_adds, ordinal_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_ordinal.pkl')
training_set_ordinal_adds_comb = _ordinal_encode(
    training_set_adds_comb, ordinal_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_ordinal.pkl')

#even set, then even set comb
training_set_ordinal_even = _ordinal_encode(
    training_set_even, ordinal_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_ordinal.pkl')
training_set_ordinal_even_comb = _ordinal_encode(
    training_set_comb_even, ordinal_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_ordinal.pkl')

In [None]:
#encode cat variables with label encoding (pandas category codes), pickle
def _label_encode(source, code_names, pickle_name):
    """Return a copy of `source` with category-code columns added, after pickling it.

    `code_names` maps each source column to the name of its code column.
    The source columns themselves are converted to the 'category' dtype.
    """
    labelled = source.copy()
    for raw_column in code_names:
        labelled[raw_column] = labelled[raw_column].astype('category')
    for raw_column, code_column in code_names.items():
        labelled[code_column] = labelled[raw_column].cat.codes
    labelled.to_pickle(path+pickle_name)
    return labelled

label_separate = {'Title': 'Title_cat', 'Description': 'Desc_cat', 'URL': 'URL_cat'}
label_combined = {'text_desc_headline_url': 'text_desc_headline_url_cat'}

#base set, then base set comb
training_set_label = _label_encode(
    training_set, label_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_label.pkl')
training_set_label_comb = _label_encode(
    training_set_comb, label_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_comb_label.pkl')

#extended set, then extended set comb
training_set_label_adds = _label_encode(
    training_set_adds, label_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_label.pkl')
training_set_label_adds_comb = _label_encode(
    training_set_adds_comb, label_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_label.pkl')

#even set, then even set comb
training_set_label_even = _label_encode(
    training_set_even, label_separate, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_label.pkl')
training_set_label_even_comb = _label_encode(
    training_set_comb_even, label_combined, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_label.pkl')

In [None]:
#encode w/ CE's BackwardDifferenceEncoder, pickle

#one encoder per column selection; each is refitted on every set it is applied to
encoder_base = ce.BackwardDifferenceEncoder(cols=['Title', 'Description', 'URL'])
encoder_base_desc = ce.BackwardDifferenceEncoder(cols=['Description'])
encoder_comb = ce.BackwardDifferenceEncoder(cols=['text_desc_headline_url'])

def _backward_difference(source, encoder, pickle_name):
    """Fit `encoder` on a copy of `source`, drop the 'intercept' column, pickle and return the result."""
    encoded = encoder.fit_transform(source.copy(), verbose=1)
    encoded = encoded.drop(columns=['intercept'])
    encoded.to_pickle(path+pickle_name)
    return encoded

#base set
training_set_ce = _backward_difference(
    training_set, encoder_base, 'LOGREG_RELEVANCE/title_desc_url_trainingset_BD.pkl')
#base set desc only
training_set_ce_desc = _backward_difference(
    training_set, encoder_base_desc, 'LOGREG_RELEVANCE/desc_trainingset_BD_encode.pkl')
#extended set
training_set_adds_ce = _backward_difference(
    training_set_adds, encoder_base, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_BD.pkl')
#extended set desc only
training_set_adds_ce_desc = _backward_difference(
    training_set_adds, encoder_base_desc, 'LOGREG_RELEVANCE/desc_trainingset_extended_BD.pkl')
#base set comb
training_set_comb_ce = _backward_difference(
    training_set_comb, encoder_comb, 'LOGREG_RELEVANCE/title_desc_url_trainingset_comb_BD.pkl')
#extended set comb
training_set_adds_comb_ce = _backward_difference(
    training_set_adds_comb, encoder_comb, 'LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_BD.pkl')
#even set
training_set_ce_even = _backward_difference(
    training_set_even, encoder_base, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_BD.pkl')
#even set comb
training_set_ce_even_comb = _backward_difference(
    training_set_comb_even, encoder_comb, 'LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_BD.pkl')

In [2]:
#reload every pickled set so the cells above do not have to be rerun
def _load(relative_name):
    """Read a pickle stored under the project `path`."""
    return pd.read_pickle(path+relative_name)

#base sets
training_set = _load('LOGREG_RELEVANCE/title_desc_url_trainingset.pkl')
training_set_adds = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended.pkl')
training_set_even = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even.pkl')
training_set_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_comb.pkl')
training_set_adds_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_ext_comb.pkl')
training_set_comb_even = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb.pkl')

#ordinal sets
training_set_ordinal = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_ordinal.pkl')
training_set_ordinal_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_comb_ordinal.pkl')
training_set_ordinal_adds = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_ordinal.pkl')
training_set_ordinal_adds_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_ordinal.pkl')
training_set_ordinal_even = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_ordinal.pkl')
training_set_ordinal_even_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_ordinal.pkl')

#label sets
training_set_label = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_label.pkl')
training_set_label_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_comb_label.pkl')
training_set_label_adds = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_label.pkl')
training_set_label_adds_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_label.pkl')
training_set_label_even = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_label.pkl')
training_set_label_even_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_label.pkl')

#BD sets
training_set_ce = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_BD.pkl')
training_set_ce_desc = _load('LOGREG_RELEVANCE/desc_trainingset_BD_encode.pkl')
training_set_adds_ce = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_BD.pkl')
training_set_adds_ce_desc = _load('LOGREG_RELEVANCE/desc_trainingset_extended_BD.pkl')
training_set_comb_ce = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_comb_BD.pkl')
training_set_adds_comb_ce = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_extended_comb_BD.pkl')
training_set_ce_even = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_BD.pkl')
training_set_ce_even_comb = _load('LOGREG_RELEVANCE/title_desc_url_trainingset_even_comb_BD.pkl')

#prediction set
prediction_set = _load('LOGREG_RELEVANCE/base_prediction_set.pkl')

In [None]:
#encode w/ tf-idf

#one vectorizer, refitted on each corpus; max_df=0.95 ignores terms found in more than 95% of the documents
tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)

#fit_transform already returns the tf-idf matrix of the corpus it was fitted on, so a second
#transform() pass over the same texts is not needed

#base set description 
training_set_tfidf = tfidf_vectorizer.fit_transform(training_set['Description'].values)

#extended set description 
training_set_adds_tfidf = tfidf_vectorizer.fit_transform(training_set_adds['Description'].values)

#even set description 
training_set_tfidf_even = tfidf_vectorizer.fit_transform(training_set_even['Description'].values)

#base set comb
training_set_comb_tfidf = tfidf_vectorizer.fit_transform(training_set_comb['text_desc_headline_url'].values)

#extended set comb
training_set_adds_comb_tfidf = tfidf_vectorizer.fit_transform(training_set_adds_comb['text_desc_headline_url'].values)

#even set comb
training_set_even_comb_tfidf = tfidf_vectorizer.fit_transform(training_set_comb_even['text_desc_headline_url'].values)


In [3]:
#tfidf encoding w/ count vect: raw term counts first, then tf-idf weights on top of the counts
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

def _count_then_tfidf(texts):
    """Refit the shared vectorizer and transformer on `texts`; return (counts, tf-idf matrix)."""
    counts = count_vect.fit_transform(texts)
    return counts, tfidf_transformer.fit_transform(counts)

#description only: base, extended, even
base_train_counts, training_set_tfidf = _count_then_tfidf(training_set['Description'])
extended_train_counts, training_set_adds_tfidf = _count_then_tfidf(training_set_adds['Description'])
even_train_counts, training_set_tfidf_even = _count_then_tfidf(training_set_even['Description'])

#combined text: base, extended, even
base_comb_train_counts, training_set_comb_tfidf = _count_then_tfidf(training_set_comb['text_desc_headline_url'])
extended_comb_train_counts, training_set_adds_comb_tfidf = _count_then_tfidf(training_set_adds_comb['text_desc_headline_url'])
even_comb_train_counts, training_set_even_comb_tfidf = _count_then_tfidf(training_set_comb_even['text_desc_headline_url'])

In [5]:
#set features and targets for all encoding types 

#ORDINAL  
features_ord = ['Title_encoded', 'Desc_encoded', 'URL_encoded']
features_ord_comb = ['text_desc_headline_url_encoded']
single_feat = ['Desc_encoded']

def _ordinal_xy(frame, feature_names):
    """Split `frame` into (feature columns, Target column)."""
    return frame[feature_names], frame['Target']

#desc only variants keep just the encoded description and the target
ordinal_columns_to_discard = ['Title', 'URL', 'Title_encoded', 'Description', 'URL_encoded']
training_set_ordinal_desc = training_set_ordinal.drop(columns=ordinal_columns_to_discard)
training_set_ordinal_adds_desc = training_set_ordinal_adds.drop(columns=ordinal_columns_to_discard)

#base set / base set desc only
x_ord, y_ord = _ordinal_xy(training_set_ordinal, features_ord)
x_ord_desc, y_ord_desc = _ordinal_xy(training_set_ordinal_desc, single_feat)

#extended set / extended set desc only
x_ord_adds, y_ord_adds = _ordinal_xy(training_set_ordinal_adds, features_ord)
x_ord_adds_desc, y_ord_adds_desc = _ordinal_xy(training_set_ordinal_adds_desc, single_feat)

#base set comb / extended set comb
x_ord_comb, y_ord_comb = _ordinal_xy(training_set_ordinal_comb, features_ord_comb)
x_ord_adds_comb, y_ord_adds_comb = _ordinal_xy(training_set_ordinal_adds_comb, features_ord_comb)

#even set / even set comb
x_ord_even, y_ord_even = _ordinal_xy(training_set_ordinal_even, features_ord)
x_ord_even_comb, y_ord_even_comb = _ordinal_xy(training_set_ordinal_even_comb, features_ord_comb)

#LABEL 
features_label = ['Title_cat', 'Desc_cat', 'URL_cat']
features_label_comb = ['text_desc_headline_url_cat']
single_feat_label = ['Desc_cat']

def _label_xy(frame, feature_names):
    """Split `frame` into (feature columns, Target column)."""
    return frame[feature_names], frame['Target']

#desc only variants keep just the description codes and the target
label_columns_to_discard = ['Title', 'Description', 'URL', 'Title_cat', 'URL_cat']
training_set_label_2 = training_set_label.drop(columns=label_columns_to_discard)
training_set_label_adds_2 = training_set_label_adds.drop(columns=label_columns_to_discard)

#base set / base set desc only
x_label, y_label = _label_xy(training_set_label, features_label)
x_label_desc, y_label_desc = _label_xy(training_set_label_2, single_feat_label)

#extended set / extended set desc only
x_label_adds, y_label_adds = _label_xy(training_set_label_adds, features_label)
x_label_adds_desc, y_label_adds_desc = _label_xy(training_set_label_adds_2, single_feat_label)

#base set comb / extended set comb
x_label_comb, y_label_comb = _label_xy(training_set_label_comb, features_label_comb)
x_label_adds_comb, y_label_adds_comb = _label_xy(training_set_label_adds_comb, features_label_comb)

#even set / even set comb
x_label_even, y_label_even = _label_xy(training_set_label_even, features_label)
x_label_even_comb, y_label_even_comb = _label_xy(training_set_label_even_comb, features_label_comb)

#BACKWARDSDIFFERENCE

#Features are selected by dtype/name rather than by hard-coded column positions: positional slices
#only fit one particular snapshot of the data and, once the number of encoded columns changes,
#either leak Target into the features or silently drop encoded columns.
def _bd_xy(encoded):
    """Split a backward-difference encoded frame into (features, target).

    Features are every numeric column except Target; columns that were not encoded
    (raw text such as Title or URL in the desc-only sets) are non-numeric and left out.
    """
    features = encoded.drop(columns=['Target']).select_dtypes(include='number')
    return features, encoded['Target']

#base set 
x_backwards, y_backwards = _bd_xy(training_set_ce)

#base set desc only 
x_backwards_desc, y_backwards_desc = _bd_xy(training_set_ce_desc)

#extended set 
x_backwards_adds, y_backwards_adds = _bd_xy(training_set_adds_ce)

#extended set desc only 
x_backwards_adds_desc, y_backwards_adds_desc = _bd_xy(training_set_adds_ce_desc)

#base set comb 
x_backwards_comb, y_backwards_comb = _bd_xy(training_set_comb_ce)

#extended comb
x_backwards_adds_comb, y_backwards_adds_comb = _bd_xy(training_set_adds_comb_ce)

#even set 
x_backwards_even, y_backwards_even = _bd_xy(training_set_ce_even)

#even set comb
x_backwards_even_comb, y_backwards_even_comb = _bd_xy(training_set_ce_even_comb)

#TFIDF
#features are the sparse tf-idf matrices; the combined sets have the same rows as the sets with
#separate columns, so targets are taken from those

#base set / base set comb
x_tfidf, y_tfidf = training_set_tfidf, training_set['Target'].to_numpy()
x_tfidf_comb, y_tfidf_comb = training_set_comb_tfidf, training_set['Target'].to_numpy()

#extended set / extended set comb
x_adds_tfidf, y_adds_tfidf = training_set_adds_tfidf, training_set_adds['Target'].to_numpy()
x_adds_tfidf_comb, y_adds_tfidf_comb = training_set_adds_comb_tfidf, training_set_adds['Target'].to_numpy()

#even set / even set comb
x_adds_tfidf_even, y_adds_tfidf_even = training_set_tfidf_even, training_set_even['Target'].to_numpy()
x_adds_tfidf_even_comb, y_adds_tfidf_even_comb = training_set_even_comb_tfidf, training_set_even['Target'].to_numpy()

Part 3: 
- run all variations of the training set on two different types of LogReg evaluation: cross-validation and train/test split

In [6]:
#Log Reg with cross-validation
def lr(x,y,title,cv=5,scoring='precision'):
    """Cross-validate a logistic regression on (x, y) and print a summary.

    Parameters
    ----------
    x : feature matrix (DataFrame, array or sparse matrix)
    y : target labels, one per row of x
    title : str
        Heading printed above the results.
    cv : int, default 5
        Number of cross-validation folds, used for both the predictions and the scores.
    scoring : str, default 'precision'
        Name of the scikit-learn scorer whose mean over the folds is reported.

    Returns
    -------
    None
        The title, the mean fold score and the classification report are printed,
        so the call does not need to be wrapped in print().
    """
    model = LogisticRegression(solver='liblinear', C=10.0,random_state=44)
    #out-of-fold predictions for every row feed the classification report
    y_pred = cross_val_predict(model, x, y, cv=cv)
    fold_scores = cross_val_score(model, x, y, cv=cv, scoring=scoring)
    report = classification_report(y, y_pred)
    #with the default scoring the heading reads 'MEAN PRECISION'
    print(f'{title}\nMEAN {str(scoring).upper()}', np.mean(fold_scores), 'report:', report, sep='\n')

In [None]:
#Ordinal Encoding results with crossval
#lr() prints its own summary and returns None, so it is called directly: wrapping the calls in
#print() only appended a line of 'None' values to the output
for run_x, run_y, run_title in [
    (x_ord, y_ord, 'Base Set Ordinal:'),
    (x_ord_desc, y_ord_desc, 'Base Set Ordinal Desc Only:'),
    (x_ord_adds, y_ord_adds, 'Extended Set Ordinal:'),
    (x_ord_adds_desc, y_ord_adds_desc, 'Extended Set Ordinal Desc Only:'),
    (x_ord_comb, y_ord_comb, 'Base Set Ordinal Combined:'),
    (x_ord_adds_comb, y_ord_adds_comb, 'Extended Set Ordinal Combined:'),
    (x_ord_even, y_ord_even, 'Even Set Ordinal:'),
    (x_ord_even_comb, y_ord_even_comb, 'Even Set Combined Ordinal:'),
]:
    lr(run_x, run_y, run_title)

In [None]:
#Label Encoding results with crossval
#lr() prints its own summary and returns None, so it is called directly: wrapping the calls in
#print() only appended a line of 'None' values to the output
for run_x, run_y, run_title in [
    (x_label, y_label, 'Base Set Label:'),
    (x_label_desc, y_label_desc, 'Base Set Label Desc Only:'),
    (x_label_adds, y_label_adds, 'Extended Set Label:'),
    (x_label_adds_desc, y_label_adds_desc, 'Extended Set Label Desc Only:'),
    (x_label_comb, y_label_comb, 'Base Set Xtra Col Label:'),
    (x_label_adds_comb, y_label_adds_comb, 'Extended Set Xtra Col Label:'),
    (x_label_even, y_label_even, 'Even Set Label:'),
    (x_label_even_comb, y_label_even_comb, 'Even Set Combined Label:'),
]:
    lr(run_x, run_y, run_title)

In [None]:
#BD Encoding results with crossval
#lr() prints its own summary and returns None, so it is called directly: wrapping the calls in
#print() only appended a line of 'None' values to the output
for run_x, run_y, run_title in [
    (x_backwards, y_backwards, 'Base Set BD:'),
    (x_backwards_desc, y_backwards_desc, 'Base Set BD Desc Only:'),
    (x_backwards_adds, y_backwards_adds, 'Extended Set BD:'),
    #title fixed: this run uses the BD encoding, not the label encoding
    (x_backwards_adds_desc, y_backwards_adds_desc, 'Extended Set BD Desc Only:'),
    (x_backwards_comb, y_backwards_comb, 'Base Set BD Combined:'),
    (x_backwards_adds_comb, y_backwards_adds_comb, 'Extended Set BD Combined:'),
    (x_backwards_even, y_backwards_even, 'Even Set BD:'),
    (x_backwards_even_comb, y_backwards_even_comb, 'Even Set Combined BD:'),
]:
    lr(run_x, run_y, run_title)

In [None]:
#Tf-Idf Encoding results with crossval
#lr() prints its own summary and returns None, so it is called directly: wrapping the calls in
#print() only appended a line of 'None' values to the output
for run_x, run_y, run_title in [
    (x_tfidf, y_tfidf, 'Base Set TF:'),
    (x_adds_tfidf, y_adds_tfidf, 'Extended Set TF:'),
    (x_tfidf_comb, y_tfidf_comb, 'Base Set Xtra Col TF URL:'),
    (x_adds_tfidf_comb, y_adds_tfidf_comb, 'Extended Set Xtra Col TF URL:'),
    #titles fixed: the even runs below use the tf-idf features, not the BD encoding
    (x_adds_tfidf_even, y_adds_tfidf_even, 'Even Set TF:'),
    (x_adds_tfidf_even_comb, y_adds_tfidf_even_comb, 'Even Set Combined TF:'),
]:
    lr(run_x, run_y, run_title)

In [7]:
#LogReg train/test 

# import required modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib inline

def lr_training(x,y,title,test_size=0.33):
    """Fit a logistic regression on a train/test split, plot its confusion matrix and print scores.

    Parameters
    ----------
    x : feature matrix (DataFrame, array or sparse matrix)
    y : binary target labels (0/1), one per row of x
    title : str
        Heading used in the printed summary and in the plot title.
    test_size : float, default 0.33
        Share of the rows held out for testing (split with random_state=0).

    Returns
    -------
    None
        Accuracy, precision and recall on the held-out rows are printed and a
        confusion-matrix heatmap is drawn on a new figure.
    """
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=test_size,random_state=0)
    logreg = LogisticRegression(solver='liblinear')
    # fit the model with data
    logreg.fit(x_train,y_train)
    y_pred=logreg.predict(x_test)
    class_names=[0,1] # name  of classes
    # labels= keeps the matrix 2x2 even when a class is absent from the test split
    cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)
    # create heatmap for cfn matrix on an explicit axes; the frame's index/columns provide the tick labels
    fig, ax = plt.subplots()
    sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names), annot=True, cmap="YlGnBu" ,fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    fig.tight_layout()
    ax.set_title(f'Confusion matrix:{title}', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    print(f'{title}\n' "Accuracy:", accuracy_score(y_test, y_pred), "\nPrecision:", precision_score(y_test, y_pred), "\nRecall:", recall_score(y_test, y_pred))

In [None]:
#Ordinal encoding results w/ train/test
# lr_training prints its own report and returns None, so it is called once per
# variant rather than wrapped in print() (which would only add a row of "None").
ordinal_runs = [
    (x_ord, y_ord, 'Base Set Ordinal:'),
    (x_ord_desc, y_ord_desc, 'Base Set Ordinal Desc Only:'),
    (x_ord_adds, y_ord_adds, 'Extended Set Ordinal:'),
    (x_ord_adds_desc, y_ord_adds_desc, 'Extended Set Ordinal Desc Only:'),
    (x_ord_comb, y_ord_comb, 'Base Set Ordinal Combined:'),
    (x_ord_adds_comb, y_ord_adds_comb, 'Extended Set Ordinal Combined:'),
    (x_ord_even, y_ord_even, 'Even Set Ordinal:'),
    (x_ord_even_comb, y_ord_even_comb, 'Even Set Combined Ordinal:'),
]
for x_run, y_run, run_title in ordinal_runs:
    lr_training(x_run, y_run, run_title)

In [None]:
#Label Encoding results w/ train/test
# lr_training prints its own report and returns None, so it is called once per
# variant rather than wrapped in print() (which would only add a row of "None").
label_runs = [
    (x_label, y_label, 'Base Set Label:'),
    (x_label_desc, y_label_desc, 'Base Set Label Desc Only:'),
    (x_label_adds, y_label_adds, 'Extended Set Label:'),
    (x_label_adds_desc, y_label_adds_desc, 'Extended Set Label Desc Only:'),
    (x_label_comb, y_label_comb, 'Base Set Xtra Col Label:'),
    (x_label_adds_comb, y_label_adds_comb, 'Extended Set Xtra Col Label:'),
    (x_label_even, y_label_even, 'Even Set Label:'),
    (x_label_even_comb, y_label_even_comb, 'Even Set Combined Label:'),
]
for x_run, y_run, run_title in label_runs:
    lr_training(x_run, y_run, run_title)

In [None]:
#BD Encoding results w/ train/test
# lr_training prints its own report and returns None, so it is called once per
# variant rather than wrapped in print() (which would only add a row of "None").
bd_runs = [
    (x_backwards, y_backwards, 'Base Set BD:'),
    (x_backwards_desc, y_backwards_desc, 'Base Set BD Desc Only:'),
    (x_backwards_adds, y_backwards_adds, 'Extended Set BD:'),
    (x_backwards_adds_desc, y_backwards_adds_desc, 'Extended Set BD Desc Only:'),
    (x_backwards_comb, y_backwards_comb, 'Base Set BD Combined:'),
    (x_backwards_adds_comb, y_backwards_adds_comb, 'Extended Set BD Combined:'),
    (x_backwards_even, y_backwards_even, 'Even Set BD:'),
    (x_backwards_even_comb, y_backwards_even_comb, 'Even Set Combined BD:'),
]
for x_run, y_run, run_title in bd_runs:
    lr_training(x_run, y_run, run_title)

In [None]:
#Tf-Idf Encoding results w/ train/test
# lr_training prints its own report and returns None, so it is called once per
# variant rather than wrapped in print() (which would only add a row of "None").
tfidf_runs = [
    (x_tfidf, y_tfidf, 'Base Set TF:'),
    (x_adds_tfidf, y_adds_tfidf, 'Extended Set TF:'),
    (x_tfidf_comb, y_tfidf_comb, 'Base Set Xtra Col TF URL:'),
    (x_adds_tfidf_comb, y_adds_tfidf_comb, 'Extended Set Xtra Col TF URL:'),
    (x_adds_tfidf_even, y_adds_tfidf_even, 'Even Set TF:'),
    (x_adds_tfidf_even_comb, y_adds_tfidf_even_comb, 'Even Set Combined TF:'),
]
for x_run, y_run, run_title in tfidf_runs:
    lr_training(x_run, y_run, run_title)

Part 4: 
- save model
- test on base predictions

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Encoders: raw token counts first, then tf-idf weighting of those counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Encode the even training set's descriptions and collect its labels.
X_train_even = count_vect.fit_transform(training_set_even['Description'])
X_train_tfidf_even = tfidf_transformer.fit_transform(X_train_even)
Y_train_tfidf_even = training_set_even['Target'].values

# Train the classifier on the encoded even set (fit returns the estimator itself).
model = LogisticRegression(
    solver='liblinear',
    C=10.0,
    random_state=44,
).fit(X_train_tfidf_even, Y_train_tfidf_even)

# Encode the base prediction set with the already-fitted encoders and classify it.
prediction_set = pd.read_pickle(path + 'LOGREG_RELEVANCE/base_prediction_set.pkl')
X_new_counts = count_vect.transform(prediction_set['Description'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = model.predict(X_new_tfidf)


In [10]:
# Show each prediction-set title (as its repr) next to the class predicted for it.
for title_text, label in zip(prediction_set['Title'], predicted):
    print(f'{title_text!r} => {label}')

'chordify annotator subjectivity dataset' => 1
'the acousticbrainz genre dataset' => 1
'aligned scores and performances (asap) dataset' => 1
'ballroom beat and bar annotations' => 1
'welcome to the dali dataset: a large dataset of synchronised audio, lyrics and vocal notes. news: tutorial: 2- getting the audio. 3- working with dali. 4- correcting annotations.' => 1
'da-tacos' => 1
'desed dataset' => 1
'fma: a dataset for music analysis' => 1
'guitarsolodetection' => 1
'the harmonix set' => 1
'musooevaluator' => 1
"master's thesis data" => 1
'workflow and guidelines for corpus creation' => 1
'm-djcue' => 1
'mastmelody_dataset' => 1
'musical onset database and library (modal)' => 1
'the mtg-jamendo dataset' => 1
'the nes music database' => 1
'seils dataset' => 1
'african american historical newspapers online' => 0
'zappa books' => 0
'folkstreams' => 0
'the kirby collection' => 0
'ethnologisches museum' => 0
'phonogrammarchiv' => 0
'the archive of folk culture' => 0
'american radio archiv

In [None]:
#re encode models w/ tfidf 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def _fit_tfidf(frame):
    """Fit a fresh count vectorizer and tf-idf transformer on ``frame['Description']``.

    Each training set gets its own encoder pair, so a fitted vocabulary is never
    overwritten by a later set and always matches the matrix it produced.

    Returns
    -------
    tuple
        ``(vectorizer, transformer, count_matrix, tfidf_matrix, targets)`` where
        ``targets`` is ``frame['Target'].values``.
    """
    vect = CountVectorizer()
    transformer = TfidfTransformer()
    counts = vect.fit_transform(frame['Description'])
    tfidf = transformer.fit_transform(counts)
    return vect, transformer, counts, tfidf, frame['Target'].values

#base set
count_vect_base, tfidf_transformer_base, X_train_base, X_train_tfidf_base, Y_train_tfidf_base = _fit_tfidf(training_set)

#extended set
count_vect_extended, tfidf_transformer_extended, X_train_extended, X_train_tfidf_extended, Y_train_tfidf_extended = _fit_tfidf(training_set_adds)

#even set
# `count_vect` / `tfidf_transformer` are bound to the even-set encoders: the model
# below is trained on X_train_tfidf_even and new data is encoded through these
# two names, so their vocabulary must be the even set's.
count_vect, tfidf_transformer, X_train_even, X_train_tfidf_even, Y_train_tfidf_even = _fit_tfidf(training_set_even)

In [None]:
#re encode models w/ BD

def _encode_bd(frame):
    """Backward-difference encode Title/Description/URL on a copy of ``frame``.

    A fresh encoder is fitted per call so each training set keeps an encoder
    that matches its own encoded columns.

    Returns
    -------
    tuple
        ``(fitted_encoder, encoded_frame)`` with the encoder's ``intercept``
        column removed from the encoded frame.
    """
    encoder = ce.BackwardDifferenceEncoder(cols=['Title', 'Description', 'URL'])
    encoded = encoder.fit_transform(frame.copy(), verbose=1)
    return encoder, encoded.drop(['intercept'], axis=1)

# NOTE(review): the hard-coded column counts below presumably select every
# encoded feature column and leave Target out — confirm against each frame's shape.

#base set
# `encoder_base` stays fitted on the base set only; it is reused later to encode
# the prediction set, which is then sliced with the same base-set width (1825).
encoder_base, base_model_ce = _encode_bd(training_set)
x_train_bd_base = base_model_ce.iloc[:,0:1825]
y_train_bd_base = base_model_ce.Target

#extended set
encoder_extended, extended_model_ce = _encode_bd(training_set_adds)
x_train_bd_extended = extended_model_ce.iloc[:,0:2293]
y_train_bd_extended = extended_model_ce.Target

#even set
encoder_even, even_model_ce = _encode_bd(training_set_even)
# Stored under its own name so the extended-set features are not overwritten.
x_train_bd_even = even_model_ce.iloc[:,0:1575]
y_train_bd_even = even_model_ce.Target

In [None]:
# Fit a LogReg model on the tf-idf encoded even set.
# NOTE(review): nothing is cross-validated or saved in this cell as it stands —
# the train/test split and the pickle export below are commented out.

# Disabled train/test split.
# NOTE(review): it passes X_train_tfidf as both features and targets, and that
# name is not defined in the visible cells — fix before re-enabling.
#test_size = 0.33
#seed = 7
#X_train, X_test, Y_train, Y_test = train_test_split(X_train_tfidf, X_train_tfidf, test_size=test_size, random_state=seed)

# Fit the model on training set
model = LogisticRegression(solver='liblinear', C=10.0,random_state=44)
model.fit(X_train_tfidf_even, Y_train_tfidf_even)
# save the model to disk
# NOTE(review): the disabled export dumps `model_tfidf_even`, but the fitted
# estimator above is named `model` — adjust before re-enabling.
#filename_even = 'crossval_tfidf_even.sav'
#pickle.dump(model_tfidf_even, open(filename_even, 'wb'))

In [None]:
# Load the base prediction set and encode its descriptions with the already
# fitted count vectorizer and tf-idf transformer (transform only, no refit).
prediction_set = pd.read_pickle(path+'LOGREG_RELEVANCE/base_prediction_set.pkl')
X_new_counts = count_vect.transform(prediction_set['Description'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [None]:
# Second prediction set: normalise names, drop rows whose name already appears
# as a title in the even training set, drop rows with missing values, then
# encode the descriptions with the fitted encoders.
prediction_2 = (
    pd.read_pickle(path + 'GH_PICKLES/music_archive.pkl')
    .assign(Name=lambda frame: frame['Name'].str.lower().str.strip())
    .loc[lambda frame: ~frame['Name'].isin(training_set_even['Title'])]
    .dropna()
)
X_new_counts_2 = count_vect.transform(prediction_2['Description'])
X_new_tfidf_2 = tfidf_transformer.transform(X_new_counts_2)

In [None]:
# Encode the prediction set with the fitted BD encoder and discard the
# encoder's intercept column. An empty 'Dummy' column is appended first —
# presumably to stand in for the Target column the encoder saw when it was
# fitted; confirm.
prediction_set_ce = encoder_base.transform(prediction_set.assign(Dummy=''))
prediction_set_ce = prediction_set_ce.drop(columns=['intercept'])

In [None]:
# Keep the first 1825 encoded columns of the prediction set.
# NOTE(review): 1825 matches the width used for the base-set BD features —
# confirm the encoder used above was fitted on the base set.
X_new_backwards = prediction_set_ce.iloc[:,0:1825]

In [None]:
# Classify the tf-idf encoded base prediction set with the fitted model.
predicted = model.predict(X_new_tfidf)

In [None]:
# NOTE(review): `predicted` holds the labels computed for X_new_tfidf (the base
# prediction set), while the features passed here are X_new_tfidf_2 (built from a
# different pickle) — the two presumably differ in length; confirm what `lr`
# should receive as targets.
lr(X_new_tfidf_2, predicted, 'Test')

In [None]:
# Predict on the second prediction set itself: `predicted` belongs to the base
# prediction set, so zipping it with prediction_2's names would pair names with
# labels computed for unrelated rows.
predicted_2 = model.predict(X_new_tfidf_2)
for doc, target in zip(prediction_2['Name'], predicted_2):
    print('%r => %s' % (doc, target))

Part 5: 
- The mysteries of Twitter 

In [None]:
# Negative examples: tweets from the two MJI bigram searches, labelled '0'.
twitter_neg = pd.read_pickle(
    path + 'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_magazine.pkl'
).assign(Target='0')
twitter_neg_2 = pd.read_pickle(
    path + 'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_oral_history.pkl'
).assign(Target='0')
# Stack both negative frames, keeping only the tweet text and its label.
twitter_neg = pd.concat(
    [frame[['tweet', 'Target']] for frame in (twitter_neg, twitter_neg_2)]
)

# Positive examples: tweets from the musoW bigram search, labelled '1'.
twitter_pos = pd.read_pickle(
    path + 'TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_music_archive.pkl'
).assign(Target='1')[['tweet', 'Target']]
twitter_pos

In [None]:
# Combine positives and negatives, cast the string labels to integers and
# renumber the rows from zero.
twitter_set = (
    pd.concat([twitter_pos, twitter_neg])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

In [None]:
#base set
# Backward-difference encode the tweet text on a copy of the combined set, then
# discard the intercept column the encoder adds.
# Note: this rebinds `encoder_base` to an encoder fitted on the tweets.
encoder_base = ce.BackwardDifferenceEncoder(cols=['tweet'])
twitter_set_ce = (
    encoder_base
    .fit_transform(twitter_set.copy(), verbose=1)
    .drop(columns=['intercept'])
)

In [None]:
# Display the backward-difference encoded tweet frame for inspection.
twitter_set_ce

In [None]:
# Features are every encoded column except the label. Selecting by name keeps
# Target out of X no matter how many columns the encoder produced, whereas a
# fixed positional width silently goes stale when the tweet corpus changes.
x_twit_bd = twitter_set_ce.drop(columns=['Target'])
y_twit_bd = twitter_set_ce.Target

In [None]:
# Train/test evaluation on the BD-encoded tweets: lr_training prints the
# metrics and draws the confusion matrix.
lr_training(x_twit_bd, y_twit_bd, 'BD')

In [None]:
# Tf-idf encode the tweets. fit_transform already returns the encoded matrix
# for the data it was fitted on, so no second transform pass is needed.
# Note: this refits `tfidf_vectorizer` (defined earlier in the notebook;
# presumably a TfidfVectorizer) on the tweet corpus.
twitter_idf = twitter_set.copy()
twitter_set_tfidf = tfidf_vectorizer.fit_transform(twitter_idf['tweet'].values)
x_twit_idf = twitter_set_tfidf
y_twit_idf = twitter_idf['Target'].values

In [None]:
# Train/test evaluation on the tf-idf encoded tweets: lr_training prints the
# metrics and draws the confusion matrix.
lr_training(x_twit_idf, y_twit_idf, 'IDF')