TO DO:


In [18]:
import pandas as pd
import json
import re
import numpy as np
import logging
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from pandas_profiling import ProfileReport


In [2]:
# Load the MJI CSV with every column typed as string, then keep a normalised
# (lower-cased, whitespace-trimmed) copy of the three columns used downstream.
# keep_default_na=False turns off pandas' default NA markers when parsing.
df_mji = pd.read_csv(
    '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/MJI/MJI_data.csv',
    keep_default_na=False,
    dtype='string',
)
df_mji_small = pd.DataFrame({
    column: df_mji[column].str.lower().str.strip()
    for column in ['Title', 'Description', 'URL']
})

In [3]:
# Load the musow JSON dump and collect the trimmed, lower-cased name,
# description and URL of every entry under results -> bindings.
with open('/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/MUSOW/musow_name_desc_url_cat.json') as file:
    data = json.load(file)

musow_names, musow_desc, musow_url = [], [], []
for result in data['results']['bindings']:
    musow_names.append(result['name']['value'].strip().lower())
    musow_desc.append(result['description']['value'].strip().lower())
    musow_url.append(result['url']['value'].strip().lower())

# Same column names as the MJI frame so the two sources can be concatenated later.
df_musow = pd.DataFrame(
    {'Title': musow_names, 'Description': musow_desc, 'URL': musow_url}
).astype('string')

In [4]:
# Drop every MJI row whose title also appears in musow, then drop rows with NA values.
title_in_musow = df_mji_small['Title'].isin(df_musow['Title'])
mji_training_set = df_mji_small.loc[~title_in_musow].dropna()

In [5]:
# Base training sets without additions: musow rows are the positive class ('1'),
# the de-duplicated MJI rows are the negative class ('0').
# assign() returns a new frame, so the source frames are left untouched.
positive_df = df_musow.assign(Target='1')
negative_df = mji_training_set.assign(Target='0')

In [9]:
# Positive and negative sets with additions.

# Positives: musow plus the ISMIR rows whose title is not already in musow.
# NOTE(review): read_pickle can execute code embedded in the file; only load trusted pickles.
ismir_df = pd.read_pickle('/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/GH_PICKLES/ismir.pkl')
ismir_df = ismir_df.loc[~ismir_df['Title'].isin(df_musow['Title'])].dropna()
positive_df_adds = (
    pd.concat([df_musow, ismir_df], ignore_index=True)
    .drop_duplicates(['Title'], keep='last')
    .assign(Target='1')
)

# Negatives: the MJI training rows plus the hand-picked MJI additions,
# normalised the same way (lower-cased, trimmed) before titles are compared.
mji_additions_1 = pd.read_csv('/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/MJI/MJI_additions_for_LR.csv')
for column in ['Title', 'Description', 'URL']:
    mji_additions_1[column] = mji_additions_1[column].str.lower().str.strip()
mji_additions_1 = mji_additions_1.loc[~mji_additions_1['Title'].isin(df_musow['Title'])].dropna()
negative_df_adds = (
    pd.concat([mji_training_set, mji_additions_1], ignore_index=True)
    .drop_duplicates(['Title'], keep='last')
    .assign(Target='0')
)

In [10]:
# Stack positives on top of negatives for each variant, renumber the rows
# from zero and convert the string Target labels to integers.
training_set = (
    pd.concat([positive_df, negative_df], ignore_index=True)
    .astype({'Target': 'int'})
)
training_set_adds = (
    pd.concat([positive_df_adds, negative_df_adds], ignore_index=True)
    .astype({'Target': 'int'})
)

In [16]:
#create combined columns for desc+headline and desc+headline_url
def tokenize_url(url:str) -> str:
    """Turn a URL into a space-separated string of its word-like parts.

    Every occurrence of "https", "http" and "www" is removed (anywhere in the
    string, not only at the start). Each run of non-word characters and/or
    underscores is then collapsed into a single space.

    Args:
        url: The URL text to tokenize.

    Returns:
        The tokenized text. It can start or end with a space when the URL
        starts or ends with separator characters (e.g. "://").
    """
    # Order matters: "https" must go before "http", otherwise a stray "s" is left behind.
    for fragment in ("https", "http", "www"):
        url = url.replace(fragment, "")
    # Raw string: "\W" inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r"(\W|_)+", " ", url)

# Add the derived text columns to both training sets (the frames are modified in place):
#   tokenized_url          - the URL split into words by tokenize_url
#   text_desc_headline     - description + title
#   text_desc_headline_url - description + title + tokenized URL
for frame in (training_set, training_set_adds):
    frame['tokenized_url'] = frame['URL'].apply(tokenize_url)
    desc_and_title = frame['Description'] + ' ' + frame['Title']
    frame['text_desc_headline'] = desc_and_title
    frame['text_desc_headline_url'] = desc_and_title + " " + frame['tokenized_url']


In [None]:
# Show the augmented training set as the cell output for a visual sanity check.
training_set_adds

In [21]:
#encode categorical variables w/OneHotEncoder

oe_style = OneHotEncoder() 
# Not used by the encoding below, which only one-hot encodes "Title".
feature_cols_onehot = ['Title', 'Description', 'URL']
oe_results = oe_style.fit_transform(training_set[["Title"]])
# categories_ is a list with one array per encoded feature. Only "Title" is
# encoded, so take its array to get flat column labels; passing the whole
# list would build a one-level MultiIndex instead.
pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_[0]).head()

Unnamed: 0,19th century california sheet music,19th-century american sheet music,312 soul,a corpus study of rock music,a-r editions' online music anthology,aaron copland collection,abc - the annotated beethoven corpus,abc notation,acoustiid api,acrcloud,...,womxn who rock digital oral history archive,world digital library,world radio history,world radio history archive,world war i sheet music,wsm radio,wwu music score portal,yale classical archives corpus,yale’s oral history of american music,yiddish sheet music
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#encode categorical variables w/ OrdinalEncoder
# Source text column -> name of the column that receives its ordinal codes.
ordinal_targets = {
    "Title": "Title_encoded",
    "Description": "Desc_encoded",
    "URL": "URL_encoded",
    "text_desc_headline": "text_desc_headline_encoded",
    "text_desc_headline_url": "text_desc_headline_url_encoded",
}

training_set_ordinal = training_set.copy()
ord_enc = OrdinalEncoder()
for source_col, encoded_col in ordinal_targets.items():
    # fit_transform re-fits the encoder on each column, so codes are assigned per column.
    training_set_ordinal[encoded_col] = ord_enc.fit_transform(training_set_ordinal[[source_col]])

training_set_ordinal_adds = training_set_adds.copy()
ord_enc = OrdinalEncoder()
for source_col, encoded_col in ordinal_targets.items():
    training_set_ordinal_adds[encoded_col] = ord_enc.fit_transform(training_set_ordinal_adds[[source_col]])

In [10]:
#encode categorical variables with label encoding (pandas category codes)
# Source text column -> name of the column that receives its category codes.
label_targets = {
    "Title": "Title_cat",
    "Description": "Desc_cat",
    "URL": "URL_cat",
    "text_desc_headline": "text_desc_headline_cat",
    "text_desc_headline_url": "text_desc_headline_url_cat",
}

training_set_label = training_set.copy()
# First convert the text columns to the category dtype...
for source_col in label_targets:
    training_set_label[source_col] = training_set_label[source_col].astype('category')
# ...then store each column's integer category codes alongside it.
for source_col, code_col in label_targets.items():
    training_set_label[code_col] = training_set_label[source_col].cat.codes

training_set_label_adds = training_set_adds.copy()
for source_col in label_targets:
    training_set_label_adds[source_col] = training_set_label_adds[source_col].astype('category')
for source_col, code_col in label_targets.items():
    training_set_label_adds[code_col] = training_set_label_adds[source_col].cat.codes

In [30]:
#select features and targets
# Naming: x_* are feature frames, y_* the matching Target series.
#   *_ord   -> OrdinalEncoder columns, *_label -> category-code columns
#   *_comb  -> also includes the two combined-text columns
#   *_adds  -> built from the training set with additions

# Ordinal encoding, base columns only
feature_cols_ord = ['Title_encoded', 'Desc_encoded', 'URL_encoded']
x_ord = training_set_ordinal[feature_cols_ord] # Features
y_ord = training_set_ordinal.Target # Target variable

x_ord_adds = training_set_ordinal_adds[feature_cols_ord] # Features
y_ord_adds = training_set_ordinal_adds.Target # Target variable

# Ordinal encoding, base + combined-text columns
feature_cols_ord_comb = ['Title_encoded', 'Desc_encoded', 'URL_encoded', 'text_desc_headline_encoded', 'text_desc_headline_url_encoded']
x_ord_comb = training_set_ordinal[feature_cols_ord_comb] # Features
y_ord_comb = training_set_ordinal.Target # Target variable 

x_ord_comb_adds = training_set_ordinal_adds[feature_cols_ord_comb] # Features
y_ord_comb_adds = training_set_ordinal_adds.Target # Target variable 

# Label encoding, base columns only
feature_cols_label = ['Title_cat', 'Desc_cat', 'URL_cat']
x_label = training_set_label[feature_cols_label] # Features
y_label = training_set_label.Target # Target variable

x_label_adds = training_set_label_adds[feature_cols_label] # Features
y_label_adds = training_set_label_adds.Target # Target variable

# Label encoding, base + combined-text columns
feature_cols_label_comb = ['Title_cat', 'Desc_cat', 'URL_cat', 'text_desc_headline_cat', 'text_desc_headline_url_cat']
# Fixed: this previously selected feature_cols_label, which left the combined
# columns out and made x_label_comb identical to x_label.
x_label_comb = training_set_label[feature_cols_label_comb] # Features
y_label_comb = training_set_label.Target # Target variable

x_label_comb_adds = training_set_label_adds[feature_cols_label_comb] # Features
y_label_comb_adds = training_set_label_adds.Target # Target variable

In [12]:
#marilena's version
def lr(x,y):
    """Cross-validate a logistic regression, print its scores, return the predictions.

    Runs 10-fold cross-validation twice on the same untrained estimator: once
    to get a prediction for every row, once to get the per-fold precision.
    Prints the mean of the fold precisions followed by a classification
    report comparing `y` with the cross-validated predictions.

    Args:
        x: Feature matrix.
        y: Target labels aligned with `x`.

    Returns:
        The cross-validated predictions for every row of `x`.
    """
    classifier = LogisticRegression(solver='liblinear', C=10.0, random_state=44)
    predictions = cross_val_predict(classifier, x, y, cv=10)
    fold_precisions = cross_val_score(classifier, x, y, cv=10, scoring='precision')
    print('MEAN PRECISION', np.mean(fold_precisions))
    print('report:', classification_report(y, predictions), sep='\n')
    return predictions

In [31]:
# Cross-validate on the combined ordinal features (set with additions) and on the
# base ordinal features (set without additions), then print both prediction arrays.
preds_ord_comb_adds = lr(x_ord_comb_adds, y_ord_comb_adds)
preds_ord = lr(x_ord, y_ord)
print(preds_ord_comb_adds, preds_ord)

MEAN PRECISION 0.7325974028590527
report:
              precision    recall  f1-score   support

           0       0.53      0.39      0.45       250
           1       0.74      0.83      0.78       514

    accuracy                           0.69       764
   macro avg       0.63      0.61      0.61       764
weighted avg       0.67      0.69      0.67       764

MEAN PRECISION 0.7930343559137847
report:
              precision    recall  f1-score   support

           0       0.12      0.02      0.03       128
           1       0.79      0.97      0.87       498

    accuracy                           0.78       626
   macro avg       0.46      0.49      0.45       626
weighted avg       0.66      0.78      0.70       626

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1
 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 

In [18]:
#training test version

def lr_training(x,y):
    """Fit a logistic regression on a 75/25 split and report hold-out metrics.

    Splits the data with a fixed random_state of 0, fits a default
    LogisticRegression on the training part, and prints accuracy, precision
    and recall measured on the held-out part.

    Args:
        x: Feature matrix.
        y: Target labels aligned with `x`.

    Returns:
        The confusion matrix of the held-out labels versus the predictions.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=0
    )
    classifier = LogisticRegression().fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    cnf_matrix = confusion_matrix(y_test, predictions)
    metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
    )
    for label, score in metrics:
        print(label, score(y_test, predictions))
    return cnf_matrix

In [22]:
# Hold-out evaluation on the base ordinal features, with and without additions;
# the two confusion matrices are printed after the metrics.
cnf_ord_adds = lr_training(x_ord_adds, y_ord_adds)
cnf_ord = lr_training(x_ord, y_ord)
print(cnf_ord_adds, cnf_ord)

Accuracy: 0.743455497382199
Precision: 0.7941176470588235
Recall: 0.8372093023255814
Accuracy: 0.7834394904458599
Precision: 0.7973856209150327
Recall: 0.976
[[ 34  28]
 [ 21 108]] [[  1  31]
 [  3 122]]
