In [1]:
# USED TO GENERATE THE MODEL FOR THE BOOK COMPARER CLASS, and
# USED TO GENERATE THE CROSSWALK BETWEEN THE NYT AND THE GOODREADS/ SPL

In [4]:
import pandas as pd
from clean_data.book_cleaner import BookCleaner
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
import jellyfish as jf
import pylcs
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

simplefilter("ignore", category=ConvergenceWarning)
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
df = pd.read_csv('training_set.csv')
bc = BookCleaner(df)
cdf = bc.get_clean_df()

In [3]:
cdf.head(5)

Unnamed: 0,is_match,publish_year_a,isbn_a,publish_year_b,isbn_b,cleaned_author_a,cleaned_author_b,cleaned_title_a,cleaned_title_b,cleaned_publisher_a,cleaned_publisher_b,removed_common_title_a,removed_common_title_b,removed_all_title_a,removed_all_title_b,removed_common_publisher_a,removed_common_publisher_b,removed_all_publisher_a,removed_all_publisher_b
0,1,2013,0399159347,2020,0425274861,lianemoriarty,lianemoriarty,the husbands secret,liane moriarty collection 8 books set the hypn...,amy einhorn books published by g p putnams son...,penguin,husbandssecret,lianemoriartycollection8bookssethypnotistslove...,husbandssecret,lianemoriartycollection8ssethypnotistslovestor...,amyeinhornbookspublishedgpputnamssonsmemberpen...,penguin,amyeinhornspublishedgpputnamssonspenguingroupu...,penguin
1,1,2015,1594633665,2015,1594633665,paulahawkins,paulahawkins,the girl on the train,the girl on the train,riverhead books a member of penguin group usa,riverhead books,girlontrain,girlontrain,girlontrain,girlontrain,riverheadbooksmemberpenguingroupusa,riverheadbooks,riverheadspenguingroupusa,riverheads
2,1,2006,074324754X,2006,074324754X,jeannettewalls,jeannettewalls,the glass castle a memoir,the glass castle a memoir,scribner,scribner,glasscastlememoir,glasscastlememoir,glasscastle,glasscastle,scribner,scribner,scribner,scribner
3,1,2013,0385537859,2016,1101972971,danbrown,danbrown,inferno a novel,inferno movie tiein edition robert langdon,doubleday,anchor,infernonovel,infernomovietieineditionrobertlangdon,inferno,infernotieinrobertlangdon,doubleday,anchor,doubleday,anchor
4,1,2011,030788743X,2012,0307887448,ernestcline,ernestcline,ready player one,ready player one a novel,crown publishers,ballantine books,readyplayerone,readyplayeronenovel,readyplayerone,readyplayerone,crownpublishers,ballantinebooks,crowns,ballantines


In [4]:
# Goal: Create a df of all numeric values only

def gen_numeric_df(cdf):
    df = cdf.copy()

    colHeaders = ['cleaned_author_', 'cleaned_title_', 'cleaned_publisher_', 'removed_common_title_', \
                  'removed_all_title_', 'removed_common_publisher_', 'removed_all_publisher_']

    numeric_df = df[['is_match']]

    numeric_df['isbn_a'] = df['isbn_a']
    numeric_df['isbn_b'] = df['isbn_b']

    numeric_df['publish_year_delta'] = abs(df['publish_year_b'] - df['publish_year_a'])

    for col in colHeaders:
        col_a = col + 'a'
        col_b = col + 'b'

        print('Generating for col', col, flush=True, end='\r')
        numeric_df[col+'levenshtein'] = df.apply(lambda row : jf.levenshtein_distance(row[col_a], row[col_b]), axis = 1)
        numeric_df[col+'damerau'] = df.apply(lambda row : jf.damerau_levenshtein_distance(row[col_a], row[col_b]), axis = 1)
        numeric_df[col+'hamming'] = df.apply(lambda row : jf.hamming_distance(row[col_a], row[col_b]), axis = 1)
        numeric_df[col+'jaro'] = df.apply(lambda row : jf.jaro_similarity(row[col_a], row[col_b]), axis = 1)
        numeric_df[col+'jaro_winkler'] = df.apply(lambda row : jf.jaro_winkler_similarity(row[col_a], row[col_b]), axis = 1)

        numeric_df[col+'lcs_seq_len'] = df.apply(lambda row : pylcs.lcs_sequence_length(row[col_a], row[col_b]), axis = 1)
        numeric_df[col+'edit_dist'] = df.apply(lambda row : pylcs.edit_distance(row[col_a], row[col_b]), axis = 1)
        
    return numeric_df

In [5]:
n_df = gen_numeric_df(cdf)

Generating for col removed_all_publisher_er_

In [6]:
n_df = n_df * 1 # NECESSARY! Turns bool into int

In [7]:
n_df.head(3)

Unnamed: 0,is_match,isbn_a,isbn_b,publish_year_delta,cleaned_author_levenshtein,cleaned_author_damerau,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_jaro_winkler,cleaned_author_lcs_seq_len,...,removed_common_publisher_jaro_winkler,removed_common_publisher_lcs_seq_len,removed_common_publisher_edit_dist,removed_all_publisher_levenshtein,removed_all_publisher_damerau,removed_all_publisher_hamming,removed_all_publisher_jaro,removed_all_publisher_jaro_winkler,removed_all_publisher_lcs_seq_len,removed_all_publisher_edit_dist
0,1,0399159347,0425274861,7,0,0,0,1.0,1.0,13,...,0.562698,7,53,43,43,50,0.570476,0.570476,7,43
1,1,1594633665,1594633665,0,0,0,0,1.0,1.0,12,...,0.88,14,21,15,15,15,0.8,0.88,10,15
2,1,074324754X,074324754X,0,0,0,0,1.0,1.0,14,...,1.0,8,0,0,0,0,1.0,1.0,8,0


In [8]:
interesting_cols = ['removed_all_title_hamming', 'removed_all_title_levenshtein', 'removed_all_title_jaro', 'removed_all_publisher_hamming', 'removed_all_publisher_levenshtein', 'removed_all_publisher_jaro', 'cleaned_author_hamming', 'cleaned_author_jaro', 'cleaned_author_damerau']

In [9]:
n_df[n_df['is_match'] ==1][interesting_cols].describe()

Unnamed: 0,removed_all_title_hamming,removed_all_title_levenshtein,removed_all_title_jaro,removed_all_publisher_hamming,removed_all_publisher_levenshtein,removed_all_publisher_jaro,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_damerau
count,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0
mean,21.557927,18.493902,0.852452,10.155488,8.719512,0.737664,2.484756,0.946538,1.801829
std,31.548269,27.867861,0.149171,9.774377,8.356029,0.223803,5.457067,0.126558,4.16778
min,0.0,0.0,0.40557,0.0,0.0,0.0,0.0,0.435897,0.0
25%,0.0,0.0,0.763689,0.0,0.0,0.529762,0.0,1.0,0.0
50%,12.0,12.0,0.854573,9.0,7.0,0.788889,0.0,1.0,0.0
75%,25.5,21.0,1.0,14.0,12.0,1.0,0.0,1.0,0.0
max,151.0,138.0,1.0,66.0,53.0,1.0,41.0,1.0,34.0


In [10]:
n_df[n_df['is_match'] == 0][interesting_cols].describe()

Unnamed: 0,removed_all_title_hamming,removed_all_title_levenshtein,removed_all_title_jaro,removed_all_publisher_hamming,removed_all_publisher_levenshtein,removed_all_publisher_jaro,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_damerau
count,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0
mean,40.460385,35.329885,0.510235,15.912522,13.841447,0.490879,13.315988,0.480367,11.899297
std,29.07457,26.157699,0.082133,7.261276,6.255709,0.126299,3.75312,0.11209,3.181637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,19.0,0.467949,11.0,10.0,0.438889,11.0,0.435897,10.0
50%,32.0,28.0,0.517589,14.0,13.0,0.500712,13.0,0.490741,12.0
75%,48.0,41.0,0.561836,19.0,17.0,0.553114,15.0,0.543651,13.0
max,169.0,163.0,1.0,90.0,86.0,1.0,45.0,1.0,45.0


In [11]:
# This mask is critically important but also highly subjective
mask = (n_df['removed_all_title_hamming'] <= 28) & (n_df['removed_all_title_levenshtein'] <= 28) & (n_df['removed_all_title_jaro'] >= .4) & (n_df['removed_all_publisher_hamming'] <= 11) & (n_df['removed_all_publisher_levenshtein'] <= 13) & (n_df['cleaned_author_hamming'] <= 11) & (n_df['cleaned_author_jaro'] >= .5) & (n_df['cleaned_author_damerau'] <= 9)
n_df[mask].describe()

Unnamed: 0,is_match,publish_year_delta,cleaned_author_levenshtein,cleaned_author_damerau,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_jaro_winkler,cleaned_author_lcs_seq_len,cleaned_author_edit_dist,cleaned_title_levenshtein,...,removed_common_publisher_jaro_winkler,removed_common_publisher_lcs_seq_len,removed_common_publisher_edit_dist,removed_all_publisher_levenshtein,removed_all_publisher_damerau,removed_all_publisher_hamming,removed_all_publisher_jaro,removed_all_publisher_jaro_winkler,removed_all_publisher_lcs_seq_len,removed_all_publisher_edit_dist
count,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,...,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0,5863.0
mean,0.022514,8.034453,7.782364,7.757291,8.903804,0.591973,0.595066,3.991643,7.782364,23.000512,...,0.5199,3.675252,9.131332,7.845301,7.833703,8.622548,0.496492,0.498345,2.839843,7.845301
std,0.148361,7.535598,1.751945,1.741854,2.075089,0.094323,0.100492,1.850733,1.751945,8.439248,...,0.182681,2.875729,2.806475,2.15046,2.148324,2.321473,0.179424,0.183123,2.415004,2.15046
min,0.0,0.0,0.0,0.0,0.0,0.5,0.5,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3.0,7.0,7.0,8.0,0.533333,0.533333,3.0,7.0,17.0,...,0.433333,2.0,8.0,7.0,7.0,8.0,0.430556,0.430556,2.0,7.0
50%,0.0,6.0,8.0,8.0,9.0,0.5671,0.5671,4.0,8.0,23.0,...,0.504545,3.0,10.0,8.0,8.0,9.0,0.499145,0.499145,2.0,8.0
75%,0.0,11.0,9.0,9.0,10.0,0.614815,0.614815,4.0,9.0,28.0,...,0.594048,4.0,11.0,9.0,9.0,10.0,0.558333,0.558333,3.0,9.0
max,1.0,90.0,10.0,9.0,11.0,1.0,1.0,17.0,10.0,53.0,...,1.0,29.0,18.0,11.0,11.0,11.0,1.0,1.0,25.0,11.0


In [12]:
subset_df = n_df[mask].reset_index(drop = True)
subset_df.head(3)

Unnamed: 0,is_match,isbn_a,isbn_b,publish_year_delta,cleaned_author_levenshtein,cleaned_author_damerau,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_jaro_winkler,cleaned_author_lcs_seq_len,...,removed_common_publisher_jaro_winkler,removed_common_publisher_lcs_seq_len,removed_common_publisher_edit_dist,removed_all_publisher_levenshtein,removed_all_publisher_damerau,removed_all_publisher_hamming,removed_all_publisher_jaro,removed_all_publisher_jaro_winkler,removed_all_publisher_lcs_seq_len,removed_all_publisher_edit_dist
0,1,074324754X,074324754X,0,0,0,0,1.0,1.0,14,...,1.0,8,0,0,0,0,1.0,1.0,8,0
1,1,0385537859,1101972971,3,0,0,0,1.0,1.0,8,...,0.425926,1,9,9,9,9,0.425926,0.425926,1,9
2,1,030788743X,0307887448,1,0,0,0,1.0,1.0,11,...,0.433333,5,14,9,9,11,0.419192,0.419192,2,9


# IT'S TIME FOR MACHINE LEARNING

In [13]:
# MAKE SURE TO **NOT** USE THE ISBN FOR ANY ANALYSIS!!!!!
y = subset_df['is_match']
y

0       1
1       1
2       1
3       1
4       1
       ..
5858    0
5859    0
5860    0
5861    0
5862    0
Name: is_match, Length: 5863, dtype: int64

In [14]:
subset_df

Unnamed: 0,is_match,isbn_a,isbn_b,publish_year_delta,cleaned_author_levenshtein,cleaned_author_damerau,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_jaro_winkler,cleaned_author_lcs_seq_len,...,removed_common_publisher_jaro_winkler,removed_common_publisher_lcs_seq_len,removed_common_publisher_edit_dist,removed_all_publisher_levenshtein,removed_all_publisher_damerau,removed_all_publisher_hamming,removed_all_publisher_jaro,removed_all_publisher_jaro_winkler,removed_all_publisher_lcs_seq_len,removed_all_publisher_edit_dist
0,1,074324754X,074324754X,0,0,0,0,1.000000,1.000000,14,...,1.000000,8,0,0,0,0,1.000000,1.000000,8,0
1,1,0385537859,1101972971,3,0,0,0,1.000000,1.000000,8,...,0.425926,1,9,9,9,9,0.425926,0.425926,1,9
2,1,030788743X,0307887448,1,0,0,0,1.000000,1.000000,11,...,0.433333,5,14,9,9,11,0.419192,0.419192,2,9
3,1,0590353403,059035342X,0,0,0,0,1.000000,1.000000,9,...,0.505556,2,12,10,10,10,0.447222,0.447222,2,10
4,1,1250012570,1250012570,0,0,0,0,1.000000,1.000000,13,...,1.000000,16,0,0,0,0,1.000000,1.000000,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5858,0,141690817X,0590846280,9,9,9,11,0.592593,0.592593,4,...,0.600000,4,8,8,8,8,0.600000,0.600000,4,8
5859,0,0345479726,0385542690,11,7,7,10,0.550000,0.550000,3,...,0.488889,3,12,9,9,11,0.505051,0.505051,2,9
5860,0,1250086612,0735212163,0,9,9,11,0.516667,0.516667,3,...,0.645604,7,9,9,9,10,0.544444,0.544444,3,9
5861,0,0545298393,014241543X,1,6,6,11,0.626263,0.626263,6,...,0.533333,2,8,8,8,9,0.533333,0.533333,2,8


In [15]:
X = subset_df.drop(['is_match', 'isbn_a', 'isbn_b'], axis = 1)
X.describe()
X

Unnamed: 0,publish_year_delta,cleaned_author_levenshtein,cleaned_author_damerau,cleaned_author_hamming,cleaned_author_jaro,cleaned_author_jaro_winkler,cleaned_author_lcs_seq_len,cleaned_author_edit_dist,cleaned_title_levenshtein,cleaned_title_damerau,...,removed_common_publisher_jaro_winkler,removed_common_publisher_lcs_seq_len,removed_common_publisher_edit_dist,removed_all_publisher_levenshtein,removed_all_publisher_damerau,removed_all_publisher_hamming,removed_all_publisher_jaro,removed_all_publisher_jaro_winkler,removed_all_publisher_lcs_seq_len,removed_all_publisher_edit_dist
0,0,0,0,0,1.000000,1.000000,14,0,1,1,...,1.000000,8,0,0,0,0,1.000000,1.000000,8,0
1,3,0,0,0,1.000000,1.000000,8,0,28,28,...,0.425926,1,9,9,9,9,0.425926,0.425926,1,9
2,1,0,0,0,1.000000,1.000000,11,0,8,8,...,0.433333,5,14,9,9,11,0.419192,0.419192,2,9
3,0,0,0,0,1.000000,1.000000,9,0,0,0,...,0.505556,2,12,10,10,10,0.447222,0.447222,2,10
4,0,0,0,0,1.000000,1.000000,13,0,0,0,...,1.000000,16,0,0,0,0,1.000000,1.000000,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5858,9,9,9,11,0.592593,0.592593,4,9,22,22,...,0.600000,4,8,8,8,8,0.600000,0.600000,4,8
5859,11,7,7,10,0.550000,0.550000,3,7,28,28,...,0.488889,3,12,9,9,11,0.505051,0.505051,2,9
5860,0,9,9,11,0.516667,0.516667,3,9,23,23,...,0.645604,7,9,9,9,10,0.544444,0.544444,3,9
5861,1,6,6,11,0.626263,0.626263,6,6,13,13,...,0.533333,2,8,8,8,9,0.533333,0.533333,2,8


In [16]:
# STANDARD SCALE THE X. Also consider keeping less of the 0-match observations.
# Prepping Data

X_std = StandardScaler().fit(X).transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_std, y, random_state=42)

Pure Supervised Strategies

In [17]:
# Linear Models
#from sklearn.Linear_Model import  

In [18]:
#Random Forest
rf_grid= {'n_estimators': [50, 100, 200],'max_depth': [None, 5, 15],'min_samples_leaf': [1, 2, 5]}
rf = RandomForestClassifier().fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf.score(X_test, y_test)))


Accuracy on training set: 0.999
Accuracy on test set: 0.997


In [19]:
grid_search = GridSearchCV(RandomForestClassifier(),rf_grid,cv=5,return_train_score=True)
rf_best=grid_search.fit(X_train,y_train)

In [20]:
print("Random Forest")
print("Best n_estimators: ",rf_best.best_estimator_.get_params()['n_estimators'])
print("Best max_depth: ",rf_best.best_estimator_.get_params()['max_depth'])
print("Best min_samples_leaf: ",rf_best.best_estimator_.get_params()['min_samples_leaf'])
print("Accuracy on training set: {:.5f}".format(rf_best.score(X_train, y_train)))
print("Accuracy on test set: {:.5f}".format(rf_best.score(X_test, y_test)))

Random Forest
Best n_estimators:  50
Best max_depth:  None
Best min_samples_leaf:  5
Accuracy on training set: 0.99864
Accuracy on test set: 0.99454


In [21]:
# Neural Network
neural_grid ={'hidden_layer_sizes': [[10, 10], [5, 5], [15, 5]], 'activation': ['identity', 'logistic'], 'solver': ['lbfgs'], 'alpha': [.00001, .0001, .01], 'learning_rate': ['constant', 'invscaling', 'adaptive']}
grid_search = GridSearchCV(MLPClassifier(),neural_grid,cv=5,return_train_score=True)
neural_best=grid_search.fit(X_train,y_train)

In [39]:
print("Neural Net")
print("Best hidden layer size: ",neural_best.best_estimator_.get_params()['hidden_layer_sizes'])
print("Best activation: ",neural_best.best_estimator_.get_params()['activation'])
print("Best solver: ",neural_best.best_estimator_.get_params()['solver'])
print("Best alpha: ",neural_best.best_estimator_.get_params()['alpha'])
print("Best learning_rate: ",neural_best.best_estimator_.get_params()['learning_rate'])
print("Accuracy on training set: {:.5f}".format(neural_best.score(X_train, y_train)))
print("Accuracy on test set: {:.5f}".format(neural_best.score(X_test, y_test)))

Neural Net
Best hidden layer size:  [5, 5]
Best activation:  logistic
Best solver:  lbfgs
Best alpha:  0.0001
Best learning_rate:  adaptive
Accuracy on training set: 0.99932
Accuracy on test set: 0.99454


In [61]:
full_classifier = MLPClassifier(activation = 'logistic',
 alpha = 0.0001,
 hidden_layer_sizes = [5, 5],
 learning_rate = 'adaptive',
 solver = 'lbfgs').fit(X_train, y_train)

In [62]:
# Great. Let's work with the Neural Network.
full_classifier.score(X_test, y_test)

0.9965893587994543

In [79]:
# Okay, now we need to create the crosswalk.
# Let's crosswalk between the GoodReads (11,000+ obs) and the Seattle Public Libary (50,000+ obs) datasets, since they each have all the data.
goodreads = pd.read_csv('exported_models/goodreads.csv', usecols=['isbn', 'title', 'authors', 'publisher', 'publication_date'])
goodreads['Publication Year'] = goodreads['publication_date'].apply(lambda x: x.split('/')[2]).astype(int)
goodreads.drop('publication_date', axis = 1, inplace = True)
goodreads = goodreads[['title', 'authors', 'publisher', 'Publication Year', 'isbn']]
large_spl = pd.read_csv('exported_models/Small_SPL.csv')

In [80]:
goodreads.info(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             11127 non-null  object
 1   authors           11127 non-null  object
 2   publisher         11127 non-null  object
 3   Publication Year  11127 non-null  int64 
 4   isbn              11127 non-null  object
dtypes: int64(1), object(4)
memory usage: 434.8+ KB


In [81]:
large_spl.info(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ISBN              10000 non-null  object
 1   Title             10000 non-null  object
 2   Author            10000 non-null  object
 3   Publisher         10000 non-null  object
 4   Publication Year  10000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [87]:
training_set = pd.DataFrame(columns=['is_match', 'title_a', 'author_a', 'publisher_a', 'publish_year_a', 'isbn_a', 'title_b', 'author_b', 'publisher_b', 'publish_year_b', 'isbn_b'])
counter = 0
all_data_len = len(goodreads)
comparison_len = len(large_spl)
save_counter = 0
for index, row in goodreads.iterrows():
    counter += 1
    print(f'{(counter / all_data_len) : 0.00%} ({counter}/{all_data_len})', end='\r', flush=True)
    data_set = large_spl.copy()
    data_set['is_match'] = 0
    data_set.loc[int(index), 'is_match'] = 1
    data_set['title_b'] = row['title']
    data_set['author_b'] = row['authors']
    data_set['publisher_b'] = row['publisher']
    data_set['publish_year_b'] = row['Publication Year']
    data_set['isbn_b'] = row['isbn']
    data_set.columns = ['isbn_a', 'title_a', 'author_a', 'publisher_a', 'publish_year_a', 'is_match', 'title_b', 'author_b', 'publisher_b', 'publish_year_b', 'isbn_b']
    data_set = data_set[['is_match', 'title_a', 'author_a', 'publisher_a', 'publish_year_a', 'isbn_a', 'title_b', 'author_b', 'publisher_b', 'publish_year_b', 'isbn_b']]
    training_set = pd.concat([training_set, data_set])
    # Every 200 iterations, save the data.
    if counter % 200 == 0:
        save_counter += 1
        print('Processing... This may take a while.    ')
        cleaned = BookCleaner(training_set).get_clean_df()
        ndf = gen_numeric_df(cleaned)
        ndf.to_csv('cross_walk_data/training_set_'+str(save_counter)+'.csv', index = False)
        print(f'Saved {counter} iterations.             ')
        # Reset the training set. This allows us to save memory.
        training_set = pd.DataFrame(columns=['is_match', 'title_a', 'author_a', 'publisher_a', 'publish_year_a', 'isbn_a', 'title_b', 'author_b', 'publisher_b', 'publish_year_b', 'isbn_b'])

print(f'Completed {counter * comparison_len} permutations.')

Processing...7)
Generating for col removed_common_title_

KeyboardInterrupt: 

## OUR CONTRIBUTION BEYOND THE CROSSWALK: Exporting our Neural Network as a class

In [41]:
from joblib import dump

def train_neural_network(df : pd.DataFrame, dump_scaler = True, scaler_name = 'scaler') -> MLPClassifier:
    """Trains a neural network on the given dataframe"""
    y = df['is_match']
    X = df.drop(['is_match', 'isbn_a', 'isbn_b'], axis = 1)
    X_std = StandardScaler().fit(X).transform(X)
    if dump_scaler:
        dump(StandardScaler().fit(X), scaler_name + '.pkl')
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, random_state=42)

    print('Training Neural Network...')
    neural_grid ={'hidden_layer_sizes': [[10, 10], [5, 5], [15, 5]], 'activation': ['identity', 'logistic'], 'solver': ['lbfgs'], 'alpha': [.00001, .0001, .01], 'learning_rate': ['constant', 'invscaling', 'adaptive']}
    grid_search = GridSearchCV(MLPClassifier(),neural_grid,cv=5,return_train_score=True)
    neural_best=grid_search.fit(X_train,y_train)

    print("\tResults of Neural Net:")
    print("Best hidden layer size: ",neural_best.best_estimator_.get_params()['hidden_layer_sizes'])
    print("Best activation: ",neural_best.best_estimator_.get_params()['activation'])
    print("Best solver: ",neural_best.best_estimator_.get_params()['solver'])
    print("Best alpha: ",neural_best.best_estimator_.get_params()['alpha'])
    print("Best learning_rate: ",neural_best.best_estimator_.get_params()['learning_rate'])
    print("Accuracy on training set: {:.5f}".format(neural_best.score(X_train, y_train)))
    print("Accuracy on test set: {:.5f}".format(neural_best.score(X_test, y_test)))
    print("\n\n")

    return neural_best

In [27]:
# But wait! We might not have all the data every time we're interested in making a prediction. 
# Here are the following relevant scenarios:
# 1. We have the title, but not the author or publisher or publication date
# 2. We have the title and author, but not the publisher or publication date
# 3. We have the title and publisher, but not the author or publication date
# 4. We have the author and publisher and publication date, but not the title

In [28]:
title_only_df = subset_df[[col for col in list(subset_df.columns) if 'publish' not in col and 'author' not in col]].reset_index(drop = True)
title_author_df = subset_df[[col for col in list(subset_df.columns) if 'publish' not in col]].reset_index(drop = True)
title_publisher_df = subset_df[[col for col in list(subset_df.columns) if 'author' not in col and 'publish_year' not in col]].reset_index(drop = True)
author_publisher_df = subset_df[[col for col in list(subset_df.columns) if 'title' not in col]].reset_index(drop = True)

In [31]:
title_only_df

Unnamed: 0,is_match,isbn_a,isbn_b,cleaned_title_levenshtein,cleaned_title_damerau,cleaned_title_hamming,cleaned_title_jaro,cleaned_title_jaro_winkler,cleaned_title_lcs_seq_len,cleaned_title_edit_dist,...,removed_common_title_jaro_winkler,removed_common_title_lcs_seq_len,removed_common_title_edit_dist,removed_all_title_levenshtein,removed_all_title_damerau,removed_all_title_hamming,removed_all_title_jaro,removed_all_title_jaro_winkler,removed_all_title_lcs_seq_len,removed_all_title_edit_dist
0,1,074324754X,074324754X,1,1,9,0.973846,0.984308,25,1,...,1.000000,17,0,0,0,0,1.000000,1.000000,11,0
1,1,0385537859,1101972971,28,28,34,0.698214,0.698214,14,28,...,0.677382,11,26,18,18,18,0.760000,0.856000,7,18
2,1,030788743X,0307887448,8,8,8,0.888889,0.933333,16,8,...,0.947368,14,5,0,0,0,1.000000,1.000000,14,0
3,1,0590353403,059035342X,0,0,0,1.000000,1.000000,36,0,...,1.000000,25,0,0,0,0,1.000000,1.000000,25,0
4,1,1250012570,1250012570,0,0,0,1.000000,1.000000,13,0,...,1.000000,8,0,0,0,0,1.000000,1.000000,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5858,0,141690817X,0590846280,22,22,31,0.650277,0.650277,15,22,...,0.428395,5,22,22,22,26,0.428395,0.428395,5,22
5859,0,0345479726,0385542690,28,28,36,0.534259,0.534259,8,28,...,0.498042,6,24,17,17,20,0.505128,0.505128,6,17
5860,0,1250086612,0735212163,23,23,28,0.645743,0.645743,11,23,...,0.517460,6,15,13,13,14,0.537500,0.537500,5,13
5861,0,0545298393,014241543X,13,13,15,0.562963,0.562963,3,13,...,0.417989,2,8,8,8,9,0.417989,0.417989,2,8


In [29]:
full_data_trained_network = train_neural_network(subset_df)
title_only_trained_network = train_neural_network(title_only_df)
title_author_trained_network = train_neural_network(title_author_df)
title_publisher_network = train_neural_network(title_publisher_df)
author_publisher_network = train_neural_network(author_publisher_df)

Training Neural Network...
	Results of Neural Net:
Best hidden layer size:  [5, 5]
Best activation:  logistic
Best solver:  lbfgs
Best alpha:  0.01
Best learning_rate:  invscaling
Accuracy on training set: 0.99909
Accuracy on test set: 0.99727



Training Neural Network...
	Results of Neural Net:
Best hidden layer size:  [15, 5]
Best activation:  identity
Best solver:  lbfgs
Best alpha:  1e-05
Best learning_rate:  adaptive
Accuracy on training set: 0.99886
Accuracy on test set: 0.99454



Training Neural Network...
	Results of Neural Net:
Best hidden layer size:  [5, 5]
Best activation:  logistic
Best solver:  lbfgs
Best alpha:  1e-05
Best learning_rate:  invscaling
Accuracy on training set: 0.99886
Accuracy on test set: 0.99659



Training Neural Network...
	Results of Neural Net:
Best hidden layer size:  [10, 10]
Best activation:  logistic
Best solver:  lbfgs
Best alpha:  0.01
Best learning_rate:  constant
Accuracy on training set: 0.99909
Accuracy on test set: 0.99727



Training Ne

In [30]:
# We would like to export the trained neural network so that we can use it for arbitrary OOS data.
# We can do this by using the pickle library.
# Let's save the full data trained network
with open('exported_models/full_data_trained_network.pkl', 'wb') as f:
    pickle.dump(full_data_trained_network, f)

with open('exported_models/title_only_trained_network.pkl', 'wb') as f:
    pickle.dump(title_only_trained_network, f)

# Let's save the title author trained network
with open('exported_models/title_author_trained_network.pkl', 'wb') as f:
    pickle.dump(title_author_trained_network, f)

# Let's save the title publisher trained network
with open('exported_models/title_publisher_trained_network.pkl', 'wb') as f:
    pickle.dump(title_publisher_network, f)

# Let's save the author publisher trained network
with open('exported_models/author_publisher_trained_network.pkl', 'wb') as f:
    pickle.dump(author_publisher_network, f)