# ICPSR Dataset Archive

In [94]:
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, HBox, VBox, Label
import ipywidgets as widgets
import pandas as pd
import qgrid
# Show full cell contents instead of truncating long titles/descriptions.
# None is the supported "no limit" value; the old -1 sentinel is rejected by
# current pandas releases, while very old releases only accept integers.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Metadata tables that the search and recommendation cells read from.
datasets = pd.read_json('data/data_sets.json')
publications = pd.read_json('data/publications.json')
# Lower-cased copies of the titles, used for case-insensitive title search.
datasets['title_lower'] = datasets['title'].str.lower()
publications['title_lower'] = publications['title'].str.lower()

In [109]:
@interact
def showDatasets(By = ['Dataset Title', 'Dataset ID', 'Publication Title', 'Publication ID', 'DOI'], Search="ANES 1952"):
    """Look up datasets or publications and return the matching rows.

    Args:
        By: Which field to search. ``interact`` renders the list default as a
            dropdown and passes the selected string.
        Search: Comma-separated search terms. Title and DOI terms are combined
            into one regular-expression alternation; ID terms are parsed as
            integers.

    Returns:
        A DataFrame of matches. For the two ID modes the rows follow the order
        of the requested IDs, and IDs without a match appear as rows of
        missing values (left merge). For DOI the dataset matches are returned
        when there are any, otherwise the publication matches. ``None`` is
        returned when the lookup fails (the error is printed) or when ``By``
        is not one of the known modes.
    """
    column = {
        'Dataset Title': 'title_lower',
        'Dataset ID': 'data_set_id',
        'Publication Title': 'title_lower',
        'Publication ID': 'publication_id',
        'DOI': 'unique_identifier'
    }
    dataset_cols = ['data_set_id', 'unique_identifier', 'title', 'description']
    publication_cols = ['publication_id', 'unique_identifier', 'title']

    # Strip blanks and drop empty terms: an empty alternative in the joined
    # pattern (e.g. from a trailing comma) would match every row.
    slist = [term.strip() for term in Search.lower().split(',') if term.strip()]
    try:
        if By == 'Dataset Title':
            mask = datasets[column[By]].str.contains('|'.join(slist), na=False)
            return datasets.loc[mask, dataset_cols]
        elif By == 'Dataset ID':
            ids = [int(n) for n in slist]
            tmp = datasets.loc[datasets[column[By]].isin(ids), dataset_cols]
            # Left-merge onto the requested IDs to keep the user's ordering.
            sorter = pd.DataFrame(ids, columns=['data_set_id'])
            return pd.merge(sorter, tmp, how='left', on='data_set_id')
        elif By == 'Publication Title':
            mask = publications[column[By]].str.contains('|'.join(slist), na=False)
            return publications.loc[mask, publication_cols]
        elif By == 'Publication ID':
            ids = [int(n) for n in slist]
            tmp = publications.loc[publications[column[By]].isin(ids), publication_cols]
            sorter = pd.DataFrame(ids, columns=['publication_id'])
            return pd.merge(sorter, tmp, how='left', on='publication_id')
        elif By == 'DOI':
            pattern = '|'.join(slist)
            # The search text was lower-cased above, so compare without regard
            # to case in case the stored identifiers contain capitals.
            ds_mask = datasets[column[By]].str.contains(pattern, case=False, na=False)
            pub_mask = publications[column[By]].str.contains(pattern, case=False, na=False)
            tmp = datasets.loc[ds_mask, dataset_cols]
            tmp2 = publications.loc[pub_mask, publication_cols]
            return tmp if tmp.shape[0] > 0 else tmp2
    except Exception as e:
        # Widget boundary: report the problem (bad ID, invalid regex, ...)
        # instead of raising into the notebook. str() is required because an
        # exception object cannot be concatenated to a string.
        print("ERROR " + str(e))
    

interactive(children=(Dropdown(description='By', options=('Dataset Title', 'Dataset ID', 'Publication Title', …

In [101]:
import networkx as nx
import random
from networkx.readwrite import json_graph
import json

def read_json_file(filename):
    """Parse the JSON file at *filename* and build a graph from it.

    The parsed content is passed to ``json_graph.node_link_graph``, whose
    result is returned.
    """
    with open(filename, 'r') as handle:
        node_link_data = json.load(handle)
    graph = json_graph.node_link_graph(node_link_data)
    return graph
def generatePairs(firstnode, allDatasets):
    """Return a list of ``(firstnode, other)`` tuples, one per item of *allDatasets*."""
    pairs = []
    for other in allDatasets:
        pairs.append((firstnode, other))
    return pairs
def getJaccard(node, g):
    """Score *node* against the dataset nodes of *g* by Jaccard coefficient.

    Candidates are the nodes of *g* whose string form starts with ``'data'``.
    When *node* is itself a dataset (its name starts with ``'d'``), the node
    and every dataset sharing its ``title_id`` in the module-level
    ``df_datasets`` table are removed from the candidates.

    Args:
        node: Node name as a string, e.g. ``'data_123'``.
        g: Graph to score against.

    Returns:
        A list of ``(node, candidate, score)`` tuples, keeping only scores
        greater than zero.

    Raises:
        ValueError: If a dataset node name has a non-numeric id part.
    """
    allDatasets = [i for i in g.nodes if(str(i).startswith('data'))]

    if (node.startswith('d')):
        dataset_id = int(node.replace('data_', ''))
        titleID = df_datasets.title_id[df_datasets.data_set_id == dataset_id]

        # Set membership keeps the filtering below linear in the node count.
        excluded = {node}
        # An id that is missing from df_datasets has no same-title siblings;
        # only the query node itself is excluded in that case.
        if not titleID.empty:
            same_title = df_datasets.data_set_id[df_datasets.title_id == titleID.iloc[0]]
            excluded.update('data_' + str(i) for i in same_title)

        filtered = [i for i in allDatasets if i not in excluded]
    else:
        filtered = allDatasets
    pairs = generatePairs(node, filtered)
    preds_jaccard = nx.jaccard_coefficient(g, pairs)
    return [(u, v, p) for u, v, p in preds_jaccard if p > 0.0]
def getRecommendations(i, G):
    """Return up to ten recommended datasets for node *i*.

    Scores come from ``getJaccard``. The ``N`` best-scoring dataset ids are
    looked up in the module-level ``df_datasets`` table, reduced to one row
    per ``title_id``, and joined back to their scores.

    Args:
        i: Node name to recommend for, e.g. ``'data_123'``.
        G: Graph passed through to ``getJaccard``.

    Returns:
        A DataFrame with columns ``data_set_id`` and ``score``, sorted by
        descending score and limited to ten rows. It is empty when no
        candidate has a positive score.
    """
    # Compute the scores once; they are both counted and ranked below.
    scored = getJaccard(i, G)
    print("Fetching dataset ID: %s \nCalculating similarity scores for %s/%s datasets"%(i, len(scored),len(allDatasets)))
    if not scored:
        # Hand back an empty table so callers can treat every outcome alike.
        return pd.DataFrame(columns=['data_set_id', 'score'])

    df = pd.DataFrame(scored, columns=['x', 'data_set_id', 'score'])
    df = df.sort_values(by=['score'], ascending=False).reset_index(drop=True)
    df.data_set_id = [int(name.replace('data_', '')) for name in df.data_set_id]

    top_ids = df.data_set_id[:N]
    candidates = df_datasets[df_datasets.data_set_id.isin(top_ids)]
    # Keep a single dataset per distinct title.
    candidates = candidates.groupby('title_id').first().reset_index()

    res = pd.merge(candidates, df, how='inner', on='data_set_id')
    res = res.sort_values(by=['score'], ascending=False).reset_index(drop=True)

    return res[['data_set_id', 'score']].iloc[:10]

# Graph used by the recommendation functions.
G = read_json_file('data/network_json.json')

# Normalized titles: lower-cased with every non-letter removed. regex=True is
# spelled out because newer pandas treats the pattern literally by default.
titles = datasets.title.str.lower()
titles = titles.str.replace(r'[^a-zA-Z]', '', regex=True)

# One row per distinct normalized title. Taking them in order of first
# appearance (rather than from a set) gives the same title ids on every run.
df_titles = pd.DataFrame({'title_unique': titles.drop_duplicates().reset_index(drop=True)})
df_titles.insert(0, 'title_id', ['title_'+str(i) for i in df_titles.index])

# Dataset table extended with the normalized title and its title_id.
df_datasets = datasets.copy()
df_datasets['title_unique'] = titles
df_datasets = pd.merge(df_datasets, df_titles, on='title_unique', how='left')

In [108]:
# Node-name prefix for each selectable entity type.
typeDict = {'Publication Paper': 'pub_', 'Dataset': 'data_', 'Keyword': ''}

# Controls for the recommendation form.
typeDD = widgets.Dropdown(
    description='Entity Type',
    options=['Publication Paper', 'Dataset', 'Keyword'],
    value='Dataset',
)
searchBar = widgets.Text(placeholder='Insert Keyword/ID', value='')
btn = widgets.Button(description="Recommend Me")
btn_reset = widgets.Button(description='Reset')

# Area that collects whatever the click handler prints or displays.
output = widgets.Output()

display(Label('Press to generate data recommendations'))

# Graph nodes split by name prefix, used when sampling a random starting node.
allDatasets = list(filter(lambda name: str(name).startswith('data'), G.nodes))
allPubs = list(filter(lambda name: str(name).startswith('pub'), G.nodes))

# Number of top-scoring candidates considered per recommendation request.
N = 100

@output.capture()
def button_handler(btn):
    """Handle a click on the recommend button.

    Reads the entity type and search text from the form. With an empty search
    box a random dataset or publication node is used; otherwise the node name
    is the type prefix followed by the entered id. The comma-joined
    recommended ids and the result table are displayed in the output widget.

    Args:
        btn: The clicked button (supplied by ``on_click``; not used).
    """
    entity_type = typeDD.value
    if entity_type == 'Keyword':
        print('Feature unavailable')
        return

    # Treat whitespace-only input like an empty search box.
    query = searchBar.value.strip()
    if query == '':
        pool = allDatasets if entity_type == 'Dataset' else allPubs
        if not pool:
            print('No nodes available to sample from')
            return
        sampleDataset = random.sample(pool, 1)
    else:
        sampleDataset = [typeDict[entity_type] + query]

    for node in sampleDataset:
        res = getRecommendations(node, G)
        # Nothing scored above zero: say so instead of failing on the lookup.
        if res is None or res.empty:
            print('No recommendations found for %s' % node)
            continue
        display(",".join(str(ds_id) for ds_id in res.data_set_id[:10]))
        display(res)
def clear_output(btn_reset):
    """Clear the recommendation output area.

    Args:
        btn_reset: The clicked button (supplied by ``on_click``; not used).
    """
    # The Output widget is bound to the name `output`.
    output.clear_output()
btn.on_click(button_handler)
btn_reset.on_click(clear_output)

# Show the Reset button alongside the other controls so its handler is reachable.
display(HBox([typeDD, searchBar, btn, btn_reset]))
display(output)

Label(value='Press to generate random set of recommendations')

HBox(children=(Dropdown(description='Entity Type', index=1, options=('Publication Paper', 'Dataset', 'Keyword'…

Output()