In [1]:
import json
import time
import sys
import pandas as pd
import numpy  as np
import scipy.sparse as sp
import pickle as pkl
import collections 
import gc

In [2]:
with open('../aps/index_item_map.pkl', 'rb') as f:
    data_map = pkl.load(f)

In [3]:
paper_id_title = data_map['paper_id_title']
author_id_name = data_map['author_id_name']
venue_id_name = data_map['venue_id_name']
keywords_id_name = data_map['keywords_id_name']
paper_title_id = data_map['paper_title_id']
author_name_id = data_map['author_name_id']
venue_name_id = data_map['venue_name_id']
keywords_name_id = data_map['keywords_name_id']
keywords_set = data_map['keywords_set']
venue_set = data_map['venue_set']

In [4]:
paper_ids = set(paper_id_title.keys())
author_ids = set(author_id_name.keys())
venue_ids = set(venue_id_name.keys())
keywords_ids = set(keywords_id_name.keys())

In [5]:
full_pair = pd.read_csv('../aps/whole_graph.csv')

In [6]:
full_P1P = full_pair[full_pair.type=='P1P']

In [7]:
for year in range(1990,2018):
    paper_list_2001 = full_pair[(full_pair.O==year)&(full_pair.type=='P1Y')]['P']
    citation_2001 = pd.merge(paper_list_2001,full_P1P)
    citation_2001 = citation_2001.groupby('O').count().reset_index()[['O','type']]
    citation_2001.to_csv('../aps/citation_'+str(year)+".csv",index=None)
    print (year)

1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017


In [8]:
def cumulative(df):
    colsn = list(df.columns)
    cols = df.shape[1]
    for i in range(2,cols):
        df[colsn[i]] = df[colsn[i]] + df[colsn[i-1]]
    return df

In [9]:
for year in range(1995,2006):
    paper_list_2000 = full_pair[(full_pair.O==year)&(full_pair.type=='P1Y')]['P']
    for i in range(1,13):
        citation_year = pd.read_csv('../aps/citation_'+str(year+i)+".csv")
        citation_year = citation_year.rename(columns={"O": "P","type":("citation_"+str(year+i))})
        paper_list_2000 = pd.merge(paper_list_2000,citation_year,left_on='P',right_on='P',how='left')
    paper_list_2000 = paper_list_2000.fillna(0)
    if year == 2000:
        pp2000 = paper_list_2000
    if year == 2005:
        pp2005 = paper_list_2000
    paper_list_2000.to_csv('../aps_for_intro/original_labels'+str(year)+'.txt',index=None,header=False)
      
    cols = paper_list_2000.columns
    paper_list_2000_log = pd.DataFrame({"P":paper_list_2000["P"],cols[1]:0,cols[2]:0,cols[3]:0,cols[4]:0,cols[5]:0})
    paper_list_2000_log.iloc[:,1:] = np.log(paper_list_2000.iloc[:,1:]+1)
    paper_list_2000_log.to_csv('../aps/log_labels'+str(year)+'.txt',index=None)
        
    paper_list_2000_cumu = cumulative(paper_list_2000)
    paper_list_2000_cumu.to_csv('../aps/cumulative_labels'+str(year)+'.txt',index=None,header=False)

    paper_list_2000_cumu_log = pd.DataFrame({"P":paper_list_2000["P"],cols[1]:0,cols[2]:0,cols[3]:0,cols[4]:0,cols[5]:0})
    paper_list_2000_cumu_log.iloc[:,1:] = np.log(paper_list_2000_cumu.iloc[:,1:]+1)
    paper_list_2000_cumu_log.to_csv('../aps/cumulative_log_labels'+str(year)+'.txt',index=None)
    print (year)

1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005


In [10]:
# import pandas as pd

# pd.read_csv('../aps/cumulative_log_labels'+str(year)+'.txt').iloc[:, 1:6].values

In [11]:
def feature_element(x):
    if int(x) in paper_ids:
        return np.array([1,0,0,0])
    elif int(x) in author_ids:
        return np.array([0,1,0,0])
    elif int(x) in venue_ids:
        return np.array([0,0,1,0])
    elif int(x) in keywords_ids:
        return np.array([0,0,0,1])
    else:
        print (x)
def create_feature(array):
    return np.array( [ feature_element(num)   for num in array])

In [15]:
for year in range(1990,2005):
    print (year)
    t = time.time()
    P1Y_2000 = full_pair[(full_pair.type=='P1Y')]
    P1Y_2000['O'] =P1Y_2000['O'].astype('int')
#     P1Y_2000 = P1Y_2000[(P1Y_2000.O<=year)]
    P1Y_2000 = P1Y_2000[(P1Y_2000.O==year)]
    all_paper_2000 = pd.DataFrame({"P":list(set(P1Y_2000['P']))})
    full_pair_2000 = pd.merge(all_paper_2000,full_pair,how="left")
    full_pair_2000[full_pair_2000.type!='P1Y']
    idx_2000 = np.array(list(set(pd.concat([full_pair_2000['P'],full_pair_2000['O']]))))
    id_item_2000 = {i:j for i,j in enumerate(idx_2000)}
    item_id_2000 = {j:i for i,j in enumerate(idx_2000)}
    feature_2000 = create_feature(idx_2000)
    print (idx_2000.shape[0])

    P1P_pair_2000 = full_pair_2000[full_pair_2000.type=='P1P']
    row = np.array([item_id_2000[item] for item in list(P1P_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1P_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1P_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1A_pair_2000 = full_pair_2000[full_pair_2000.type=='P1A']
    row = np.array([item_id_2000[item] for item in list(P1A_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1A_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1A_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1V_pair_2000 = full_pair_2000[full_pair_2000.type=='P1V']
    row = np.array([item_id_2000[item] for item in list(P1V_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1V_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1V_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1K_pair_2000 = full_pair_2000[full_pair_2000.type=='P1K']
    row = np.array([item_id_2000[item] for item in list(P1K_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1K_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1K_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    row = np.array(range(len(idx_2000)))
    col =  np.array(range(len(idx_2000)))
    data = np.ones(row.shape[0])
    adj_self_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    graph = {'adj':[adj_P1P_2000,adj_P1A_2000,adj_P1V_2000,adj_P1K_2000,adj_self_2000],
                     'feature':feature_2000,
                     ' idx':idx_2000,
                     'id_item':id_item_2000,
                     'item_id':item_id_2000}
    with open('../aps/individual_graph_'+str(year)+'.pkl','wb') as f:
        pkl.dump(graph,f,0)
    print (time.time()-t)

1990


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


69164
2.258089542388916
1991
73648
2.1792190074920654
1992
80899
2.250282049179077
1993
90371
2.386385440826416
1994
95389
2.4106216430664062
1995
102179
2.552769660949707
1996
105188
2.4546873569488525
1997
105211
2.4426045417785645
1998
115242
2.630056619644165
1999
119204
2.58478045463562
2000
126152
2.773040533065796
2001
130274
2.8419830799102783
2002
138309
2.9367306232452393
2003
137805
2.913489580154419
2004
149351
3.0425236225128174


In [16]:
for year in range(2012,2013):
    print (year)
    t = time.time()
    P1Y_2000 = full_pair[(full_pair.type=='P1Y')]
    P1Y_2000['O'] =P1Y_2000['O'].astype('int')
    P1Y_2000 = P1Y_2000[(P1Y_2000.O<=year)]
#     P1Y_2000 = P1Y_2000[(P1Y_2000.O==year)]
    all_paper_2000 = pd.DataFrame({"P":list(set(P1Y_2000['P']))})
    full_pair_2000 = pd.merge(all_paper_2000,full_pair,how="left")
    full_pair_2000[full_pair_2000.type!='P1Y']
    idx_2000 = np.array(list(set(pd.concat([full_pair_2000['P'],full_pair_2000['O']]))))
    id_item_2000 = {i:j for i,j in enumerate(idx_2000)}
    item_id_2000 = {j:i for i,j in enumerate(idx_2000)}
    feature_2000 = create_feature(idx_2000)
    print (idx_2000.shape[0])

    P1P_pair_2000 = full_pair_2000[full_pair_2000.type=='P1P']
    row = np.array([item_id_2000[item] for item in list(P1P_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1P_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1P_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1A_pair_2000 = full_pair_2000[full_pair_2000.type=='P1A']
    row = np.array([item_id_2000[item] for item in list(P1A_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1A_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1A_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1V_pair_2000 = full_pair_2000[full_pair_2000.type=='P1V']
    row = np.array([item_id_2000[item] for item in list(P1V_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1V_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1V_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    P1K_pair_2000 = full_pair_2000[full_pair_2000.type=='P1K']
    row = np.array([item_id_2000[item] for item in list(P1K_pair_2000['P'])])
    col = np.array([item_id_2000[item] for item in list(P1K_pair_2000['O'])])
    data = np.ones(row.shape[0])
    adj_P1K_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    row = np.array(range(len(idx_2000)))
    col =  np.array(range(len(idx_2000)))
    data = np.ones(row.shape[0])
    adj_self_2000 = sp.csr_matrix((data,(row,col)),shape=(idx_2000.shape[0],idx_2000.shape[0]))

    graph = {'adj':[adj_P1P_2000,adj_P1A_2000,adj_P1V_2000,adj_P1K_2000,adj_self_2000],
                     'feature':feature_2000,
                     ' idx':idx_2000,
                     'id_item':id_item_2000,
                     'item_id':item_id_2000}
    with open('../aps/all_graph_'+str(year)+'.pkl','wb') as f:
        pkl.dump(graph,f,0)
    print (time.time()-t)

2012


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


913797
22.430230140686035
