In [5]:
%matplotlib inline

import numpy as np
import pandas as pd
import networkx as nx
import requests
import matplotlib.pyplot as plt

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

In [6]:
url = 'http://opsahl.co.uk/tnet/datasets/OF_longitudinal_weightedchar.txt'
r = requests.get(url)

In [7]:
# import string as str
answer = r.text
# print(answer)
print(type(answer))
output = answer.split("\n")

<class 'str'>


In [70]:
data_array = []
for lines in output:
    line = lines.split(' ')
    line[0:2] = [' '.join(line[0:2])]
    data_array.append(line)
data_array

[['"2004-05-14 20:53:16"', '201', '3', '23'],
 ['"2004-05-14 20:54:08"', '187', '3', '63'],
 ['"2004-05-14 20:55:40"', '138', '2', '6'],
 ['"2004-05-14 21:00:04"', '345', '3', '38'],
 ['"2004-05-14 21:02:17"', '233', '3', '60'],
 ['"2004-05-14 21:02:21"', '140', '4', '7'],
 ['"2004-05-14 21:03:35"', '307', '3', '33'],
 ['"2004-05-14 21:06:14"', '352', '3', '12'],
 ['"2004-05-14 21:09:11"', '269', '3', '42'],
 ['"2004-05-14 21:14:15"', '187', '5', '61'],
 ['"2004-05-14 21:14:29"', '269', '5', '75'],
 ['"2004-05-14 21:15:37"', '201', '5', '101'],
 ['"2004-05-14 21:18:57"', '233', '5', '28'],
 ['"2004-05-14 21:39:16"', '281', '3', '11'],
 ['"2004-05-14 21:44:30"', '44', '7', '9'],
 ['"2004-05-14 21:50:12"', '239', '3', '36'],
 ['"2004-05-14 21:55:20"', '239', '5', '102'],
 ['"2004-05-14 21:55:30"', '213', '9', '7'],
 ['"2004-05-14 21:55:52"', '402', '9', '4'],
 ['"2004-05-14 21:56:02"', '402', '9', '33'],
 ['"2004-05-14 21:56:40"', '402', '9', '4'],
 ['"2004-05-14 21:56:47"', '356', '9', 

In [9]:
#Creating an empty dataframe to populate in python
index = np.arange(len(data_array)) #Number of indexes needed
columns = ['date','time', 'person', 'forum', 'unknown']
df = pd.DataFrame(columns=columns, index = index)

#create data frame from array
for val, item in enumerate(data_array):
    df.ix[val] = item

In [10]:
#Test writing to database in cypher language
from py2neo import neo4j, authenticate, Graph, Node, Relationship

# Creates the connection to the graph which defaults to the localhost
# set up authentication parameters
# authenticate("localhost:7474", "user", "pass")

# connect to authenticated graph database
remote_graph = Graph("http://localhost:7474/db/data/")


In [11]:
# Testing writing to the graph 

alice = Node("Person", name="Alice")
bob = Node("Person", name="Bob")
alice_knows_bob = Relationship(alice, "KNOWS", bob)
remote_graph.create(alice_knows_bob)

(<Relationship graph='http://localhost:7474/db/data/' ref='relationship/0' start='node/0' end='node/1' type='KNOWS' properties={}>,)

In [72]:
# The below line deletes any existing data in the neo4j database
remote_graph.cypher.execute("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE n,r") # deleting existing data



In [28]:
data_array.pop()#removing last element of list (NEED TO FIX THIS LATER)

[]

In [80]:
# Try inputting all the user nodes as one type and forum as another
tx = remote_graph.cypher.begin()
# statement = "CREATE  (A:Person {time: {B}})" #Cypher statement to create node
# statement_pers = "CREATE  (A:Person)" #Cypher statement to create node
# statement_forum = "CREATE  (B:Forum)" #Cypher statement to create node
# statement_times = "CREATE (A:Person {pers_id: {E}})-[:COMMENT {date:{C}, time:{D}}]->(B:Forum {for_id: {F}})"
statement_times = "MERGE (A:Person {pers_id: {E}}) MERGE (B:Forum {for_id: {F}}) CREATE (A)-[:COMMENT {date:{C}, time:{D}}]->(B)"


i=0; #This will be our counter for when to commit
j=0; #This will be our counter for lines commited
for lines in data_array:
#date_a, time_a, person_A, forum_a, unknown_a

#Need to merge on the person node - if the person exists then will create
#Need to merge on the forum node and create it if not
#Need to match the full pattern



    tx.append(statement_times, {"A": lines[2], "B": lines[3], "C": lines[0], "D": lines[1], "E": lines[2],"F": lines[3]})
#     mynode = list(remote_graph.find('Forum', property_key=lines[3]))
#     if (len(mynode)==0):
#        tx.append(statement_forum, {"B": lines[3]}) 
    i += 1
    j += 1
    if(i==100):
        tx.commit()
        print (j, ' lines processed')
        i=0
        tx = remote_graph.cypher.begin()
    if(j==1000):
        break
tx.commit() #submit remainder of the lines
print (j, ' lines processed')





100  lines processed
200  lines processed
300  lines processed
400  lines processed
500  lines processed
600  lines processed
700  lines processed
800  lines processed
900  lines processed
1000  lines processed
1000  lines processed


In [29]:
data_array[-10:]

[['"2004-10-25', '23:40:08"', '137', '244', '62'],
 ['"2004-10-25', '23:43:32"', '137', '39', '19'],
 ['"2004-10-25', '23:52:16"', '441', '488', '8'],
 ['"2004-10-26', '00:14:08"', '109', '59', '75'],
 ['"2004-10-26', '01:40:43"', '836', '39', '21'],
 ['"2004-10-26', '01:51:33"', '719', '20', '143'],
 ['"2004-10-26', '01:58:17"', '719', '107', '231'],
 ['"2004-10-26', '02:35:18"', '625', '59', '51'],
 ['"2004-10-26', '08:40:22"', '375', '266', '28'],
 ['"2004-10-26', '08:41:41"', '375', '266', '19']]

In [46]:
data_array[1:2]

[['"2004-05-14', '20:54:08"', '187', '3', '63']]