## [Graph Data Science Python Driver](https://github.com/neo4j/graph-data-science-client)

In [33]:
# Uncomment to install the client into the running kernel's environment:
# %pip install graphdatascience

## Imports

In [35]:
# Standard library
import os

# Data
import pandas as pd
from graphdatascience import GraphDataScience

# Viz
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_confusion_matrix

## Setup

In [None]:
# Connection settings come from the environment so real credentials never have
# to be saved in the notebook. The fallbacks are the values this notebook
# previously hard-coded, so an unconfigured local setup behaves as before.
URI      = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
creds    = (os.environ.get("NEO4J_USERNAME", "neo4j"),
            os.environ.get("NEO4J_PASSWORD", "password"))
gds      = GraphDataScience(URI, auth=creds)

## Load CORA Data
#### Data from https://github.com/CJ2001

![](https://cdn-images-1.medium.com/fit/t/1600/480/1*oygeCjtUsS87duvFoDT8tA.png)

### Create Constraint

In [None]:
# Uniqueness constraint on Paper.id. IF NOT EXISTS lets this cell be re-run
# without failing when the constraint is already in place.
# NOTE(review): `ON ... ASSERT` looks like the older constraint syntax; newer
# Neo4j releases use `FOR ... REQUIRE` - confirm against the server version.
c = '''
// Create constraints
CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper) ASSERT p.id IS UNIQUE;
'''

gds.run_cypher(c)

### Import Nodes

In [None]:
# Read the node CSV and MERGE one Paper node per row, keyed by `id`.
# `subject` and `features` are only written when the node is first created
# (ON CREATE SET), so re-running the cell does not overwrite existing nodes.
# The query returns the number of rows processed.
# NOTE(review): LOAD CSV values are presumably strings, so `features` would be
# stored as raw text - confirm before parsing it anywhere.
c = '''
// Load node list
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/cj2001/pydata2021/main/notebooks/data/cora_nodes.csv' AS line
WITH line
MERGE (p:Paper {id: line.id})
    ON CREATE SET p.subject = line.subject, p.features = line.features
RETURN COUNT(*)
'''

gds.run_cypher(c)

### Import Edges

In [None]:
# Read the edge CSV and MERGE a CITES relationship from `source` to `target`
# for every row. Both endpoints are looked up with MATCH, so a row whose
# source or target id has no Paper node produces no relationship.
# The query returns the number of rows that produced a match.
c = '''
// Load edge list
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/cj2001/pydata2021/main/notebooks/data/cora_edges.csv' AS line
WITH line
MATCH (source:Paper {id: line.source})
MATCH (target:Paper {id: line.target})
MERGE (source)-[:CITES]->(target)
RETURN COUNT(*)
'''

gds.run_cypher(c)

## Create graph projection

In [None]:
GRAPH_NAME = "papers"

node_projection = ["Paper"]
# Citations are projected as UNDIRECTED relationships.
relationship_projection = {"CITES":
                           {"orientation": "UNDIRECTED"}
                          }

# Projecting under a name that already exists in the graph catalog fails, so
# drop any projection left over from a previous run. This keeps the cell
# re-runnable (Restart & Run All, or simply executing it twice).
if gds.graph.exists(GRAPH_NAME)["exists"]:
    gds.graph.drop(gds.graph.get(GRAPH_NAME))

G, _ = gds.graph.project(GRAPH_NAME,
                         node_projection,
                         relationship_projection)

In [None]:
def projection_stats(G):
    """Print a short summary of a projected graph.

    Calls, in order, ``name()``, ``degree_distribution()``, ``density()``,
    ``node_count()`` and ``relationship_count()`` on ``G`` and prints each
    value after its label. Returns ``None``.
    """
    # (label, accessor name, extra trailing arguments handed to print)
    report_rows = (
        ('name:', 'name', ('\n',)),
        ('degree_distribution:\n', 'degree_distribution', ()),
        ('density:', 'density', ('\n',)),
        ('node_count:', 'node_count', ()),
        ('relationship_count:', 'relationship_count', ()),
    )
    for label, accessor, trailer in report_rows:
        # Resolve and call the accessor right before printing, so values are
        # fetched one at a time in the listed order.
        print(label, getattr(G, accessor)(), *trailer)
    
# Print the summary statistics for the projection held in G.
projection_stats(G)

## Stream FastRP Embeddings

In [None]:
# Run FastRP in stream mode: the embeddings come back as a result table (one
# row per node) instead of being written to the projection.
result = gds.fastRP.stream(
    G,
    embeddingDimension=64,
    iterationWeights=[0.8, 1, 1, 1],
    randomSeed=42,  # fixed seed so the embeddings are reproducible across runs
)

result.head()

### Clean up Dataframe with Pandas transforms

In [None]:
# Spread each embedding list across its own columns: build a frame keyed by
# the result's row index, then transpose so rows line up with `result`.
embedding_by_row = dict(zip(result.embedding.index, result.embedding.values))
embedding_wide = pd.DataFrame.from_dict(embedding_by_row).T

# Place nodeId next to its embedding columns (concat aligns on the row index).
df = pd.concat([result[['nodeId']], embedding_wide], axis=1)
df.head()

In [None]:
# Fetch every paper's subject; id(n) is aliased to nodeId so it can be joined
# against the embedding rows, and the subject is exposed as `class`.
paper_classes = gds.run_cypher('MATCH (n:Paper) RETURN id(n) as nodeId, n.subject as class')

# Label columns come first in the joined frame, followed by the embeddings.
df = paper_classes.merge(df, on="nodeId")
df.head()

## Loop over desired algorithms and add them to Dataframe

In [None]:
# For each algorithm: stream its scores over the projection, rename the
# generic `score` column to the algorithm's name, and join it on by nodeId.
for algo in ('pageRank', 'betweenness'):
    method = getattr(gds, algo)
    scores = method.stream(G).rename(columns={'score': algo})
    df = df.merge(scores, on="nodeId")

df.head()

## Create Pandas Plot

In [None]:
# Box plot of the betweenness scores. Assigning the result keeps the cell
# from echoing the returned object's repr under the figure.
boxplot = df.boxplot(column=['betweenness'])

## Create Seaborn Plot

In [None]:
# Histogram of the PageRank scores, drawn with a log-scaled axis.
sns.histplot(df['pageRank'], log_scale=True)

## Visualize FastRP embeddings with TSNE plot

In [None]:
def create_tsne_plot(df, random_state=None):
    """Project the embedding columns of ``df`` to 2-D with t-SNE and plot them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``class`` column (used to colour the points) and the
        embedding dimensions as integer-labelled columns. Columns with
        non-integer labels (ids, class, algorithm scores) are ignored.
    random_state : int or None, optional
        Forwarded to ``TSNE``. Pass an integer to get the same layout on
        every run; the default ``None`` leaves the layout unseeded.

    Raises
    ------
    ValueError
        If ``df`` has no integer-labelled columns to project.
    """
    # Select the embedding dimensions by label rather than by position, so
    # every dimension is kept no matter how many other columns surround them.
    embedding_cols = [col for col in df.columns if pd.api.types.is_integer(col)]
    if not embedding_cols:
        raise ValueError("df has no integer-labelled embedding columns to project")

    X_emb = TSNE(n_components=2, random_state=random_state).fit_transform(df[embedding_cols])

    tsne_df = pd.DataFrame(data={
        'x': X_emb[:, 0],
        'y': X_emb[:, 1],
        'label': pd.factorize(df['class'])[0].astype("float32")
    })

    plt.figure(figsize=(16, 10))
    ax = sns.scatterplot(
        x='x', y='y',
        # One colour per distinct label instead of a fixed palette size.
        palette=sns.color_palette('hls', tsne_df['label'].nunique()),
        data=tsne_df,
        hue='label',
        legend=True,
        s=100,
        alpha=0.75
    )
    ax.legend(prop={'size': 10})
    plt.xlabel('X Component', fontsize=16)
    plt.ylabel('Y Component', fontsize=16)
    plt.show()

In [None]:
# Draw the t-SNE scatter plot for the assembled table.
create_tsne_plot(df)

## Split Train & Test Data

In [None]:
# Fit the encoder on the class column, then overwrite that column with the
# encoded labels. `encoder` is kept so labels can be mapped back later.
encoder = LabelEncoder().fit(df['class'])
df['class'] = encoder.transform(df['class'])


In [None]:
# Use the embedding dimensions (the integer-labelled columns) as features.
# Selecting by label rather than by position keeps every dimension no matter
# how many score columns were merged in after the embeddings.
feature_cols = [col for col in df.columns if pd.api.types.is_integer(col)]
X = df[feature_cols]  # Features
y = df['class']  # Labels

# Split dataset into training set and test set (70% training, 30% test).
# The fixed seed keeps the split, and the metrics below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Build Classifier

In [None]:
# Create an XGBoost classifier with 100 estimators, evaluated with
# multi-class log loss ('mlogloss').
# NOTE(review): use_label_encoder=False presumably relies on the labels
# already being integer-encoded - confirm against the installed xgboost.
clf = XGBClassifier(n_estimators=100,
                    use_label_encoder=False,
                    eval_metric='mlogloss')

# Train the model on the training set
clf.fit(X_train,y_train)

# Predict labels for the held-out test set
y_pred=clf.predict(X_test)

## Classifier Metrics

In [None]:
# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix for the test set, normalised over the true labels
# (normalize='true').
# NOTE(review): plot_confusion_matrix has been dropped from newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator - confirm the
# installed version before upgrading.
matrix = plot_confusion_matrix(clf, X_test, y_test, cmap=plt.cm.Blues, normalize='true')

# plt.show() takes no figure/display argument; a single bare call renders the plot.
plt.show()

In [None]:
top_n = 20

# Importance of each feature column as reported by the fitted classifier.
feat_importances = pd.Series(clf.feature_importances_, index=X.columns)

ax = feat_importances.nlargest(top_n).plot(kind='barh',
                                           title=f'Top {top_n} Feature Importances')
# Horizontal bars: the x-axis carries the importance value and the y-axis the
# feature name. Setting the labels on the Axes avoids depending on how a given
# pandas version maps xlabel/ylabel for 'barh'.
ax.set(xlabel='Importance', ylabel='Feature')
plt.show()