In [39]:
pip install neo4j-driver

Note: you may need to restart the kernel to use updated packages.


In [40]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase, basic_auth
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be saved inside the notebook;
# the fallbacks are the original local-development defaults.
db_location = os.environ.get("NEO4J_URI", "bolt://neo4j:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "qwerty")

# For every Person that has both a SMOKES relationship to a Tobacco node and a
# LIVES relationship to a State node, return the state name, the person's
# gender and age, and the tobacco product name.
query = """
        MATCH (p:Person)-[sm:SMOKES]->(t:Tobacco), (p)-[lv:LIVES]->(s:State)
        RETURN s.name as state_name, p.gender as gender, p.age as age, t.name as tobacco
        """


def get_dataset(tx):
    """Run the module-level ``query`` on ``tx`` and return the rows as a DataFrame.

    Args:
        tx: Transaction-like object exposing ``run(query)``; it is supplied by
            the session when this function is used as a transaction function.

    Returns:
        pd.DataFrame: one row per returned record, with columns named after
        the keys reported by the query result.
    """
    result = tx.run(query)
    # Consume the result first, then ask it for its column names.
    rows = [record.values() for record in result]
    return pd.DataFrame(rows, columns=result.keys())

In [41]:
db = GraphDatabase.driver(db_location, auth=basic_auth(username, password))
feature_encoder = OrdinalEncoder()
linear_model = LinearRegression()

# Load the whole dataset in a single read transaction, then close the driver
# so its connections are released even if the query fails. Re-create the
# driver if further queries are ever needed.
try:
    with db.session() as session:
        df = session.read_transaction(get_dataset)
finally:
    db.close()

# Seeded shuffle so the preview shows the same rows on every run.
df.sample(frac=1, random_state=42).head()

Unnamed: 0,state_name,gender,age,tobacco
3856,Florida,Male,23,Cigarette
13677,Pennsylvania,Male,30,Cigarette
14057,Rhode Island,Male,30,Cigarette
14,Alabama,Male,19,Smokeless Tobacco
14329,South Carolina,Female,34,Cigarette


In [42]:
# Features: the three categorical columns, ordinal-encoded by the shared
# encoder (fitted here so it can be reused at prediction time).
categorical_features = df.filter(items=['state_name', 'gender', 'tobacco'])
X_data = feature_encoder.fit_transform(categorical_features)

# Target: the person's age.
y_data = df['age'].values

pd.DataFrame(X_data).head()

Unnamed: 0,0,1,2
0,0.0,1.0,3.0
1,0.0,1.0,3.0
2,0.0,1.0,3.0
3,0.0,1.0,3.0
4,0.0,1.0,3.0


In [43]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.10, random_state=42
)

linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

# "Accuracy for N years" is the share of test predictions whose absolute
# error is strictly below N years, for N = 3, 5, 7, 9.
abs_error = np.abs(y_pred - y_test)
res = {
    f'Accuracy for {years} years': len(y_pred[abs_error < years]) / len(y_pred)
    for years in range(3, 10, 2)
}
res

{'Accuracy for 3 years': 0.4521489971346705,
 'Accuracy for 5 years': 0.6756446991404012,
 'Accuracy for 7 years': 0.828080229226361,
 'Accuracy for 9 years': 0.9083094555873925}

In [44]:
def predict_linear(model: LinearRegression, encoder: OrdinalEncoder, df: pd.DataFrame):
    """Predict with ``model`` from the categorical columns of ``df``.

    Args:
        model: Regressor whose ``predict`` is called on the encoded features.
        encoder: Encoder whose ``transform`` is applied to the selected columns.
        df: Frame providing the 'state_name', 'gender' and 'tobacco' columns.

    Returns:
        Whatever ``model.predict`` returns for the encoded rows.
    """
    features = df.filter(items=['state_name', 'gender', 'tobacco'])
    encoded = encoder.transform(features)
    return model.predict(encoded)

In [45]:
# Single example row: a female smokeless-tobacco user living in Alaska.
sample_person = pd.DataFrame({
    'state_name': ['Alaska'],
    'gender': ['Female'],
    'tobacco': ['Smokeless Tobacco'],
})
predict_linear(linear_model, feature_encoder, sample_person)

array([21.15000994])