In [1]:
pip install neo4j-driver

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase, basic_auth
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OrdinalEncoder

# Connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be written into the notebook;
# the fallbacks are the original local-development values.
db_location = os.environ.get("NEO4J_URI", "bolt://neo4j:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "qwerty")

# Cypher query: for every Person that has both a SMOKES relationship to a
# Tobacco node and a LIVES relationship to a State node, return the state name,
# the person's gender and age, and the tobacco product name.
query = """
        MATCH (p:Person)-[sm:SMOKES]->(t:Tobacco), (p)-[lv:LIVES]->(s:State)
        RETURN s.name as state_name, p.gender as gender, p.age as age, t.name as tobacco
        """


def get_dataset(tx):
    """Run the module-level Cypher ``query`` on ``tx`` and return the result as a DataFrame.

    Parameters
    ----------
    tx
        Transaction-like object exposing ``run(statement)``. The returned result
        must be iterable over records that provide ``values()``, and must
        provide ``keys()`` for the column names.

    Returns
    -------
    pandas.DataFrame
        One row per returned record, with columns named after the result keys.
    """
    result = tx.run(query)
    # Materialise every record first; the column names are read afterwards.
    rows = [record.values() for record in result]
    column_names = result.keys()
    return pd.DataFrame(rows, columns=column_names)

In [3]:
# Open the driver, and create the encoder and the multi-output model that the
# following cells fit and use.
db = GraphDatabase.driver(db_location, auth=basic_auth(username, password))
fe = OrdinalEncoder()
multilinear_model = MultiOutputRegressor(estimator=GradientBoostingRegressor(random_state=42))

# Load the whole dataset inside a read transaction, then release the driver's
# connections: the database is not needed again once the DataFrame is in memory.
try:
    with db.session() as session:
        df = session.read_transaction(get_dataset)
finally:
    db.close()

# Preview up to five random rows without shuffling the entire frame.
df.sample(n=min(5, len(df)))

Unnamed: 0,state_name,gender,age,tobacco
2232,California,Male,19,Smokeless Tobacco
10946,New Mexico,Male,37,Cigarette
3668,District of Columbia,Male,25,Cigarette
4574,Georgia,Male,24,Cigarette
2197,Arkansas,Female,30,Cigarette


In [4]:
# Targets: age (as stored) and ordinal-encoded gender. A dedicated encoder is
# used for the target so that `fe` is fitted exactly once, on the feature columns
# it has to transform again at prediction time. Fitting the same encoder on two
# different column sets would silently discard the first fit and make the result
# depend on statement order.
gender_encoder = OrdinalEncoder()
Y_data = np.column_stack((df['age'].values, gender_encoder.fit_transform(df[['gender']])))

# Features: state name and tobacco product, ordinal-encoded with the shared `fe`.
X_data = fe.fit_transform(df[['state_name', 'tobacco']])

pd.DataFrame(X_data).head()

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [5]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.10,
    random_state=42,
)

# Fit the multi-output model on the training portion (the fitted estimator is displayed).
multilinear_model.fit(X_train, Y_train)

MultiOutputRegressor(estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...te=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
           n_jobs=None)

In [6]:
# Predict both target columns for the held-out feature rows.
y_pred = multilinear_model.predict(X_test)
# Show predictions at positions 1 through 4 (position 0 is skipped by this slice).
y_pred[1:5]

array([[22.17792327,  0.95986881],
       [29.49383602,  0.33568177],
       [22.39524522,  0.95938102],
       [29.49383602,  0.33568177]])

In [7]:
def predict_multilinear(model: MultiOutputRegressor, encoder: OrdinalEncoder, df: pd.DataFrame) -> pd.DataFrame:
    """Predict age and gender for every row of ``df``.

    Parameters
    ----------
    model
        Fitted estimator whose ``predict`` returns two columns per row:
        age and encoded gender, in that order.
    encoder
        Fitted encoder used to transform the ``state_name`` and ``tobacco``
        columns into the model's feature matrix.
    df
        Input frame containing at least ``state_name`` and ``tobacco`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``age`` (prediction rounded to a whole number, kept as float)
        and ``gender`` (``"Male"`` or ``"Female"``), one row per input row.

    Notes
    -----
    Assumes gender was encoded as Female=0 / Male=1 — TODO confirm against the
    categories of the encoder used for the training targets.
    """
    features = encoder.transform(df[['state_name', 'tobacco']])
    raw = np.asarray(model.predict(features), dtype=float)

    # Decide gender by thresholding the raw regression output instead of testing
    # `round(x) == 1`: a regressor can overshoot, and any prediction >= 1.5
    # would round to 2 and be mislabelled as "Female".
    gender = np.where(raw[:, 1] > 0.5, "Male", "Female")

    return pd.DataFrame({"age": raw[:, 0].round(), "gender": gender})

In [8]:
# Example: predicted age and gender for a pipe smoker living in Alaska.
predict_multilinear(
    multilinear_model,
    fe,
    pd.DataFrame({'state_name': ['Alaska'], 'tobacco': ['Pipe']}),
)

Unnamed: 0,age,gender
0,52.0,Male
