In [1]:
pip install neo4j-driver

Collecting neo4j-driver
  Downloading https://files.pythonhosted.org/packages/77/ac/b8b9dbe47062457ec9fa3f57b62198243869b33144218d40a03c7c25c170/neo4j-driver-1.7.2.tar.gz
Collecting neobolt<2,>=1.7.4 (from neo4j-driver)
[?25l  Downloading https://files.pythonhosted.org/packages/ba/02/641c5241db092f75bce1334cb728d3fb48f4dddc5d21401fe94a5ed636ad/neobolt-1.7.4.tar.gz (182kB)
[K    100% |████████████████████████████████| 184kB 1.8MB/s ta 0:00:01
[?25hCollecting neotime<2,>=1.7.1 (from neo4j-driver)
  Downloading https://files.pythonhosted.org/packages/0b/7e/ca368a8d8e288be1352d4e2df35da1e01f8aaffbf526695df71630bcb8a6/neotime-1.7.4.tar.gz
Building wheels for collected packages: neo4j-driver, neobolt, neotime
  Building wheel for neo4j-driver (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/50/b5/17/5972f7821c48b83217a47da99387d1f2d67903357a529e752c
  Building wheel for neobolt (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/

In [2]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase, basic_auth
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Connection settings. Each value can be overridden with an environment
# variable so real credentials never have to be written into the notebook;
# the fallbacks are the local-development values used by this notebook.
db_location = os.environ.get("NEO4J_URI", "bolt://neo4j:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# Development-only default: set NEO4J_PASSWORD for any non-local database.
password = os.environ.get("NEO4J_PASSWORD", "qwerty")

# Cypher query: for every Person that has both a SMOKES relationship to a
# Tobacco node and a LIVES relationship to a State node, return the state
# name, the person's gender and age, and the tobacco name.
query = """
        MATCH (p:Person)-[sm:SMOKES]->(t:Tobacco), (p)-[lv:LIVES]->(s:State)
        RETURN s.name as state_name, p.gender as gender, p.age as age, t.name as tobacco
        """


def get_dataset(tx):
    """Run the module-level ``query`` in a transaction and return the rows as a DataFrame.

    Args:
        tx: Transaction-like object exposing ``run(statement)``; the result
            must be iterable over records that provide ``values()`` and must
            expose ``keys()`` with the column names.

    Returns:
        pd.DataFrame: one row per returned record, with columns named after
        the result keys.
    """
    result = tx.run(query)
    rows = []
    for record in result:
        rows.append(record.values())
    # Column names are read after the records have been consumed, as before.
    return pd.DataFrame(rows, columns=result.keys())


def is_tobacco_of(tobacco: str):
    """Build a predicate that tests a value for equality with ``tobacco``.

    Args:
        tobacco: The tobacco name to compare against.

    Returns:
        A one-argument callable returning ``tobacco == value`` for the value
        it is given (suitable for ``Series.apply``).
    """
    def matches(candidate: str):
        # Keep ``tobacco`` on the left-hand side of the comparison.
        is_same = tobacco == candidate
        return is_same

    return matches


In [3]:
# Open a Bolt driver for the configured database and create the encoder that
# later cells fit on the categorical columns.
db = GraphDatabase.driver(db_location, auth=basic_auth(username, password))
feature_encoder = OrdinalEncoder()

# The driver is only needed for this single read, so it is closed as soon as
# the data has been fetched (even if the query fails) instead of leaving its
# connections open for the lifetime of the kernel.
try:
    with db.session() as session:
        df = session.read_transaction(get_dataset)
finally:
    db.close()

# Shuffle the rows and preview the first five.
df.sample(frac=1).head()

Unnamed: 0,state_name,gender,age,tobacco
11890,North Carolina,Female,29,Cigarette
5435,Kansas,Male,25,Cigarette
17342,Wyoming,Male,23,Cigarette
7848,Maryland,Male,37,Cigar
14783,South Dakota,Male,22,Cigarette


In [4]:
        y_data = df['tobacco'].apply(is_tobacco_of('Cigar'))
        X_data = np.column_stack((feature_encoder.fit_transform(df.filter(items=['state_name', 'gender'])), df['age'].values))
X_data

array([[ 0.,  0., 27.],
       [ 0.,  0., 29.],
       [ 0.,  0., 26.],
       ...,
       [50.,  1., 58.],
       [50.,  1., 52.],
       [50.,  1., 56.]])

In [5]:
        X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.10, random_state=42)
        logistic_model = LogisticRegression(solver='lbfgs')
        logistic_model.fit(X_train, y_train)
        y_pred = logistic_model.predict(X_test)
        res = len(y_pred[y_pred == y_test]) / len(y_pred)
res

0.8774815655133296

In [6]:
def predict_logistic(model: LogisticRegression, encoder: OrdinalEncoder, df: pd.DataFrame):
    """Predict, for each row of ``df``, whether the person smokes the modelled tobacco.

    Args:
        model: Fitted classifier; its ``predict`` is called on the feature matrix.
        encoder: Fitted encoder; its ``transform`` is applied to the
            ``state_name`` and ``gender`` columns.
        df: Frame providing ``state_name``, ``gender`` and ``age`` columns.

    Returns:
        pd.DataFrame: a single column named
        "does person smoke specified tobacco" holding the model's predictions,
        one row per input row.
    """
    encoded_categories = encoder.transform(df[['state_name', 'gender']])
    # Same column layout as at training time: encoded categoricals, then age.
    features = np.column_stack((encoded_categories, df['age'].values))
    predictions = model.predict(features)
    return pd.DataFrame(data=predictions, columns=["does person smoke specified tobacco"])

In [7]:
# Example: score two hypothetical people with the fitted model and encoder,
# a 33-year-old woman in Alabama and a 55-year-old man in Georgia.
predict_logistic(
    logistic_model,
    feature_encoder,
    pd.DataFrame({
        'state_name': ['Alabama', 'Georgia'],
        'gender': ['Female', 'Male'],
        'age': [33, 55],
    }),
)

Unnamed: 0,does person smoke specified tobacco
0,False
1,True
