**Install the required Python packages**

```bash
pip install -r requirements.txt
```


**Dataset**

The dataset is a CSV file containing user behavior data. It includes the following columns:

- User ID: ID of the user
- Age: Age of the user
- Gender: Gender of the user
- App Usage Time (min/day): Daily app usage in minutes
- Screen On Time (hours/day): Daily screen time in hours
- Battery Drain (mAh/day): Daily battery drain in mAh
- Number of Apps Installed: Number of installed apps
- Data Usage (MB/day): Daily data usage in MB
- User Behavior Class: User behavior classification

https://www.kaggle.com/datasets/valakhorasani/mobile-device-usage-and-user-behavior-dataset?select=user_behavior_dataset.CSV

In [15]:
from typing import List
import pandas as pd
from neo4j import GraphDatabase
from langchain_openai import ChatOpenAI
from langchain_community.graphs.neo4j_graph import Neo4jGraph
from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [16]:
# Neo4j credentials
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [17]:
# Initialize Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
database = "mobileusagegraphrag"  # Specify custom database name

# Create the database if it doesn't exist
try:
    with driver.session(database="system") as session:
        # Check if database exists
        result = session.run("SHOW DATABASES")
        databases = [record["name"] for record in result]

        if database not in databases:
            print(f"Creating database: {database}")
            session.run(f"CREATE DATABASE {database}")
            print(f"Database {database} created successfully")
        else:
            print(f"Database {database} already exists")

except Exception as e:
    print(f"Error creating database: {str(e)}")
    raise

# Test connection
try:
    driver.verify_connectivity()
    print("Successfully connected to Neo4j!")
except Exception as e:
    print(f"Failed to connect to Neo4j: {str(e)}")
    raise

# Initialize Neo4jGraph with custom database
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=database,
)

# Initialize LLM
llm = ChatOpenAI(
    temperature=0, model_name="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY
)


Database mobileusagegraphrag already exists
Successfully connected to Neo4j!


In [18]:
def run_query(query: str, params: dict = None):
        """Custom query runner for Neo4jGraph"""
        with driver.session(database=database) as session:
            result = session.run(query, params or {})
            return [dict(record) for record in result]

In [19]:
def create_graph_schema(session):
        """Create constraints and indexes for the graph"""
        try:
            constraints = [
                "CREATE CONSTRAINT IF NOT EXISTS FOR (u:User) REQUIRE u.user_id IS UNIQUE",
                "CREATE CONSTRAINT IF NOT EXISTS FOR (d:Device) REQUIRE d.device_id IS UNIQUE",
            ]

            indexes = [
                "CREATE INDEX IF NOT EXISTS FOR (a:App) ON (a.name)",
                "CREATE INDEX IF NOT EXISTS FOR (l:Location) ON (l.type)",
            ]

            for constraint in constraints:
                session.run(constraint)

            for index in indexes:
                session.run(index)

            print("Successfully created schema constraints and indexes")
        except Exception as e:
            print(f"Error creating schema: {str(e)}")
            raise

In [20]:
def load_data_to_neo4j():
        """Load the mobile usage data into Neo4j"""
        try:
            # Read the CSV file
            df = pd.read_csv("user_behavior_dataset.csv")
            print(f"Successfully loaded dataset with {len(df)} rows")

            with driver.session(
                database=database
            ) as session:  # Use custom database
                # Clear existing data
                session.run("MATCH (n) DETACH DELETE n")

                # Create indexes and constraints
                session.run(
                    "CREATE CONSTRAINT user_id IF NOT EXISTS FOR (u:User) REQUIRE u.id IS UNIQUE"
                )
                session.run(
                    "CREATE INDEX device_model IF NOT EXISTS FOR (d:Device) ON (d.model)"
                )

                # Batch process the data
                batch_size = 100
                total_rows = len(df)

                for i in range(0, total_rows, batch_size):
                    batch = df.iloc[i : min(i + batch_size, total_rows)]

                    for _, row in batch.iterrows():
                        # Create User node with properties
                        cypher_query = """
                        MERGE (u:User {id: $user_id})
                        SET 
                            u.age = $age,
                            u.gender = $gender,
                            u.app_usage_time = $app_usage_time,
                            u.screen_time = $screen_time,
                            u.battery_drain = $battery_drain,
                            u.apps_installed = $apps_installed,
                            u.data_usage = $data_usage,
                            u.behavior_class = $behavior_class
                        WITH u
                        MERGE (d:Device {model: $device_model})
                        SET d.os = $os
                        MERGE (u)-[:USES]->(d)
                        """

                        session.run(
                            cypher_query,
                            user_id=int(row["User ID"]),
                            age=int(row["Age"]),
                            gender=row["Gender"],
                            app_usage_time=float(row["App Usage Time (min/day)"]),
                            screen_time=float(row["Screen On Time (hours/day)"]),
                            battery_drain=float(row["Battery Drain (mAh/day)"]),
                            apps_installed=int(row["Number of Apps Installed"]),
                            data_usage=float(row["Data Usage (MB/day)"]),
                            behavior_class=int(row["User Behavior Class"]),
                            device_model=row["Device Model"],
                            os=row["Operating System"],
                        )

                    print(
                        f"Processed {min(i+batch_size, total_rows)}/{total_rows} rows"
                    )

                print("Successfully loaded all data into Neo4j")

        except Exception as e:
            print(f"Error loading data into Neo4j: {str(e)}")
            raise

In [21]:
def get_schema() -> str:
        """Custom schema getter"""
        return """
        Node properties and relationships in the graph:
        
        User properties:
        - id: ID of the user
        - age: User's age
        - gender: User's gender
        - app_usage_time: Daily app usage in minutes
        - screen_time: Daily screen time in hours
        - battery_drain: Daily battery drain in mAh
        - apps_installed: Number of installed apps
        - data_usage: Daily data usage in MB
        - behavior_class: User behavior classification
        
        Device properties:
        - model: Device model name
        - os: Operating system
        
        Relationships:
        - (User)-[:USES]->(Device): Shows which device a user uses
        """

In [22]:
def setup_qa_chain():
        """Set up the question-answering chain"""
        try:
            qa_chain = GraphCypherQAChain.from_llm(
                llm=llm,
                graph=graph,
                verbose=True,
                allow_dangerous_requests=True,
                top_k=10,  # Return more results
            )
            print("Successfully set up QA chain")
            return qa_chain
        except Exception as e:
            print(f"Error setting up QA chain: {str(e)}")
            raise

In [23]:
def ask_question(qa_chain, question: str) -> str:
        """Ask a question about the mobile usage data"""
        try:
            return qa_chain.invoke(question)
        except Exception as e:
            print(f"Error processing question: {str(e)}")
            return f"Error: {str(e)}"

In [24]:
def cleanup_database():
        """Clean up the database when done"""
        try:
            with driver.session(database="system") as session:
                # Stop the database before dropping
                session.run(f"STOP DATABASE {database}")
                # Drop the database
                session.run(f"DROP DATABASE {database} IF EXISTS")
                print(f"Database {database} cleaned up successfully")
        except Exception as e:
            print(f"Error cleaning up database: {str(e)}")


In [25]:
def close():
        """Close the Neo4j connection"""
        try:
            # cleanup_database()  # Clean up the database
            driver.close()
            print("Successfully closed Neo4j connection")
        except Exception as e:
            print(f"Error closing Neo4j connection: {str(e)}")

In [26]:
try:

    # Load data into Neo4j
    print("\nLoading data into Neo4j...")
    load_data_to_neo4j()

    # Setup the QA chain
    print("\nSetting up QA chain...")
    qa_chain = setup_qa_chain()

    # Example questions specific to the dataset
    questions = [
        "How many users uses Android?",
        "How many users uses iOS?",
        "What is average app usage time for Andriod users?",
        "What is average app usage time for iOS users?",
        "What is the count of user uses Andriod based on their age?",
        "What is the count of user based on Operating System and Device Model?",
    ]

    # Ask questions
    print("\nProcessing questions...")
    for question in questions:
        print(f"\nQuestion: {question}")
        answer = ask_question(qa_chain,question)
        print(f"Answer: {answer}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
        print("\nClosing connections...")
        close()


Loading data into Neo4j...
Successfully loaded dataset with 700 rows
Processed 100/700 rows
Processed 200/700 rows
Processed 300/700 rows
Processed 400/700 rows
Processed 500/700 rows
Processed 600/700 rows
Processed 700/700 rows
Successfully loaded all data into Neo4j

Setting up QA chain...
Successfully set up QA chain

Processing questions...

Question: How many users uses Android?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:User)-[:USES]->(d:Device {os: 'Android'})
RETURN COUNT(*) as android_users;[0m
Full Context:
[32;1m[1;3m[{'android_users': 554}][0m

[1m> Finished chain.[0m
Answer: {'query': 'How many users uses Android?', 'result': '554 users use Android.'}

Question: How many users uses iOS?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (:User)-[:USES]->(d:Device)
WHERE d.os = 'iOS'
RETURN COUNT(*) as users_using_iOS;[0m
Full Context:
[32;1m[1;3m[{'users_using_iOS': 146}][0m