In [1]:
!pip install opendatasets




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import opendatasets as od

In [3]:
# Kaggle page of the "crimes-in-chicago" dataset; the `select` query parameter
# names the 2012-2017 CSV.
# NOTE(review): the directory listing further down shows all four yearly CSVs,
# so `select` does not appear to restrict what ends up on disk.
dataset = "https://www.kaggle.com/datasets/currie32/crimes-in-chicago?select=Chicago_Crimes_2012_to_2017.csv"

In [4]:
# force=False reuses files that are already on disk instead of downloading
# again (the output below reports "Skipping, found downloaded files").
# NOTE(review): a first-time download presumably needs Kaggle credentials - confirm.
od.download(dataset, force=False)

Skipping, found downloaded files in ".\crimes-in-chicago" (use force=True to force download)


In [5]:
import os

In [6]:
# Build the path with os.path.join so the separator matches the host OS; the
# literal '.\\crimes-in-chicago' only resolves to the folder on Windows.
data_dir = os.path.join('.', 'crimes-in-chicago')

In [7]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and platform.
sorted(os.listdir(data_dir))

['Chicago_Crimes_2001_to_2004.csv',
 'Chicago_Crimes_2005_to_2007.csv',
 'Chicago_Crimes_2008_to_2011.csv',
 'Chicago_Crimes_2012_to_2017.csv']

In [8]:
import pandas as pd 

In [9]:
# Load the 2012-2017 extract; the path is relative to the current working
# directory and is written out literally rather than built from `data_dir`.
# NOTE(review): the 'Unnamed: 0' column in the dtype listing below is presumably
# a row index that was saved into the CSV - confirm before relying on it.
df = pd.read_csv('crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv')

In [10]:
# Drop incomplete rows (dropna() with its default arguments); the null-count
# check in a later cell reports 0 for every column afterwards.
# `df` is rebound here, so the unfiltered frame is only recoverable by
# re-reading the CSV.
df = df.dropna()

In [11]:
# Report how many rows and columns the current frame has.
dataset_size = df.shape
num_columns = len(df.columns)
print("Dataset size (rows, columns):", dataset_size)
print("Number of columns:", num_columns)

Dataset size (rows, columns): (1418365, 23)
Number of columns: 23


In [12]:
# Count the missing values in each column (isna is the same check as isnull).
null_values = df.isna().sum()
print(null_values)

Unnamed: 0              0
ID                      0
Case Number             0
Date                    0
Block                   0
IUCR                    0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
Beat                    0
District                0
Ward                    0
Community Area          0
FBI Code                0
X Coordinate            0
Y Coordinate            0
Year                    0
Updated On              0
Latitude                0
Longitude               0
Location                0
dtype: int64


In [13]:
# Show the dtype pandas inferred for every column.
print("Data types of each column:")
data_types = df.dtypes
print(data_types)

Data types of each column:
Unnamed: 0                int64
ID                        int64
Case Number              object
Date                     object
Block                    object
IUCR                     object
Primary Type             object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
Beat                      int64
District                float64
Ward                    float64
Community Area          float64
FBI Code                 object
X Coordinate            float64
Y Coordinate            float64
Year                      int64
Updated On               object
Latitude                float64
Longitude               float64
Location                 object
dtype: object


In [14]:
def get_last_n_observations(df, n=100000):
    """Return the trailing ``n`` rows of ``df``.

    Args:
        df: DataFrame to take rows from; it is not modified.
        n: Number of rows counted from the end, passed straight to
            ``DataFrame.tail``.

    Returns:
        The result of ``df.tail(n)``.
    """
    trailing_rows = df.tail(n)
    return trailing_rows
# Keep only the last 100000 rows of the cleaned frame and report the new shape.
df = get_last_n_observations(df, n=100000)
dataset_size = df.shape
num_columns = dataset_size[1]
print("Dataset size (rows, columns):", dataset_size)
print("Number of columns:", num_columns)

Dataset size (rows, columns): (100000, 23)
Number of columns: 23


In [15]:
pip install neo4j





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from neo4j import GraphDatabase

In [None]:
def connect_to_neo4j(uri, user, password):
    """Create a Neo4j driver for ``uri``.

    Args:
        uri: Connection URI handed to ``GraphDatabase.driver``.
        user: User name, first element of the ``auth`` pair.
        password: Password, second element of the ``auth`` pair.

    Returns:
        Whatever ``GraphDatabase.driver`` returns; the caller is responsible
        for closing it.
    """
    credentials = (user, password)
    return GraphDatabase.driver(uri, auth=credentials)
def load_data_to_neo4j(driver, df, batch_size=1000):
    """Create one ``Crime`` node per row of ``df``, sending the rows in batches.

    Rows are grouped into lists of at most ``batch_size`` and every list is
    sent as a single ``UNWIND`` statement, so the server sees
    ``ceil(len(df) / batch_size)`` statements instead of one per row.  Nodes
    are matched on ``ID`` with ``MERGE``; the other properties are written only
    when the node is created, so nodes that already exist are left untouched.

    Args:
        driver: Object whose ``session()`` returns a context manager exposing
            ``run(query, parameters)`` (the neo4j driver in this notebook).
        df: DataFrame holding the crime columns listed in ``columns`` below.
        batch_size: Maximum number of rows per statement; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a row lacks one of the expected columns.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # DataFrame columns to store; the node property name is the column name
    # with spaces replaced by underscores (e.g. "Case Number" -> Case_Number).
    columns = (
        "ID", "Case Number", "Date", "Block", "IUCR", "Primary Type",
        "Description", "Location Description", "Arrest", "Domestic", "Beat",
        "District", "Ward", "Community Area", "FBI Code", "X Coordinate",
        "Y Coordinate", "Year", "Updated On", "Latitude", "Longitude",
        "Location",
    )
    properties = {column.replace(" ", "_"): column for column in columns}
    # These two flags are stored as the text "True"/"False", not as booleans.
    text_flags = ("Arrest", "Domestic")

    # ID is the MERGE key, so it is not repeated in the SET clause.  Property
    # names come from the fixed tuple above, never from user input.
    set_clause = ", ".join(
        f"crime.{prop} = row.{prop}" for prop in properties if prop != "ID"
    )
    query = (
        "UNWIND $rows AS row "
        "MERGE (crime:Crime {ID: row.ID}) "
        f"ON CREATE SET {set_clause}"
    )

    with driver.session() as session:
        batch = []
        for _, row in df.iterrows():
            record = {prop: row[column] for prop, column in properties.items()}
            for flag in text_flags:
                record[flag] = str(record[flag])
            batch.append(record)

            if len(batch) >= batch_size:
                session.run(query, {"rows": batch})
                # Start a fresh list instead of clearing the one just sent.
                batch = []

        # Send whatever is left over after the last full batch.
        if batch:
            session.run(query, {"rows": batch})

import os
from getpass import getpass

# Connection settings come from the environment so that no credential is
# stored in the notebook. The URI and user fall back to the local defaults;
# the password is prompted for when NEO4J_PASSWORD is not set.
uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = connect_to_neo4j(uri, user, password)
try:
    load_data_to_neo4j(driver, df)
finally:
    # Close the driver even when the load raises, so it is not left open.
    driver.close()