# Rob Sandbox
- Tests and trials developed by Rob

# Imports

#### Standard library imports

#### Local application imports

#### Third party imports

In [1]:
import pandas as pd

import numpy as np

import networkx as nx

import scipy.sparse
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt

---

# Reading data 

## Reading data from csv

In [2]:
# Load the co-offending records from the CSV file in the working directory
df = pd.read_csv("Cooffending.csv")

In [3]:
# Display the loaded DataFrame
df

Unnamed: 0,OffenderIdentifier,OffenderGender,CrimeIdentifier,CrimeType1,CrimeType2,CrimeType3,CrimeType4,Municipality,CrimeLocation,NumberYouthOffenders,NumberAdultOffenders,CrimeDate,CrimeYear
0,1,F,1085034,3530.0,,,,58227,2.0,0,1,12/17/2005,2005
1,2,F,1431379,1430.0,,,,94068,5.0,0,1,04/23/2008,2008
2,4,M,167174,1430.0,21702.0,,,49058,2.0,0,1,03/06/2008,2008
3,5,M,1179096,1420.0,,,,65005,71.0,0,1,08/21/2008,2008
4,17,M,1270690,1625.0,,,,23027,,0,3,04/30/2003,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1280454,670532,F,1112491,1330.0,,,,58227,34.0,0,1,03/26/2009,2009
1280455,670533,M,78081,21702.0,,,,46075,10.0,0,4,06/29/2009,2009
1280456,670534,M,187904,21201.0,21702.0,,,31015,1.0,0,4,03/16/2009,2009
1280457,670535,M,523837,21704.0,,,,98806,1.0,0,1,10/18/2009,2009


## Initial data exploration

### CrimeIdentifier

#### What fraction of CrimeIdentifier entries are repeated?

In [None]:
# Fraction of non-null CrimeIdentifier entries that repeat another entry.
# nunique() ignores NaN and count() is the number of non-null entries;
# len(series.notnull()) would just be the total number of rows, because
# notnull() returns a boolean mask as long as the series itself.
1 - df["CrimeIdentifier"].nunique() / df["CrimeIdentifier"].count()

#### Which CrimeIdentifiers appear in more than 10 rows?

In [None]:
# Number of rows recorded for each CrimeIdentifier.
# rename_axis(None) + to_frame(name=...) pin the layout (unnamed index, one
# "CrimeIdentifier" column holding the counts) whatever the pandas version:
# pandas >= 2.0 would otherwise name the column "count" and the lookups
# below would raise KeyError.
dfx = (
    df["CrimeIdentifier"]
    .value_counts()
    .rename_axis(None)
    .to_frame(name="CrimeIdentifier")
)

# Keep only the identifiers that appear in more than 10 rows
dfx = dfx[dfx["CrimeIdentifier"] > 10]

# Frequency of each row count (not the last expression of the cell, so the
# notebook does not render it)
dfx.value_counts()

## Histogram
# One bin per distinct row count
plt.hist(dfx["CrimeIdentifier"], bins=dfx["CrimeIdentifier"].nunique())

In [None]:
# Show every row recorded under one reference CrimeIdentifier
ref_val = 23610
df.loc[df["CrimeIdentifier"].eq(ref_val)]

### CrimeTypeX

#### How many unique CrimeType1 are there?

In [None]:
# Number of distinct CrimeType1 codes; nunique() skips NaN, which
# len(unique()) would count as one extra "type"
df["CrimeType1"].nunique()

In [None]:
# Number of distinct CrimeType2 codes; nunique() skips NaN, which
# len(unique()) would count as one extra "type"
df["CrimeType2"].nunique()

In [None]:
# Number of distinct CrimeType3 codes; nunique() skips NaN, which
# len(unique()) would count as one extra "type"
df["CrimeType3"].nunique()

In [None]:
# Number of distinct CrimeType4 codes; nunique() skips NaN, which
# len(unique()) would count as one extra "type"
df["CrimeType4"].nunique()

---

# Analysis 1: Building matrix relating *Crime type* and *Municipality*

In [None]:
# Work on a copy so the loaded df is left unchanged by this analysis
dfx = df.copy()

## Defining relevant columns

In [None]:
# Columns this analysis relates to each other; everything else is dropped
rc = ["CrimeType1", "Municipality"]
dfx = dfx[rc]

## Cleaning and preparing original data

### Eliminating rows with null values

In [None]:
# Print per-column non-null counts and dtypes to see where nulls occur
dfx.info()

In [None]:
# Keep only the rows that have a CrimeType1 value
dfx = dfx.dropna(subset=["CrimeType1"])

### Reindexing and reformatting

In [None]:
# Replace each column's values by dense integer codes 0..k-1 (in order of
# first appearance) so they can be used directly as matrix indices
for col in list(dfx.columns):
    codes, _ = pd.factorize(dfx[col].astype("int64"))
    dfx[col] = codes

### Simplified matrix

In [None]:
# Three-column integer array: a constant 1, the CrimeType1 code and the
# Municipality code of every row
dfx = np.column_stack(
    (
        np.ones(len(dfx), dtype=int),
        dfx["CrimeType1"].to_numpy(),
        dfx["Municipality"].to_numpy(),
    )
)

dfx.shape

### Data as sparse matrix

In [None]:
# Sparse CrimeType1 x Municipality matrix built from (data, (row, col)) triplets.
# The data argument has to be the per-row vector of ones (one value per
# (row, col) pair), not its scalar sum, which the constructor cannot pair with
# the index arrays. csr_matrix adds up duplicate (row, col) pairs, so each
# cell ends up holding the number of rows with that combination.
crime_matrix = csr_matrix(
    (
        dfx[:, 0],
        (dfx[:, 1], dfx[:, 2]),
    ),
    # Codes are 0-based, so the largest code + 1 is the size of each axis
    shape=(
        dfx[:, 1].max() + 1,
        dfx[:, 2].max() + 1,
    ),
)

### Adjacency matrix

In [None]:
# Matrix product of crime_matrix with its own transpose
A = crime_matrix @ crime_matrix.T

---

# Analysis 2: Relating *OffenderId* and *Municipality*

In [4]:
# Work on a copy so the loaded df is left unchanged by this analysis
dfx = df.copy()

## Defining relevant columns

In [5]:
# Columns this analysis relates to each other; everything else is dropped
rc = ["OffenderIdentifier", "Municipality"]
dfx = dfx[rc]

## Cleaning and preparing original data

### Checking for null values

In [None]:
# Print per-column non-null counts and dtypes to see where nulls occur
dfx.info()

### Eliminating duplicated rows

In [6]:
# Keep the first occurrence of every (OffenderIdentifier, Municipality) pair
dfx = dfx.loc[~dfx.duplicated()]

### Reindexing and reformatting

In [7]:
# Replace each column's values by dense integer codes 0..k-1 (in order of
# first appearance) so they can be used directly as matrix indices
for col in list(dfx.columns):
    codes, _ = pd.factorize(dfx[col].astype("int64"))
    dfx[col] = codes

### To numpy array

In [None]:
# Three-column integer array: a constant 1 followed by the codes of the two
# relevant columns (rc[0], rc[1]) of every row
dfx = np.column_stack(
    [np.ones(len(dfx), dtype=int)]
    + [dfx[name].to_numpy() for name in rc[:2]]
)

dfx.shape

### Data as sparse matrix

In [None]:
# Sparse matrix from (data, (row, col)) triplets: column 0 holds the values,
# columns 1 and 2 the row and column codes. Codes are 0-based, so each axis
# has (largest code + 1) entries.
crime_matrix = csr_matrix(
    (dfx[:, 0], (dfx[:, 1], dfx[:, 2])),
    shape=tuple(dfx[:, 1:].max(axis=0) + 1),
)

### Adjacency matrix

In [None]:
# Matrix product of crime_matrix with its own transpose
A = crime_matrix @ crime_matrix.T

#### Cleaning adjacency matrix

In [None]:
## Eliminating self loops
# Set every entry on the main diagonal to 0 (modifies A in place)
A.setdiag(0)

## Eliminating zeros
# Drop the zero-valued entries that are still stored explicitly, including
# the diagonal entries zeroed just above (modifies A in place)
A.eliminate_zeros()

## Working with graph object

---

---
---