# Data Science Project

* Name: Author Name
* Email:


## TABLE OF CONTENTS 


- **[Introduction](#INTRODUCTION)**<br>
- **[OBTAIN](#OBTAIN)**<br>
- **[SCRUB](#SCRUB)**<br>
- **[EXPLORE](#EXPLORE)**<br>
- **[MODEL](#MODEL)**<br>
- **[iNTERPRET](#iNTERPRET)**<br>
- **[Conclusions/Recommendations](#CONCLUSIONS-&-RECOMMENDATIONS)**<br>
___

# INTRODUCTION

> Explain the point of your project and what question you are trying to answer with your modeling.



In [None]:
# Importing packages
import pandas as pd
from pandasql import sqldf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import gzip
import shutil
import os
import sqlite3
import db_to_sqlite
from sqlite3 import Error
import csv
from pathlib import Path
import subprocess
import io
from icecream import ic
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)


%matplotlib inline

In [None]:
# This function is needed to easily display the dataframe from a csv file with all the columns names

def display_csvfileDF(file_name, folder):
    """Load a csv file from the data folder and return it as a dataframe.

    Arguments:
    file_name: name of the csv file, str
    folder: name of the sub-folder of the data folder the file resides in;
            it is glued to the file name as-is, so it has to end with a slash, str

    Returns:
    a dataframe with the file contents, the first row of the file is used as the header
    """
    csv_path = f'data/{folder}{file_name}'
    return pd.read_csv(csv_path, header=0, encoding='UTF-8')

def table_query(q):
    """Execute a query on the global cursor `cur` and return the result as a dataframe.

    Argument:
    q: a query statement, str

    Returns:
    a dataframe whose columns are named after the columns of the result.
    A query that matches no rows gives an empty dataframe that still carries
    the column names; a statement that produces no result set at all gives
    an empty dataframe without columns.
    """
    cur.execute(q)
    # cur.description is None for statements that do not return rows
    column_names = [column[0] for column in cur.description or ()]
    # Naming the columns while building the dataframe keeps empty results working:
    # assigning df.columns afterwards fails with a length mismatch when no rows came back
    return pd.DataFrame(cur.fetchall(), columns=column_names)

def import_data_to_tables(db_path_name, list_of_files, replace_dir_name):
    """Import csv files into an sqlite db by running the sqlite3 command line tool.

    Each file is loaded with `.import --skip 1` in csv mode into a table named
    after the file: the entry with '.csv' and replace_dir_name removed.

    Arguments:
    db_path_name: full path to a database file from the current repo, str
    list_of_files: list of files to import (with a full path from the current repo)
    replace_dir_name: a name of the directory where csv files reside, full path from the current repo, str;
    example 'dir/dir1/'

    Returns:
    None. A warning is issued for every file whose import exits with a non-zero
    code or writes to stderr; the remaining files are still processed.
    """
    # The file imports warnings at the top; the local import keeps this function self-contained
    import warnings

    db_name = Path(db_path_name).resolve()

    for entry in list_of_files:
        csv_file = Path(entry).resolve()
        table_name = entry.replace('.csv', '').replace(replace_dir_name, '')
        result = subprocess.run(['sqlite3',
                                 str(db_name),
                                 '-cmd',
                                 '.mode csv',
                                 # backslashes are doubled so that Windows paths survive the sqlite3 shell
                                 '.import --skip 1 ' + str(csv_file).replace('\\', '\\\\')
                                 + ' ' + table_name],
                                capture_output=True)
        # The output is captured, so without this check a failed import would go unnoticed
        if result.returncode != 0 or result.stderr:
            details = result.stderr.decode(errors='replace').strip()
            warnings.warn('Import of ' + entry + ' into ' + table_name
                          + ' reported a problem (exit code ' + str(result.returncode) + '): '
                          + details, RuntimeWarning)
    return

def create_filelist(dir_name,n=2):
    """Create a list of file names in the directory, each prefixed with the directory name.

    Arguments:
    dir_name: the name of the directory; it is glued to each file name as-is,
              so it has to end with a path separator, str
    n: the number of leading entries to skip in case there are invisible files in the directory, default is 2

    Returns:
    a list of dir_name + file name strings in sorted order, without the first n entries
    """
    # os.listdir returns the entries in arbitrary order, so they are sorted first:
    # that makes the skipped entries predictable (names starting with a dot sort
    # ahead of names starting with a letter or a digit)
    entries = sorted(os.listdir(dir_name))
    return [dir_name + name for name in entries[n:]]

def copy_files(file_list, dir_out, dir_in):
    """Copy every file from a list from one folder into another, keeping the file names.

    Arguments:
    file_list: names of the files to copy
    dir_out: source folder, glued to each file name as-is, str
    dir_in:  destination folder, glued to each file name as-is, str
    """
    for name in file_list:
        source = dir_out + name
        destination = dir_in + name
        shutil.copyfile(source, destination)
        
def create_new_table(old_table, new_table, list_of_columns_to_leave, drop_rename=False):
    """Create a new table with select columns only, using the global cursor `cur`.

    The reason to make this function is that sqlite3 below 3.35 does not support
    the DROP COLUMN operation.

    Arguments:
    old_table: the source table, str
    new_table: the destination table, str
    list_of_columns_to_leave: a list of column names as strings
    drop_rename: rename flag, if true, the old table is dropped and the new one is renamed with the old one's name.
                default set to False

    Returns:
    a dataframe with the full contents of the resulting table
    """
    columns = ', '.join(list_of_columns_to_leave)
    cur.execute(f'CREATE TABLE {new_table} AS SELECT {columns} FROM {old_table}')

    # Name under which the trimmed table can be found once the function is done
    result_table = new_table
    if drop_rename:
        cur.execute(f'DROP TABLE {old_table}')
        cur.execute(f'ALTER TABLE {new_table} RENAME TO {old_table}')
        result_table = old_table

    return table_query(f'SELECT * FROM {result_table}')

def add_update_clmn(tbl_to_updt, tbl_to_use, clmn_to_add, clmn_tbl1_to_use, clmn_to_join):
    """Add a new column to a table and fill it with values looked up in a reference table.

    Arguments:
    tbl_to_updt: table to update, str
    tbl_to_use: reference table, str
    clmn_to_add: new column, str
    clmn_tbl1_to_use: column in tbl_to_use that supplies the values for the new column, str
    clmn_to_join: column present in both tables that is used to match their rows, str

    Returns:
    a dataframe with the full contents of the updated table
    """
    add_column_sql = f'ALTER TABLE {tbl_to_updt} ADD COLUMN {clmn_to_add}'
    cur.execute(add_column_sql)

    # Correlated subquery: for every row of the updated table the value is taken
    # from the reference row with the same clmn_to_join value
    lookup_sql = (f'SELECT {clmn_tbl1_to_use} FROM {tbl_to_use} '
                  f'WHERE {tbl_to_use}.{clmn_to_join}={tbl_to_updt}.{clmn_to_join}')
    cur.execute(f'UPDATE {tbl_to_updt} SET {clmn_to_add}=({lookup_sql})')

    return table_query(f'SELECT * FROM {tbl_to_updt}')

def update_value(table, column, old_value, new_value):
    """Replace one value with another in a column of a table.

    Arguments:
    table: table to update, string
    column: column to update, string
    old_value: value to look for; it is pasted into the SQL statement as-is,
               so a text value needs its own single quotes inside the string
    new_value: value to write; same quoting rule as for old_value
    Example: update_value('victim_main_tmp', 'sex_code', "'F'", "'Female'")

    Returns:
    a dataframe with the full contents of the updated table
    """
    update_sql = f'UPDATE {table} SET {column}={new_value} WHERE {column}={old_value}'
    cur.execute(update_sql)
    return table_query(f'SELECT * FROM {table}')

def remove_dups(old_table, new_table, conn, drop_rename=False):
    """Remove duplicated rows from a table by writing the unique rows into a new table.

    Arguments:
    old_table: the source table, str
    new_table: the destination table, str
    conn: database connection pandas uses to write the new table
          (presumably the connection the global cursor `cur` belongs to)
    drop_rename: rename flag, if true, the old table is dropped and the new one is renamed with the old one's name.
    default set to False

    Returns:
    a dataframe with the full contents of the resulting table
    """
    unique_rows = table_query(f'SELECT * from {old_table}').drop_duplicates()
    # index=False: otherwise pandas stores the dataframe index as an extra
    # "index" column that the source table never had
    unique_rows.to_sql(name=new_table, con=conn, index=False)

    # Name under which the de-duplicated table can be found once the function is done
    result_table = new_table
    if drop_rename:
        cur.execute(f'DROP TABLE {old_table}')
        cur.execute(f'ALTER TABLE {new_table} RENAME TO {old_table}')
        result_table = old_table

    return table_query(f'SELECT * FROM {result_table}')

# OBTAIN

## Data

### Data source and data description

Data is from FBI Crime Data Explorer
[NIBRS data for Colorado from 2009-2019](https://crime-data-explorer.fr.cloud.gov/pages/downloads)

The [data dictionary](data/NIBRS_DataDictionary.pdf) and a [record description](data/NIBRS_Record_Description.pdf) are available.


The description of the main and reference tables is in data/README.md file.
The agency implemented some changes to the files structure in 2016 and removed the sqlite create and load scripts from the zip directories.
Another fact worth mentioning is that files 'nibrs_property_desc.csv' from 2014 and 2015 have duplicated nibrs_property_desc_ids (unique identifier in the nibrs_property_desc table) which complicated the loading of the data.

**All 2016-2019 files need to be cleaned up because the FBI changed the file format. The DATA_YEAR column needs to be removed, and the legacy columns from the previous years need to be added back. It's a tedious job, but it only needs to be done once, and the files need to be backed up.**

In order to clean the tables up the following needs to be done<br>

   1. Remove all **DATA_YEAR** columns from each file, it's the first column<br>
   
   2. Files that do not need any changes beyond **DATA_YEAR** column removal<br>
    
> nibrs_arrestee_weapon.csv<br>
nibrs_bias_motivation.csv<br>
nibrs_criminal_act.csv<br>
nibrs_property_desc.csv<br>
nibrs_suspect_using.csv<br>
nibrs_suspected_drug.csv<br>
nibrs_victim_circumstances.csv<br>
nibrs_victim_injury.csv<br>
nibrs_victim_offender_rel.csv<br>
nibrs_victim_offense.csv<br>
nibrs_weapon.csv<br>

    
   3. in **nibrs_arrestee.csv** file:<br><br>
   a. between **ARRESTEE_SEQ_NUM** and **ARREST_DATE** there should be an **arrest_num column**<br>
   b. Between **CLEARANCE_IND** and **AGE_RANGE_LOW_NUM** should be a **ff_line_number** column. <br>

4.  in **nibrs_incident** file:<br><br>
    a.between **NIBRS_MONTH_ID** and **CARGO_THEFT_FLAG** column **incident_number**<br>
    b.between **DATA_HOME** and **ORIG_FORMAT** column **ddocname**<br>
    c.between **ORIG_FORMAT** and **DID** column	**ff_line_number**<br><br>

5. in **nibrs_month.csv** file:<br><br>
    a.between **REPORT_DATE** and **UPDATE_FLAG** add **prepared_date** column<br>
    b.between **ORIG_FORMAT** and **DATA_HOME** column **ff_line_number**<br>
    c.column **MONTH_PUB_STATUS** removed<br><br>

6. in **nibrs_offender.csv** file:<br><br>
     a.between **ETHNICITY_ID** and **AGE_RANGE_LOW_NUM** column **ff_line_number**<br><br>
     
7. in **nibrs_offense.csv** file:<br><br>
     a. the last column **ff_line_number** should be added<br><br>
   
8. in **nibrs_property.csv** file:<br><br>
     a. the last column **ff_line_number** should be added<br><br>

9. in **nibrs_victim.csv** file:<br><br>
     a. between **RESIDENT_STATUS_CODE** and **AGE_RANGE_LOW_NUM** two columns **agency_data_year** and **ff_line_number** (in that order) should be added
    


### Using an already created sqlite database

The notebook with the database creation is [here](creating_sqlite_db.ipynb). The referenced database is ***data/sqlite/db/production1.db***. It takes 2.5 minutes to run the database creation script.

In [None]:
# Uncomment the line below if you are re-running the code part for main tables>>>

#!cp data/sqlite/db/production1_backup.db data/sqlite/db/production1.db

!cp data/sqlite/db/production1.db data/sqlite/db/production1_backup.db

In [None]:
# Opening a connection to the working database and initiating a cursor.
# The helper functions defined above run their statements through the global `cur`.
conn = sqlite3.connect('data/sqlite/db/production1.db')
cur = conn.cursor()

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

In [None]:
q='SELECT * FROM nibrs_incident'
df=table_query(q)

In [None]:
df.head()

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

# SCRUB

## SQL/cleaning tables

### Main tables

In [None]:
# df at this point is the main incident table, I am displaying its info
df.info()

#### Dropping unneeded tables

In [None]:
# Dropping the tables irrelevant to modeling and the dashboard

table_list_to_drop=['nibrs_month','nibrs_justifiable_force','nibrs_arrest_type',
                    'nibrs_drug_measure_type','nibrs_injury','nibrs_suspect_using',
                    'nibrs_suspected_drug','nibrs_suspected_drug_type','nibrs_using_list','nibrs_arrestee',
                    'nibrs_arrestee_weapon','nibrs_activity_type','nibrs_assignment_type','nibrs_property',
                    'nibrs_property_desc','nibrs_prop_loss_type','nibrs_victim_injury','nibrs_prop_desc_type',
                    'nibrs_circumstances','nibrs_victim_circumstances','ref_state', 'nibrs_criminal_act',
                    'nibrs_criminal_act_type','nibrs_victim_offense']

for table in table_list_to_drop:
    cur.execute(f'DROP TABLE {table}')

# Listing the tables that are left in the database
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

#### Incidents table

In [None]:
#Listing columns in the incidents table

df.columns

In [None]:
# Creating a list of columns to leave in the incidents table

incdnt_clmns_to_lv=['agency_id','incident_id','incident_date','incident_hour']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_incident', 'incident_main', incdnt_clmns_to_lv)

#### Offense table

In [None]:
# Main offense table columns

q='SELECT * FROM nibrs_offense'
df=table_query(q)
df.head()

In [None]:
# Creating a list with columns to leave in the main offense table

offns_clmns_to_lv=['offense_id','incident_id','offense_type_id', 'location_id']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_offense', 'offense_main', offns_clmns_to_lv)

#### Offender table

In [None]:
# Main offender table columns

q='SELECT * FROM nibrs_offender'
df=table_query(q)
df.columns

In [None]:
# Creating a list with columns to leave in the main offender table

offndr_clmns_to_lv=['offender_id', 'incident_id','age_id', 'age_num','sex_code', 'race_id', 'ethnicity_id']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_offender', 'offender_main', offndr_clmns_to_lv)

In [None]:
# Using reference table values in the offender_main table. Replacing codes with values comprehensible to humans.
# I am doing it to simplify creating a dashboard later.

# Each call adds a new column to offender_main and fills it from the reference table, matching on the id column
df=add_update_clmn('offender_main','ref_race', 'race', 'race_desc', 'race_id')

df=add_update_clmn('offender_main','nibrs_age', 'age_group', 'age_name', 'age_id')

df=add_update_clmn('offender_main','nibrs_ethnicity', 'ethnicity', 'ethnicity_name', 'ethnicity_id')

# Spelling out the one-letter sex codes
df=update_value('offender_main', 'sex_code', "'F'", "'Female'")

df=update_value('offender_main', 'sex_code', "'M'", "'Male'")

# Displaying the result
q='SELECT * FROM offender_main'
df=table_query(q)
df.head()

In [None]:
df.columns

In [None]:
# Creating a list with columns to leave in the main offender table. I am dropping all obsolete old columns

ofndr_clmns_to_lv=['offender_id', 'incident_id', 'age_num', 'sex_code',
       'race', 'age_group', 'ethnicity']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need, drop the old table and rename the new one.

create_new_table('offender_main', 'offender_main_tmp', ofndr_clmns_to_lv, drop_rename=True)

#### Victim table

In [None]:
# Main victim table columns

q='SELECT * FROM nibrs_victim'
df=table_query(q)
df.columns

In [None]:
# Creating a list with columns to leave in the main victim table

vctm_clmns_to_lv=['victim_id', 'incident_id', 'victim_type_id',
                  'age_id','age_num', 'sex_code', 'race_id',
                  'ethnicity_id','resident_status_code']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_victim', 'victim_main', vctm_clmns_to_lv)

In [None]:
# Using reference table values in the victim_main table. Replacing codes with values comprehensible to humans.
# I am doing it to simplify creating a dashboard later

# Each call adds a new column to victim_main and fills it from the reference table, matching on the id column
df=add_update_clmn('victim_main','ref_race', 'race', 'race_desc', 'race_id')

df=add_update_clmn('victim_main','nibrs_age', 'age_group', 'age_name', 'age_id')

df=add_update_clmn('victim_main','nibrs_ethnicity', 'ethnicity', 'ethnicity_name', 'ethnicity_id')

df=add_update_clmn('victim_main','nibrs_victim_type', 'victim_type', 'victim_type_name', 'victim_type_id')

# Spelling out the one-letter sex and resident status codes
df=update_value('victim_main', 'sex_code', "'F'", "'Female'")

df=update_value('victim_main', 'sex_code', "'M'", "'Male'")

df=update_value('victim_main', 'resident_status_code', "'R'", "'Resident'")

df=update_value('victim_main', 'resident_status_code', "'N'", "'Non-resident'")

# Displaying the result
q='SELECT * FROM victim_main'
df=table_query(q)
df.head()

In [None]:
df.columns

In [None]:
# Creating a list with columns to leave in the main victim table. I am dropping all obsolete old columns.

vctm_clmns_to_lv=['victim_id', 'incident_id', 'age_num',
       'sex_code', 'resident_status_code', 'race',
       'age_group', 'ethnicity', 'victim_type']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need, drop the old table and rename the new one.

create_new_table('victim_main', 'victim_main_tmp', vctm_clmns_to_lv, drop_rename=True)

#### Weapon table

In [None]:
# Main weapon table columns

q='SELECT * FROM nibrs_weapon'
df=table_query(q)
df.columns

In [None]:
# Creating a list with columns to leave in the main weapon table

wpn_clmns_to_lv=['weapon_id', 'offense_id']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_weapon', 'weapon_main', wpn_clmns_to_lv)

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()    

In [None]:
q='SELECT * FROM weapon_main'
df=table_query(q)
df.count()

In [None]:
q='SELECT * FROM nibrs_weapon_type'
df=table_query(q)
df

In [None]:
# Temporarily (to be dropped later) adding a 'weapon_name' column to the weapon_main table,
# plus an empty 'weapon' column that is filled in by the next cell
add_update_clmn('weapon_main','nibrs_weapon_type', 'weapon_name', 'weapon_name', 'weapon_id')
cur.execute('ALTER TABLE weapon_main ADD COLUMN weapon')

# Making sure the columns are there
q='SELECT * FROM weapon_main'
df=table_query(q)
df.head()

In [None]:
# A snippet to fill in the weapon column of weapon_main based on the weapon_name values
# (taken from the nibrs_weapon_type table). The final weapon_main will have only 2 columns,
# offense_id and weapon, with 5 unique values: 'Unarmed', 'Unknown', 'Other weapon',
# 'Non-automatic firearm', 'Automatic firearm'.

# Anything with 'automatic' is mapped to 'Automatic firearm'
# 'Unknown' - to 'Unknown'
# 'Unarmed' or 'None' - to 'Unarmed'
# 'Firearm', 'Handgun', 'Rifle', 'Shotgun', 'Personal Weapons' or 'Other Firearm' - to 'Non-automatic firearm'
# the rest of the values are mapped to 'Other weapon'
# NOTE(review): 'Personal Weapons' is grouped with the firearms here - confirm that this is intended.

# I could've possibly done it by creating a dataframe, using a dictionary to update the values
# and kicking it back to the database.

# The statements run in this order; the last one picks up every row the previous ones left empty.
# The firearm name used to be spelled 'Firarm', which never matched, so plain firearms
# ended up as 'Other weapon'.
weapon_updates=[
    "UPDATE weapon_main SET weapon='Automatic firearm' WHERE weapon_name like ('%Automatic%')",
    "UPDATE weapon_main SET weapon=weapon_name WHERE weapon_name='Unknown'",
    "UPDATE weapon_main SET weapon='Unarmed' WHERE weapon_name in ('None','Unarmed')",
    "UPDATE weapon_main SET weapon='Non-automatic firearm' "
    "WHERE weapon_name in ('Firearm', 'Handgun','Rifle','Shotgun','Personal Weapons','Other Firearm')",
    "UPDATE weapon_main SET weapon='Other weapon' WHERE weapon is Null",
]

for statement in weapon_updates:
    cur.execute(statement)

# Creating a list with columns to leave in the main weapon table.
wpn_clmns_to_lv=['offense_id', 'weapon']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need, drop the old table and rename the new one.
df=create_new_table('weapon_main', 'weapon_main_tmp', wpn_clmns_to_lv, drop_rename=True)

In [None]:
q='SELECT * FROM weapon_main'
df=table_query(q)
df.groupby('weapon').nunique()

#### Dropping unneeded tables

In [None]:
# Dropping all the original incident, offense, offender, victim and weapon tables

table_list_to_drop=['nibrs_victim','nibrs_offense','nibrs_incident','nibrs_weapon','nibrs_offender']

for table in table_list_to_drop:
    cur.execute(f'DROP TABLE {table}')

# Listing the tables that are left in the database
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

In [None]:
# Dropping all obsolete reference tables
table_list_to_drop=['nibrs_age','nibrs_victim_type','nibrs_ethnicity','ref_race', 'nibrs_weapon_type']

for table in table_list_to_drop:
    cur.execute(f'DROP TABLE {table}')

# Listing the tables that are left in the database
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

**Uncomment the following 2 cells, run them and comment out again if you want to re-run the code above**.

In [None]:
# cur.close()
# conn.commit()
# conn.close()

In [None]:
# !cp data/sqlite/db/production1_backup.db data/sqlite/db/production1.db

# !rm data/sqlite/db/production1_backup.db

> At this point victim_main, offender_main and weapon_main tables are ready. I am creating an intermediate database to avoid the need to recreate the main one if I make a mistake.

### Agencies

In [None]:
# stmnt="DROP TABLE table_name"
# cur.execute(stmnt)

**The cell below is to close a production1 db/cursor (commit too) and to use production1 db as a spring board moving forward. Uncomment the cell, run it to copy production1 to production2 plus production2 backup and comment it out again**

In [None]:
# cur.close()
# #conn.commit()
# conn.close()

# !cp data/sqlite/db/production1.db data/sqlite/db/production2.db
# !cp data/sqlite/db/production2.db data/sqlite/db/production2_backup.db

In [None]:
# # Initiating a cursor
# conn = sqlite3.connect('data/sqlite/db/production2.db')
# cur = conn.cursor()

In [None]:
# Checking if production1 copied correctly into production2
# NOTE(review): the cell above that connects to production2 is commented out, so unless it is
# uncommented and run first, this query still goes to the connection that is currently open
q='SELECT * FROM weapon_main'
df=table_query(q)
df.groupby('weapon').nunique()

#### agencies table

> preparing agencies table before comparing it to cde_agencies table

In [None]:
q='SELECT * from agencies'
df=table_query(q)
df.columns

In [None]:
df.head()

In [None]:
# Dropping all unused columns
agncs_to_lv_agnctbl=['agency_id', 'data_year',
       'pub_agency_name',
        'county_name']

df=create_new_table('agencies', 'agencies_tmp', agncs_to_lv_agnctbl, drop_rename=True)

In [None]:
q='SELECT * from agencies'
df=table_query(q)
df.head()

In [None]:
df['agency_id'].nunique()

#### cde_agencies table

> Preparing the cde_agencies table before comparing it to the agencies table

In [None]:
q='SELECT * from cde_agencies'
df=table_query(q)
df.head()

In [None]:
df.columns

In [None]:
# Dropping all the columns that seem to be irrelevant. Longitude and latitude coordinates are not useful because
# they point either to the center of a zip code or to the center of a county.

agncs_to_lv_cdeagnctbl=['agency_id', 'agency_name', 'short_name',
       'primary_county_id',
       'primary_county',        
       'current_year',
       'icpsr_zip']

df=create_new_table('cde_agencies', 'cde_agencies_tmp', agncs_to_lv_cdeagnctbl,  drop_rename=True)

In [None]:
q='SELECT * from cde_agencies'
df=table_query(q)
df.head()

> Comparing cde_agencies and agencies tables to use one of them moving forward

In [None]:
df['agency_id'].nunique()

In [None]:
q="SELECT distinct(agency_id) FROM agencies where agency_ID not in (SELECT agency_id FROM cde_agencies)"
df=table_query(q)
df

In [None]:
# Looking at the records of agency 29074 in the agencies table.
# The dataframe is built straight from the cursor, so its columns are not named.
q="SELECT * FROM agencies where agency_ID=29074"
df = pd.DataFrame(cur.execute(q))
df

In [None]:
# Listing the agencies that have incidents in incident_main but are missing from cde_agencies.
# The dataframe is built straight from the cursor, so its columns are not named.
stmnt="SELECT distinct(agency_id) FROM incident_main where agency_id not in (SELECT agency_id FROM cde_agencies)"
df = pd.DataFrame(cur.execute(stmnt))
df

In [None]:
clmns_to_lv_cdeagnctbl=['agency_id',
                        'primary_county',
                        'icpsr_zip']

df=create_new_table('cde_agencies', 'cde_agencies_tmp', clmns_to_lv_cdeagnctbl,  drop_rename=True)

#### Conclusion

>There are more counties (and their names are spelled out rather than merged together) in **cde_agencies**. Also there are zip codes in **cde_agencies**. There are 223 zip codes out of 511 active zip codes in Colorado. 
* There are 14 agencies that have records in incident_main table but are missing from agencies table while they are present in **cde_agencies**.
* There is one agency (agency_id=29074), it is a Division of Gaming Criminal Enforcement in Jefferson county, that is in **agencies** table but is not in **cde_agencies**. However, this agency has no incident records.


<span style="font-size:1.5em;">The final conclusion is that only the **cde_agencies** table will be used moving forward.</span><br>

### Other tables

There are cleaned-up tables:<br>
* cde_agencies
* incident_main
* offense_main
* victim_main
* offender_main
* weapon_main<br>

There are tables that need to be cleaned and joined with the main tables:
* nibrs_bias_list
* nibrs_location_type
* nibrs_offense_type
* nibrs_cleared_except
* nibrs_relationship
* nibrs_bias_motivation
* nibrs_victim_offender_rel<br>

There are several tables that need to be deleted:
* agencies
* agency_participation
* nibrs_criminal_act
* nibrs_criminal_act_type
* nibrs_victim_offense
> Agencies and agency_participation are being dropped as explained above. 


In [None]:
# Deleting the agencies and agency_participation tables

table_list_to_drop=['agencies','agency_participation']

for table in table_list_to_drop:
    cur.execute(f'DROP TABLE {table}')

# Listing the tables that are left in the database
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

#### Bias table

> Adding bias type info to the main bias table

In [None]:
q="SELECT * FROM nibrs_bias_list"
df = table_query(q)
df

In [None]:
# Creating the bias_main table from nibrs_bias_motivation and adding a 'bias_name' column to it.
# The 'bias_id' column is only needed to look the names up.

bias_clmns_to_lv=['bias_id', 'offense_id']

# Due to the fact that sqlite has a limitation of not being able to drop columns,
# I need to create a new table with only the columns I need.

create_new_table('nibrs_bias_motivation', 'bias_main', bias_clmns_to_lv)

# Filling bias_name from the nibrs_bias_list reference table, matching on bias_id
add_update_clmn('bias_main','nibrs_bias_list', 'bias_name', 'bias_name', 'bias_id')

In [None]:
# Making sure the columns are there
q='SELECT * FROM bias_main'
df=table_query(q)
df.bias_name.unique()

In [None]:
bias_to_lv_biasmot=['offense_id',
       'bias_name']

df=create_new_table('bias_main', 'bias_main_tmp', bias_to_lv_biasmot,  drop_rename=True)

In [None]:
q='SELECT * FROM bias_main'
df=table_query(q)
df.groupby('bias_name').nunique()

#### Location in the offense table

> Leaving all location types in. However, I might reconsider later to change to Home/Residence, Other and Unknown only

In [None]:
# Adding a new column to offense table with location_names

add_update_clmn('offense_main','nibrs_location_type', 'location_name', 'location_name', 'location_id')

q='SELECT * FROM offense_main'
df=table_query(q)
df.location_name.unique()

In [None]:
df.groupby('location_name').nunique()

In [None]:
df.nunique()

#### Offense type in the offense table

> Adding offense type info to the main offense table

In [None]:
q='SELECT * from nibrs_offense_type'
df=table_query(q)
df

In [None]:
# Adding a new column to offense table with offence_type name

add_update_clmn('offense_main','nibrs_offense_type', 'offense_name', 'offense_name', 'offense_type_id')

add_update_clmn('offense_main','nibrs_offense_type', 'crime_against', 'crime_against', 'offense_type_id')

add_update_clmn('offense_main','nibrs_offense_type', 'offence_category_name', 'offense_category_name', 'offense_type_id')

In [None]:
# Dropping all unused columns
offns_to_lv_offnstbl=['offense_id', 'incident_id','location_name','offense_name','crime_against','offence_category_name']

df=create_new_table('offense_main', 'offense_main_tmp', offns_to_lv_offnstbl, drop_rename=True)

In [None]:
q='SELECT * from offense_main'
df=table_query(q)
df.head()

#### Victim-offender relationship

> Adding victim-offender relationship info to the main victim table

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()   

In [None]:
q='SELECT * from nibrs_relationship'
df=table_query(q)
df.head()

In [None]:
q='SELECT * from nibrs_victim_offender_rel'
df=table_query(q)
df.head()

In [None]:
add_update_clmn('nibrs_victim_offender_rel','nibrs_relationship', 'relationship_name', 'relationship_name',
                'relationship_id')

In [None]:
# Dropping all unused columns
clmns_to_lv_rlshnshptbl=['victim_id', 'offender_id','relationship_name']

df=create_new_table('nibrs_victim_offender_rel', 'nibrs_victim_offender_rel_tmp', clmns_to_lv_rlshnshptbl, drop_rename=True)

In [None]:
q='SELECT * from nibrs_victim_offender_rel'
df=table_query(q)
df.head()

In [None]:
stmnt='ALTER TABLE nibrs_victim_offender_rel RENAME to victim_offender_rel'
cur.execute(stmnt)

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

#### Dropping all reference tables

In [None]:
# Dropping the reference tables that are no longer needed
table_list_to_drop=['nibrs_bias_list',
                    'nibrs_location_type',
                    'nibrs_offense_type',
                    'nibrs_cleared_except',
                    'nibrs_relationship',
                    'nibrs_bias_motivation']

for table in table_list_to_drop:
    cur.execute(f'DROP TABLE {table}')

# Listing the tables that are left in the database
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

### Combining all tables into one based on offense table

#### Incident table

> Adding agencies info to the main incident table and dropping the cde_agencies table. Replacing '' in the incident table hour column with '0'.

In [None]:
q='SELECT * from incident_main'
df=table_query(q)
df.info()

In [None]:
q='SELECT * from cde_agencies'
df=table_query(q)
df

In [None]:
remove_dups('cde_agencies', 'cde_agencies_nodups', conn, drop_rename=True)

In [None]:
add_update_clmn('incident_main','cde_agencies', 'primary_county', 'primary_county', 'agency_id')

add_update_clmn('incident_main','cde_agencies', 'icpsr_zip', 'icpsr_zip',  'agency_id')

In [None]:
q='SELECT * from incident_main'
df=table_query(q)
df

In [None]:
df.incident_hour.isna().sum()

In [None]:
# Replacing empty-string values in the incident_hour column with 0
update_value('incident_main', 'incident_hour', "''", '0')

In [None]:
stmnt="DROP TABLE cde_agencies"
cur.execute(stmnt)

#### Creating dataframes and saving them to pickle files to finalize working with sqlite tables 

In [None]:
cur.execute("""SELECT name FROM sqlite_master WHERE type='table'""").fetchall()

In [None]:
q='SELECT * from incident_main'
df_incident=table_query(q)
with open('data/pickled_dataframes/incident.pickle', 'wb') as f:
    pickle.dump(df_incident, f)

In [None]:
with open('data/pickled_dataframes/incident.pickle', 'rb') as f:
    df_incident=pickle.load(f)
df_incident.head()

In [None]:
len(df_incident)

In [None]:
q='SELECT * from offense_main'
df_offense=table_query(q)
with open('data/pickled_dataframes/offense.pickle', 'wb') as f:
    pickle.dump(df_offense, f)

In [None]:
# Reading the pickled offense dataframe back to check that it was saved correctly.
# The result has to go into df_offense (not df_offence), otherwise the frame displayed
# below is the one still in memory and not the one loaded from the file.
with open('data/pickled_dataframes/offense.pickle', 'rb') as f:
    df_offense=pickle.load(f)
df_offense.head()

In [None]:
len(df_offense)

In [None]:
q='SELECT * from offender_main'
df_offender=table_query(q)
with open('data/pickled_dataframes/offender.pickle', 'wb') as f:
    pickle.dump(df_offender, f)

In [None]:
with open('data/pickled_dataframes/offender.pickle', 'rb') as f:
    df_offender=pickle.load(f)
df_offender.head()

In [None]:
len(df_offender)

In [None]:
q='SELECT * from victim_main'
df_victim=table_query(q)
with open('data/pickled_dataframes/victim.pickle', 'wb') as f:
    pickle.dump(df_victim, f)

In [None]:
with open('data/pickled_dataframes/victim.pickle', 'rb') as f:
    df_victim=pickle.load(f)
df_victim.head()

In [None]:
len(df_victim)

In [None]:
q='SELECT * from weapon_main'
df_weapon=table_query(q)
with open('data/pickled_dataframes/weapon.pickle', 'wb') as f:
    pickle.dump(df_weapon, f)

In [None]:
with open('data/pickled_dataframes/weapon.pickle', 'rb') as f:
    df_weapon=pickle.load(f)
df_weapon.head()

In [None]:
len(df_weapon)

In [None]:
q='SELECT * from bias_main'
df_bias=table_query(q)
with open('data/pickled_dataframes/bias.pickle', 'wb') as f:
    pickle.dump(df_bias, f)

In [None]:
with open('data/pickled_dataframes/bias.pickle', 'rb') as f:
    df_bias=pickle.load(f)
df_bias.head()

In [None]:
len(df_bias)

In [None]:
q='SELECT * from victim_offender_rel'
df_rel=table_query(q)
with open('data/pickled_dataframes/relationship.pickle', 'wb') as f:
    pickle.dump(df_rel, f)

In [None]:
with open('data/pickled_dataframes/relationship.pickle', 'rb') as f:
    df_rel=pickle.load(f)
df_rel.head()

In [None]:
len(df_rel)

In [None]:
# Closing the cursor, saving the changes made to the database and closing the connection
cur.close()
conn.commit()
conn.close()

>The next step is working with the dataframes in [scrub, part 2 notebook](capstone_prj_scrub_part2.ipynb) 

# EXPLORE

# MODEL

# iNTERPRET

# CONCLUSIONS & RECOMMENDATIONS

> Summarize your conclusions and bullet-point your list of recommendations, which are based on your modeling results.

# TO DO/FUTURE WORK

- 