In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import os
import glob
import geopandas as gp
import matplotlib.pyplot as plt

from PreProcessing import PreProcessingCrimeData
from PreProcessing import PreProcessingShapefiles

# Raise the pandas display limits (500 rows, 500 columns, 1000 characters wide)
# so that wide/long frames are not truncated in cell outputs.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Print NumPy arrays in full instead of abbreviating them.
np.set_printoptions(threshold=sys.maxsize)

# 1. Crime and victims data

In [None]:
# One pre-processor per dataset kind: 'crime' reports and 'victims' reports.
ppc = PreProcessingCrimeData('crime')
ppv = PreProcessingCrimeData('victims')
# NOTE(review): only the crime pre-processor's directories are queried here;
# confirm whether ppv.getDirectories() is also required before using `ppv`.
ppc.getDirectories()

In [None]:
# Crime types passed to the pre-processing calls.
# NOTE(review): some labels carry accents ('violación') while others do not
# ('via publica') — verify they match the labels used in the raw data.
crime_list = [
    'homicidio doloso',
    'lesiones dolosas',
    'violación simple',
    'violación equiparada',
    'abuso sexual',
    'acoso sexual',
    'robo a transeunte en via publica',
    'robo a pasajero a bordo de transporte publico colectivo',
    'robo de vehiculo',
]

# Columns requested from the crime dataset.
crime_features_list = [
    'date_events',
    'crimeType',
    'crimeTypeViolence',
    'neighborhood_events',
    'longitude',
    'latitude',
]

# Columns requested from the victims dataset (adds 'sex' and 'age').
victims_features_list = [
    'date_events',
    'sex',
    'age',
    'crimeType',
    'crimeTypeViolence',
    'neighborhood_events',
    'longitude',
    'latitude',
]

In [None]:
# Pre-process the crime records, passing the crime types and feature columns defined above.
# presumably 2019 and 2021 are the first and last year to process — TODO confirm.
# NOTE: `pp_crime_data` is re-assigned from ppc.loadProcessedDatasets() in section 3.
pp_crime_data = ppc.preProcessCrime(2019, 2021, crime_list, crime_features_list)

In [None]:
# Pre-process the victims records, passing the crime types and feature columns defined above.
# presumably 2019 and 2021 are the first and last year to process — TODO confirm.
# NOTE: `pp_victims_data` is re-assigned from ppv.loadProcessedDatasets() in section 3.
pp_victims_data = ppv.preProcessVictims(2019, 2021, crime_list, victims_features_list)

# 2. Shapefiles (environmental data)

See `ShapefilePreProcessing.ipynb` for the pre-processing of the shapefiles (environmental data).

# 3. Merge pre-processed crime, victims and environmental data

In [None]:
# Load the pre-processed crime data through the crime pre-processor and preview it.
# NOTE: this replaces the `pp_crime_data` returned by ppc.preProcessCrime() above.
pp_crime_data = ppc.loadProcessedDatasets()
pp_crime_data.head(3)

In [None]:
# Load the pre-processed victims data through the victims pre-processor and preview it.
# NOTE: this replaces the `pp_victims_data` returned by ppv.preProcessVictims() above.
pp_victims_data = ppv.loadProcessedDatasets()
pp_victims_data.head(3)

In [None]:
# Load the pre-processed shapefile data (produced by ShapefilePreProcessing.ipynb,
# per the note in section 2) and preview it.
pps = PreProcessingShapefiles()
pp_shapefiles = pps.loadProcessedData()
pp_shapefiles.head(3)

In [None]:
# Run both tables through pps.setGeometryProjection().
# presumably this builds point geometries from longitude/latitude and sets the
# CRS to match `pp_shapefiles` — TODO confirm in PreProcessingShapefiles.
gdf_crime_data = pps.setGeometryProjection(pp_crime_data)
gdf_victims_data = pps.setGeometryProjection(pp_victims_data)

In [None]:
# Preview the crime data after setGeometryProjection().
gdf_crime_data.head(3)

In [None]:
# Preview the victims data after setGeometryProjection().
gdf_victims_data.head(3)

In [None]:
# Visual sanity check: draw the shapefile layer and overlay the crime records in yellow.
ax = pp_shapefiles.plot(figsize=(5,5))
gdf_crime_data.plot(ax=ax, color = 'yellow')
plt.show()

In [None]:
# Visual sanity check: draw the shapefile layer and overlay the victims records in yellow.
ax = pp_shapefiles.plot(figsize=(5,5))
gdf_victims_data.plot(ax=ax, color = 'yellow')
plt.show()

In [None]:
# Join the attributes of `pp_shapefiles` onto the crime and the victims records;
# one output is returned per input, in the same order.
# presumably this is a spatial (point-in-polygon) join — TODO confirm in
# PreProcessingShapefiles.joinPointsAttributes.
merged_crime_data, merged_victims_data = pps.joinPointsAttributes([gdf_crime_data, gdf_victims_data], pp_shapefiles)

In [None]:
# Preview the crime records after the attribute join.
merged_crime_data.head(3)

In [None]:
# List the columns available after the attribute join.
merged_crime_data.columns

In [None]:
# Preview the victims records after the attribute join.
merged_victims_data.head(3)

In [None]:
# Missing-value counts for the last 10 columns of the joined crime data.
merged_crime_data.isna().sum()[-10:]

In [None]:
# Missing-value counts for the last 12 columns of the joined victims data.
merged_victims_data.isna().sum()[-12:]

In [None]:
def aggregateAge(data):
    """Add an ``age_categ`` column that bins each victim's age into a labelled bracket.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``age`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which missing ages are replaced by ``-99`` and a
        new ``age_categ`` column holds one of 'less than 18', '18 to 29',
        '30 to 39', '40 to 49', '50 to 59', 'more than 60' or 'not registered'.
        Ages outside ``[0, 120]`` (including the ``-99`` placeholder) are
        labelled 'not registered'.
    """
    # Half-open [lower, upper) brackets; the last bracket is closed at 120 and
    # is therefore handled separately below.
    brackets = [
        (0, 18, 'less than 18'),
        (18, 30, '18 to 29'),
        (30, 40, '30 to 39'),
        (40, 50, '40 to 49'),
        (50, 60, '50 to 59'),
    ]

    df = data.copy()
    # Assign the result back instead of `df.age.fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary Series and leaves `df`
    # untouched when pandas Copy-on-Write is active.
    df["age"] = df["age"].fillna(-99)

    age = df["age"]
    conditions = [((age >= low) & (age < high)).to_numpy(dtype=bool) for low, high, _ in brackets]
    labels = [label for _, _, label in brackets]
    conditions.append(((age >= 60) & (age <= 120)).to_numpy(dtype=bool))
    labels.append('more than 60')

    # Vectorised replacement for the row-wise apply; anything that matches no
    # bracket falls through to the default label.
    df["age_categ"] = np.select(conditions, labels, default='not registered')
    return df
    

In [None]:
# Add the `age_categ` age-bracket column to the joined victims data and preview it.
merged_victims_data = aggregateAge(merged_victims_data)
merged_victims_data.head(3)

## 3.1 Aggregate victims data

In [None]:
# Aggregate per year, month and neighborhood: count victims by sex or by age category

def aggregateVictimData(data, feature):
    """Count victims per (year, month, neighborhood), split by sex or by age bracket.

    Parameters
    ----------
    data : pandas.DataFrame
        Victim records with ``year_event``, ``month_event``, ``key_neighb`` and
        ``date_events`` columns, plus ``sex`` (for ``feature='sex'``) or
        ``age_categ`` (for ``feature='age'``).
    feature : {'sex', 'age'}
        Attribute over which the victims are counted.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, neighborhood) group with the count columns
        followed by ``unique_id`` (year + month + ``key_neighb`` as text).
        For 'sex' the count columns are ``women_vic`` and ``men_vic``; for
        'age' they are the six age brackets. The "unregistered" /
        "not registered" column is dropped.

    Raises
    ------
    ValueError
        If ``feature`` is neither 'sex' nor 'age', or if the ``sex`` column
        does not contain exactly three categories.
    """
    group_keys = ['year_event', 'month_event', 'key_neighb']
    sex_labels = ['women_vic', 'men_vic', 'unregistered']
    age_labels = ['less than 18', '18 to 29', '30 to 39', '40 to 49', '50 to 59',
                  'more than 60', 'not registered']

    if feature == 'sex':
        category_col, discarded = 'sex', 'unregistered'
    elif feature == 'age':
        category_col, discarded = 'age_categ', 'not registered'
    else:
        # Previously this branch printed a hint and then failed with an
        # UnboundLocalError on `return df3`.
        raise ValueError("Choose the feature over which to aggregate: 'sex' or 'age'.")

    # Number of non-null `date_events` per group and category; combinations
    # that do not occur are filled with 0.
    counts = (
        data.groupby(group_keys + [category_col])['date_events']
        .count()
        .unstack(category_col, fill_value=0)
    )

    if feature == 'sex':
        # The raw sex labels are not known here, so the columns are renamed by
        # position; the categories come out sorted alphabetically.
        # NOTE(review): assumes the sorted order is women, men, unregistered — confirm.
        if counts.shape[1] != len(sex_labels):
            raise ValueError(
                "Expected exactly %d categories in 'sex', found: %s"
                % (len(sex_labels), list(counts.columns))
            )
        counts.columns = sex_labels
    else:
        # Select the brackets BY NAME. The categories come out sorted
        # alphabetically ('18 to 29' ... before 'less than 18'), so a
        # positional rename would attach every count to the wrong bracket.
        # Brackets absent from the data become all-zero columns.
        counts = counts.reindex(columns=age_labels, fill_value=0)

    result = counts.rename_axis(columns=None).reset_index()

    # Cast to int first so that e.g. 2019.0 becomes "2019" rather than "2019.0".
    # NOTE(review): the parts are concatenated without separator or zero
    # padding, matching the id built in countCrimePointsInPolygon; ids could
    # collide if `key_neighb` can start with a digit.
    year = result['year_event'].astype('int32').astype('string')
    month = result['month_event'].astype('int32').astype('string')
    result['unique_id'] = year + month + result['key_neighb']

    return result.drop(columns=group_keys + [discarded])

In [None]:
# Victim counts by sex per (year, month, neighborhood); print the number of rows, then preview.
sex_features = aggregateVictimData(merged_victims_data, 'sex')
print(sex_features.shape[0], '\n')
sex_features.head(3)

In [None]:
# Victim counts by age bracket per (year, month, neighborhood); print the number of rows, then preview.
age_features = aggregateVictimData(merged_victims_data, 'age')
print(age_features.shape[0], '\n')
age_features.head(3)

In [None]:
# Combine the sex and age counts into one table keyed by `unique_id`
# (left join: every row of `sex_features` is kept).
merged_features = sex_features.merge(age_features, how="left", on="unique_id")
merged_features.head()

In [None]:
def divideDataPerMonthYear(data):
    """Split ``data`` into one frame per (year, month) combination.

    Years are visited in order of first appearance in ``year_event``; within a
    year, months are visited in order of first appearance in ``month_event``.
    Prints a progress line per year and the total number of frames produced.

    Returns
    -------
    list
        The row subsets of ``data``, one per (year, month) visited.
    """
    monthly_frames = []
    for yr in data.year_event.unique():
        print("For the year " + str(yr) + "...")
        rows_of_year = data[data.year_event == yr]
        monthly_frames.extend(
            rows_of_year[rows_of_year.month_event == mo]
            for mo in rows_of_year.month_event.unique()
        )
    print(len(monthly_frames))
    return monthly_frames

In [None]:
# Split the crime records into one subset per (year, month).
# NOTE: despite its name, `list_shapefiles` holds subsets of `gdf_crime_data`,
# not shapefiles.
list_shapefiles = divideDataPerMonthYear(gdf_crime_data)

In [None]:
# Preview the first (year, month) subset.
list_shapefiles[0].head(3)

In [None]:
# Preview the full crime data again for comparison with the subset above.
gdf_crime_data.head(3)

In [None]:
# Missing-value counts per column of the crime data.
gdf_crime_data.isna().sum()

# 4. Count crime and victim report points per neighborhood

In [None]:
def countCrimePointsInPolygon(crime_shapefile, polygon_shapefile):
    """Count, for one (year, month) subset, the crime points contained in each polygon.

    Parameters
    ----------
    crime_shapefile : geopandas.GeoDataFrame
        Crime points with ``month_event`` and ``year_event`` columns. The year
        and month are read from the first distinct value of each column, so
        the frame is expected to cover a single (year, month).
    polygon_shapefile : geopandas.GeoDataFrame
        Polygons with a ``key_neighb`` column. It is not modified.

    Returns
    -------
    geopandas.GeoDataFrame
        A copy of ``polygon_shapefile`` with ``crime_count`` (int32, 0 where a
        polygon contains no point), ``month_event``, ``year_event`` and
        ``unique_id`` (year + month + ``key_neighb`` as text) added. Any other
        missing value in the merged frame is also replaced by 0.

    Raises
    ------
    ValueError
        If ``crime_shapefile`` has no rows, so no year/month can be read from it.
    """
    if len(crime_shapefile) == 0:
        raise ValueError("crime_shapefile is empty: cannot determine its year and month.")

    gdf = polygon_shapefile.copy()

    # Number of crime points contained in each polygon, keyed by neighborhood.
    per_polygon = (
        gdf.sjoin(crime_shapefile, predicate='contains')
        .groupby('key_neighb')
        .size()
        .rename('crime_count')
        .reset_index()
    )

    # Left merge keeps the polygons without any point; they get a count of 0.
    gdf_counts = gdf.merge(per_polygon, how='left', on='key_neighb').fillna(0)
    gdf_counts['crime_count'] = gdf_counts['crime_count'].astype('int32')

    # Normalise through int so a float month such as 1.0 becomes "1", not "1.0".
    # aggregateVictimData builds its `unique_id` the same way, and the two ids
    # must match for the later merge on `unique_id` to find any rows.
    month = str(int(crime_shapefile.month_event.unique()[0]))
    year = str(int(crime_shapefile.year_event.unique()[0]))
    gdf_counts['month_event'] = month
    gdf_counts['year_event'] = year

    gdf_final = gdf_counts.assign(unique_id=year + month + gdf_counts["key_neighb"])
    return gdf_final

def mergeAllCrimeCounts(list_crime_shapefiles, polygon_shapefile):
    """Run countCrimePointsInPolygon on every subset and stack the results.

    Parameters
    ----------
    list_crime_shapefiles : list
        Crime subsets, each passed to countCrimePointsInPolygon in turn.
    polygon_shapefile
        Polygon layer passed unchanged to every call.

    Returns
    -------
    The row-wise concatenation (``pd.concat``) of the per-subset results, in
    input order; the original row indices are kept.
    """
    per_period_counts = [
        countCrimePointsInPolygon(period_points, polygon_shapefile)
        for period_points in list_crime_shapefiles
    ]
    return pd.concat(per_period_counts)
    
def mergeAllFeatures(data, df_merged_features):
    """Attach the victim counts to the per-neighborhood crime counts.

    Parameters
    ----------
    data : DataFrame / GeoDataFrame
        Crime counts with a ``unique_id`` column and the attribute columns
        listed in ``attribute_cols`` below.
    df_merged_features : pandas.DataFrame
        Victim counts by sex and age bracket, keyed by ``unique_id``.

    Returns
    -------
    The left merge of ``data`` with ``df_merged_features`` on ``unique_id``,
    restricted to a fixed column order, with every missing value replaced by 0
    and the victim count columns cast to int32.
    """
    victim_cols = ['women_vic', 'men_vic', 'less than 18', '18 to 29', '30 to 39', '40 to 49',
                   '50 to 59', 'more than 60']
    attribute_cols = ['unique_id', 'month_event', 'year_event', 'key_neighb', 'name_neigh', 'key_boroug',
                      'name_borou', 'cablebus_s', 'commer_ven', 'health_cen', 'metro_sta', 'pmarkets',
                      'pparking', 'hospitals', 'train_sta', 'trolebus_s', 'cablebus_l', 'ptransp_ro',
                      'main_roads', 'metro_line', 'rtp_lines', 'train_line', 'trolebus_l', 'centres_va',
                      'be_schools', 'commercial', 'industrial', 'service_un', 'crime_count']

    merged = data.merge(df_merged_features, how="left", on="unique_id")

    # Rows without any victim record come out of the left merge as NaN in the
    # victim columns. Use the non-mutating fillna/astype so the column-sliced
    # frame is never modified in place (avoids SettingWithCopyWarning).
    gdfs = merged[attribute_cols + victim_cols + ['geometry']].fillna(0)
    return gdfs.astype({col: 'int32' for col in victim_cols})
    

In [None]:
# Count the crime points per polygon for every (year, month) subset and stack the results.
pre_final_gdf = mergeAllCrimeCounts(list_shapefiles, pp_shapefiles)
pre_final_gdf.head()

In [None]:
# Number of rows in the stacked table.
pre_final_gdf.shape[0]

In [None]:
# Attach the victim sex/age counts to the crime counts and keep the final set of columns.
final_preproc_gdf = mergeAllFeatures(pre_final_gdf, merged_features)
final_preproc_gdf.head()

In [None]:
# Check for remaining missing values per column before exporting.
final_preproc_gdf.isna().sum()

In [None]:
# Persist the final dataset as an ESRI Shapefile and as a CSV under ../data/preprocessed
# (resolved relative to the current working directory).
output_dir = os.path.normpath(os.path.join(os.getcwd(), '../data/preprocessed'))
shapefile_dir = os.path.join(output_dir, 'shapefiles')

# Create the target folders up front so the export does not fail on a fresh checkout.
os.makedirs(shapefile_dir, exist_ok=True)

# NOTE(review): the ESRI Shapefile format limits field names to 10 characters, so longer
# column names (e.g. 'crime_count', 'less than 18') are expected to be truncated in the
# .shp output — confirm that downstream readers account for this.
final_preproc_gdf.to_file(os.path.join(shapefile_dir, 'preprocessed_final_data.shp'), driver='ESRI Shapefile')
final_preproc_gdf.to_csv(os.path.join(output_dir, 'preprocessed_final_data.csv'))
