In [None]:
import pandas as pd
from pandas.tseries.offsets import Hour, Minute, Second # ...
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import itertools as it
import pytz
import numpy as np
import folium
import matplotlib.pyplot as plt
import seaborn as sns

from folium.plugins import HeatMap, HeatMapWithTime, MarkerCluster

from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding

from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering #kernel
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import FeatureAgglomeration
from sklearn.cluster import AffinityPropagation

from yellowbrick.cluster import KElbowVisualizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

%matplotlib inline

<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>San Francisco Crime</b></h1>

- Please give me an UPVOTE if you can. Your UPVOTE will be a great encouragement to me!

<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">Overview</p>

<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">Data Fields</p>

- Data fields
- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>First EDA</b></h1>

* Get a bird's eye view of the data, checking the internal status of the csv (columns,null,mean,std)
* Here's what you can easily see
    1. there are timestamps and they are also categorized.
    2. the day of the week is categorized against the timestamp. 
    3. The timestamp is the date of the crime, so the location is categorized. 
    4. For locations, we can assume that X and Y are the longitude and latitude, respectively. (The dataset description states this explicitly, but it is still worth guessing what each column means, because you may discover new features in the process of guessing.)
    5. If X and Y are not easy to understand as Columns, rename them to Longitude and Latitude. (Lon and Lat are also acceptable).
        - Depending on the library, the position to pass the latitude and longitude may be reversed, so check the library.
* Visualization that may be possible
    1. normal visualization (Visualization) plot
    2. visualization of a map, which can be used to visualize in which districts crime occurs most often
    3. different data from SanFrancisco (stores, residential blocks, residential blocks house prices, residential or non-residential by race) can be used to get different views on why crime is high in a given area. 
    4. crime rate and GDP, higher education rate, number of single households in the past, time-stamped increase/decrease of crime rate in industrial development, growth of price index in San Francisco as a whole, time-stamped urban development, increase/decrease of residents in San Francisco, etc. This will provide a variety of insights. This could also serve as a base cohort for other states that may themselves be relevant, as it would elucidate the underlying phenomenon that explains why crime increases and decreases (in which case, I would guess that industrial development should be excluded, as it varies by state).
* Possible transformations
    1. hierarchical categorization
    2. create cluster transformation, hierarchical clusters
    3. If distribution does not follow a normal distribution, use Box-Cox transformation to convert to normal distribution. 
    4. Gaussian mixture clustering analysis using (transformed or untransformed) normal distribution.
    5. visualize clustering by dimensionality reduction while maintaining variance using PCA.
* Visualization is used only to gain knowledge; that knowledge then guides methodological decisions, such as improving the accuracy of feature engineering.

In [None]:
# Load the competition training and test sets (pandas reads the zipped CSVs directly).
train = pd.read_csv('../input/sf-crime/train.csv.zip')
test = pd.read_csv('../input/sf-crime/test.csv.zip')

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:200%;text-align:center;border-radius:10px 10px;">Trainset</p>

In [None]:
# First three training rows; numeric cells are shaded by value.
train.head(3).style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Summary statistics of the numeric training columns, shaded by value.
train.describe().style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:200%;text-align:center;border-radius:10px 10px;">Testset</p>

In [None]:
# First three test rows; numeric cells are shaded by value.
test.head(3).style.background_gradient(cmap='rocket', text_color_threshold=0.02)

In [None]:
# Summary statistics of the numeric test columns, shaded by value.
test.describe().style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test.info()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

-----------
-----------

<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:150%;text-align:center;border-radius:10px 10px;">unique</p>

In [None]:
# Show the first training rows again as a reference for the per-column checks below.
train.head(3).style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Distinct police districts present in the training data.
train['PdDistrict'].unique()

In [None]:
# Distinct resolution outcomes present in the training data.
train['Resolution'].unique()

In [None]:
# Distinct crime categories; Category is the prediction target.
train['Category'].unique() # target

In [None]:
# Distinct day-of-week labels, as spelled in the data.
train['DayOfWeek'].unique()

In [None]:
# Compute the distinct addresses once and reuse the result for both the
# preview and the count (the original evaluated .unique() twice).
unique_addresses = train['Address'].unique()
unique_addresses, len(unique_addresses)

In [None]:
# Number of distinct free-text incident descriptions.
len(train['Descript'].unique())

In [None]:
# Inspect one full record (the second row) field by field.
train.iloc[1]

--------
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;"><b>Visualization</b></h1>

* 1. Visualization of Category against DayOfWeek: which day of the week has the highest crime count.

* 2. Histogram of Category

* 3. Histogram of PdDistrict, i.e. which station has the highest number of incidents handled.

* 4. increase/decrease in crime rate against time stamp, visibility of seasonality.

* 5. yearly rate of increase/decrease in crime for the timestamp and knowledge of socio-dynamic economic dynamics such as human flow, GDP, etc. showing the relationship between the two.

In [None]:
def visualization_crime(value, tight=False, data=None):
    '''Plot the distribution of one column as a histogram with a KDE overlay.

    Parameters
    ----------
    value : str
        Name of the column to put on the x axis.
    tight : bool, default False
        When true, ``fig.tight_layout()`` is applied after drawing.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``train`` frame, which
        is what the original implementation always used.
    '''
    if data is None:
        data = train  # fall back to the notebook-level training frame
    with plt.style.context('fivethirtyeight'):
        fig, ax = plt.subplots(1, 1, figsize=(19, 6))
        # No `palette` argument: seaborn only applies a palette when `hue` is
        # set, so passing one here had no effect on the figure.
        sns.histplot(x=value, data=data, kde=True, ax=ax)
        # Vertical tick labels so the x labels do not overlap.
        ax.tick_params(axis='x', rotation=90)
        if tight:
            fig.tight_layout()

**Category Visualization**

In [None]:
# Distribution of the target variable (crime category).
visualization_crime('Category', tight=False)

**DayOfWeek Visualization**

In [None]:
# Distribution of incidents over the days of the week.
visualization_crime('DayOfWeek')

**PdDistrict Visualization**

In [None]:
# Distribution of incidents over police districts.
visualization_crime('PdDistrict')

**Resolution Visualization**

In [None]:
# Distribution of incidents over resolution outcomes.
visualization_crime('Resolution')

**Longitude Visualization**

In [None]:
# Distribution of X, which the data-field list describes as the longitude.
visualization_crime('X')

------

-------
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:200%;text-align:center;border-radius:10px 10px;">Latitude and longitude scatter chart</h1>

- Since the latitude and longitude information exists, it is visualized as a scatter plot.
    - In this way, the shape of the scatter plot displays the shape of the city. This will become visually clearer as the number of data increases.
    - What we can see here is that the Latitude and Longitude of the data clearly show the coordinates and are accurate. Using Latitude and Longitude without visualization is dangerous in my experience.
    - The following scatter plot shows the location of the crimes. The following scatterplot is based on 250,000/ALL, to make it clear whether the data is actual latitude and longitude.
        - It is also possible to specify conditions to display only specific crimes. See Folium below for details.

In [None]:
# Scatter the first 250,000 incidents by coordinate; colour and marker size
# both encode the crime category. (878049 in the original note is presumably
# the full row count — confirm.)
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(19, 19))
    sns.scatterplot(
        data=train.iloc[:250000],
        x='X', y='Y',
        hue='Category', size='Category',
        palette='rocket', alpha=0.6,
        ax=ax,
    )
    # Place the (long) legend outside the plotting area.
    ax.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

- Data subdivided by DayOfWeek. It is a little difficult to see because of the large number of data, but it has a left-leaning shape over the weekend.
- All these problems can be solved with folium. This is just a pre-implementation test display.

In [None]:
# Scatter the first 50,000 incidents by coordinate; colour and marker size
# both encode the day of the week.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(19, 19))
    sns.scatterplot(
        data=train.iloc[:50000],
        x='X', y='Y',
        hue='DayOfWeek', size='DayOfWeek', sizes=(20, 200),
        palette='flare', alpha=0.6, markers=True,
        ax=ax,
    )

----
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;"><b>Folium [1]</b></h1>

> In Folium [2], the year-by-year transition (animation) and a clustered marker view are implemented.

- Functionalization
- The pandas query is used to speed up the process (actually, Numexpr) and to display the heatmap of the data according to multiple conditions.
    - The query shows that the ASSAULT data is concentrated around downtown and Chinatown. The reason why there are so many ASSAULTs in the Bay Area is because they tend to gather in groups.
    - The following EDA makes it clear that the probability of Crime increases when a group of people get together.

In [None]:
def query_heat_map(query, location=(37.774599, -122.425892)):
    '''Build a Folium heat map of the ``train`` rows selected by a pandas query.

    Parameters
    ----------
    query : str
        Expression passed to ``train.query``.
    location : sequence of two floats
        ``[latitude, longitude]`` the map is centred on. The default is an
        immutable tuple so it cannot be altered between calls.

    Returns
    -------
    folium.Map
        Dark-themed map with one heat point per selected row.
    '''
    # Folium expects [lat, lon]; in this dataset Y is latitude and X is longitude.
    train_query = train.query(query).loc[:, ['Y', 'X']]
    if train_query.shape[0] == 0:
        # Deliberately not an error: the caller still gets a (blank) map back.
        print('Either the query is failing or there is no data itself.')
    m = folium.Map(location=list(location), zoom_start=13, tiles='CartoDB dark_matter')
    HeatMap(train_query.values.tolist(), blur=2, radius=3).add_to(m)
    return m

In [None]:
# Heat map of ASSAULT incidents whose resolution is 'ARREST, BOOKED'.
query_heat_map("Category=='ASSAULT' & Resolution=='ARREST, BOOKED'")

---------
<p style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:200%;text-align:center;border-radius:10px 10px;">Timestamp operations</p>

1. when reading index using Dates

2. read Dates as Timestamp, and split and convert the Timestamp data. It is not necessary to convert the data to minutes and seconds, but we can assume that we can perform effective lineplotting by converting the data to hours. This will allow for effective visualization to a third party. For example, what time of the day has the highest crime rate using the .count() statistic.

3. If time series data exists, it is best practice to actively convert it to time series for use.

**1. read index as Timestamp.**
- What kind of transformation is possible?
- We are not actually using this data as Pre-Processing.

In [None]:
# Re-read the training data with Dates as the index; parse_dates=True asks
# pandas to parse that index as datetimes, which enables the partial-string
# selections (.loc['2015'], .loc['2015-05-13']) used below.
train_timestamp = pd.read_csv('../input/sf-crime/train.csv.zip', parse_dates=True, index_col='Dates')

In [None]:
# Partial-string indexing: first two incidents recorded on 2015-05-13.
train_timestamp.loc['2015-05-13'].head(2)

In [None]:
# Inspect the index (expected to be a DatetimeIndex after parsing).
train_timestamp.index

- Strings are not statistically processed. It is an average of each latitude and longitude, and this average can be used as the basis for map-location in Folium.

In [None]:
# Average only the numeric columns (the coordinates) of the 2015 incidents.
# numeric_only=True is required on current pandas, where mean() raises a
# TypeError on string columns instead of silently dropping them.
train_timestamp.loc['2015'].mean(numeric_only=True)

In [None]:
# Non-null count per column for the incidents recorded in 2015.
train_timestamp.loc['2015'].count()

In [None]:
# Non-null count per column for the incidents recorded in 2013, for comparison.
train_timestamp.loc['2013'].count()

In [None]:
# Group rows that share the same index value (level 0 = the Dates timestamp).
mc = train_timestamp.groupby(level=0)

In [None]:
# Per-timestamp non-null counts of every column (first three timestamps).
mc.count().head(3)

----
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;"><b>SigmaClip</b></h1>
- SigmaClip is used to remove outliers. This method can be used to remove wrong dates, etc. (32 days, etc.) at once.

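- The clipping band is centred on the median (a robust estimate of the mean) and uses σ ≈ 0.74 × (Q3 − Q1) as a robust estimate of the standard deviation; 0.74 is the factor that converts the interquartile range of a Gaussian distribution into its standard deviation. The derivation is not described here.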

- Use query to speed up the process.

In [None]:
# Separate copy of the training data used for the frequency analysis below.
# NOTE(review): with no index_col, parse_dates=True has no date index to
# parse, so Dates presumably stays a string column here — confirm.
train_plot = pd.read_csv('../input/sf-crime/train.csv.zip', parse_dates=True)

In [None]:
# Relative frequency of every crime category (most frequent first) plus the
# running total of those frequencies.
# rename_axis(...) / reset_index(name=...) pin the column names explicitly, so
# the result does not depend on how the installed pandas version names the
# value_counts() output (the old 'index'/'Category' rename breaks on pandas >= 2).
frequency_c = (
    train_plot['Category']
    .value_counts(normalize=True)
    .rename_axis('Category')
    .reset_index(name='Freq')
)
frequency_c['Cumsum_Freq'] = frequency_c['Freq'].cumsum()

In [None]:
# Index the frequency table by category name. Re-running this cell alone
# fails, because the 'Category' column has already been moved into the index.
frequency_c = frequency_c.set_index('Category')

In [None]:
# 25th, 50th and 75th percentiles of the cumulative frequencies.
freq_quantiles = np.percentile(frequency_c['Cumsum_Freq'], [25, 50, 75]) # array([0.84974358, 0.96996523, 0.99674847])

In [None]:
# Median of the cumulative frequencies: the centre of the clipping band.
qt = freq_quantiles[1] # 0.9699652297309147

In [None]:
# Robust spread estimate: 0.74 times the interquartile range (Q3 - Q1).
sigma = 0.74 * (freq_quantiles[2] - freq_quantiles[0]) # 0.10878362141520569

In [None]:
# Keep the categories whose cumulative frequency lies strictly inside
# median +/- 5 * sigma; the `@name` syntax reads the Python variables above.
query_freq = frequency_c.query("(Cumsum_Freq > @qt - 5 * @sigma) & (Cumsum_Freq < @qt + 5 * @sigma)") # Cumsum_Freq is Data_Columns

In [None]:
# Move the category names out of the index and back into a regular column.
query_freq = query_freq.reset_index()

In [None]:
# Plain Python list of the category names that survived the clipping.
query_freq_list = query_freq['Category'].tolist()

In [None]:
# Display the retained categories.
query_freq_list

In [None]:
# Training rows whose category survived the sigma clipping. The explicit
# .copy() makes this an independent frame, so columns can be added or replaced
# later without pandas raising SettingWithCopyWarning.
train_sigma = train.loc[train['Category'].isin(query_freq_list)].copy()

In [None]:
# Preview the filtered training frame.
train_sigma.head()

In [None]:
# Parse the timestamp strings into datetimes. assign() builds a new frame
# rather than writing into what may be a slice of `train`, which avoids the
# SettingWithCopyWarning the in-place assignment triggered.
train_sigma = train_sigma.assign(Dates=pd.to_datetime(train_sigma['Dates']))

In [None]:
# Parse the test timestamps into datetimes so the .dt accessor can be used later.
test['Dates'] = pd.to_datetime(test['Dates'])

--------
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;">Second EDA</h1>

* Using the knowledge obtained in the above and the results of data processing, perform a more detailed EDA, obtain knowledge, and make it one of the EDA materials for presentation to a third party.
* The explanation details of each visualization are omitted to some extent.

**Which Crime is the most common?**

In [None]:
# Horizontal bar chart of incident counts per category, most frequent on top.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(19, 19))
    sns.countplot(
        data=train_sigma,
        y='Category',
        order=train_sigma['Category'].value_counts().index,
        palette='rocket',
        ax=ax,
    )
    ax.tick_params(axis='x', rotation=45)

- Which incidents are most frequently dealt with by each police station?
    - This can be made clearer by showing the police stations as location points on the map.

In [None]:
# Incident counts for every (category, police district) combination.
cross_tab_train = pd.crosstab(train_sigma['Category'], train_sigma['PdDistrict'])

In [None]:
# Show the contingency table with cells shaded by count.
cross_tab_train.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Long format: one row per (Category, PdDistrict) pair with its count in 'value'.
stack_sigma_list = cross_tab_train.stack().reset_index().rename(columns= {0:'value'})

- From numerical visualization to visualization with bar plots. It is important to do both, so that we can diversify our knowledge.

In [None]:
# Grouped bars: per-district incident counts, one bar per category.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    sns.barplot(
        data=stack_sigma_list,
        x='PdDistrict', y='value', hue='Category',
        palette='rocket',
        ax=ax,
    )
    ax.set_title('Categories Count per District')
    # Keep the long category legend outside the plotting area.
    ax.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

----
- Base 0.8: keep only the most frequent categories, i.e. those whose cumulative frequency is below 0.8.

In [None]:
# Rebuild the category frequency table (most frequent first) with its running
# total, then keep the categories whose cumulative frequency is below 0.8.
# rename_axis(...) / reset_index(name=...) pin the column names explicitly, so
# the result does not depend on how the installed pandas version names the
# value_counts() output (the old 'index'/'Category' rename breaks on pandas >= 2).
frequency_c = (
    train_plot['Category']
    .value_counts(normalize=True)
    .rename_axis('Category')
    .reset_index(name='Freq')
)
frequency_c['Cumsum_Freq'] = frequency_c['Freq'].cumsum()
frequency_50_list = list(frequency_c.loc[frequency_c['Cumsum_Freq'] < 0.8, 'Category'])

In [None]:
# Training rows belonging to the most frequent categories. The explicit
# .copy() makes this an independent frame, so columns can be added or replaced
# later without pandas raising SettingWithCopyWarning.
train_50_list = train.loc[train['Category'].isin(frequency_50_list)].copy()

In [None]:
# Parse the timestamp strings into datetimes. assign() builds a new frame
# rather than writing into what may be a slice of `train`, which avoids
# SettingWithCopyWarning.
train_50_list = train_50_list.assign(Dates=pd.to_datetime(train_50_list['Dates']))

In [None]:
# Horizontal bar chart of incident counts for the most frequent categories.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(19, 19))
    sns.countplot(
        data=train_50_list,
        y='Category',
        order=train_50_list['Category'].value_counts().index,
        palette='rocket',
        ax=ax,
    )
    ax.tick_params(axis='x', rotation=45)

In [None]:
# Incident counts per (category, district) for the most frequent categories,
# displayed with cells shaded by count.
ct_50_list = pd.crosstab(train_50_list['Category'], train_50_list['PdDistrict'])
ct_50_list.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Long format: one row per (Category, PdDistrict) pair with its count in 'value'.
stack_50_list = ct_50_list.stack().reset_index().rename(columns= {0:'value'})

In [None]:
# Grouped bars: per-district counts for the most frequent categories.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    sns.barplot(
        data=stack_50_list,
        x='PdDistrict', y='value', hue='Category',
        palette='rocket',
        ax=ax,
    )
    ax.set_title('Categories Count per District')

---------
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;">Pre-Processing & Feature Engineering</h1>

- Data transformation and feature engineering are performed using the knowledge obtained in the above.
- For the time-of-day segmentation, we refer to the time division table published by the Japan Meteorological Agency (an agency of Japan's Ministry of Land, Infrastructure, Transport and Tourism).
- Features that correspond to the "Explanation" will be deleted in subsequent Pipelines. This is because the conversion is very difficult (e.g., using NLP) and because this feature does not exist in the test set, so adding explanations to the test set is not likely to be a best practice. However, if you want to make accurate predictions, removing it completely is a bad idea.

In [None]:
# Compare the sizes of the two filtered training frames.
train_sigma.shape, train_50_list.shape

In [None]:
def _street_type(address):
    '''Return the first two-character token of ``address``, or None if there is none.

    The tokens 'OF', 'US', 'LA' and 'of' are skipped.
    '''
    for token in address.split(' '):
        if len(token) == 2 and token not in ('OF', 'US', 'LA', 'of'):
            return token
    return None


def time_group(date):
    '''Add calendar, time-of-day and street-type features to a crime frame.

    The frame is modified IN PLACE and also returned, so callers that must
    keep their input unchanged should pass a copy.

    Parameters
    ----------
    date : pandas.DataFrame
        Must contain ``Dates`` (datetime64), ``DayOfWeek`` (English day
        names) and ``Address`` (string) columns.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added:

        * ``Date, Year, Month, Day, Hour, Minute, Second`` - parts of ``Dates``
        * ``Holiday`` - True when the calendar day is a US federal holiday
        * ``Week_Mapping`` - Monday=0 ... Sunday=6
        * ``Street_type`` - first two-character address token (see ``_street_type``)
        * ``HourGroup`` - 0 for 03-08h, 1 for 09-14h, 2 otherwise
        * ``MAGroup`` - 0 for morning (00-11h), 1 for afternoon (12-23h)
        * ``DNGroup`` - 0 for daytime (09-17h), 1 for night-time
        * ``DworkGroup`` - True for 08-18h inclusive
        * ``Week_cat`` - 'Weekday' (Mon-Fri) or 'Weekend'
        * ``Month_quarter`` - calendar quarter 1-4
        * ``Month_sep`` - 1 for days 1-9, 2 for days 10-19, 3 for days 20-31
    '''
    dates = date['Dates']

    date['Date'] = dates.dt.date
    date['Year'] = dates.dt.year
    date['Month'] = dates.dt.month
    date["Day"] = dates.dt.day
    date["Hour"] = dates.dt.hour
    date["Minute"] = dates.dt.minute
    date["Second"] = dates.dt.second

    # Compare at day resolution. normalize() replaces the unit-less
    # astype('datetime64') cast, which recent pandas versions reject, and
    # normalising the window start keeps a holiday on the first day inside it.
    days = dates.dt.normalize()
    holidays = calendar().holidays(start=days.min(), end=days.max())
    date['Holiday'] = days.isin(holidays)

    # Week_replace
    week_mapping = {'Saturday': 5, 'Sunday': 6, 'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4}
    date['Week_Mapping'] = date['DayOfWeek'].map(week_mapping)

    date['Street_type'] = date['Address'].apply(_street_type)

    hour = date['Hour']
    # Time-of-day bands after the Japan Meteorological Agency time scale:
    # https://www.jma.go.jp/jma/kishou/know/yougo_hp/saibun.html
    date['HourGroup'] = np.select([hour.between(3, 8), hour.between(9, 14)], [0, 1], default=2)
    # The original conditions read `x >= 0 and x > 12` / `x >= 9 and x > 18`:
    # the second comparison used the wrong operator, so the lower bound was
    # dead code and the boundary hours were misfiled.
    date['MAGroup'] = np.where(hour < 12, 0, 1)
    date['DNGroup'] = np.where(hour.between(9, 17), 0, 1)
    date['DworkGroup'] = hour.between(8, 18)

    date['Week_cat'] = np.where(date['Week_Mapping'] < 5, 'Weekday', 'Weekend')
    # Q1:1 , Q2:2, Q3:3, Q4:4
    date['Month_quarter'] = (date['Month'] - 1) // 3 + 1
    # T1:1, T2:2, T3:3
    day = date['Day']
    date['Month_sep'] = np.select([day < 10, day < 20], [1, 2], default=3)

    return date

In [None]:
# Work on a copy: time_group() adds its columns in place, and train_sigma
# should stay as it is.
train_sigma_1 = train_sigma.copy()

In [None]:
# Copy of the "most frequent categories" frame for feature engineering.
# The original copied train_sigma here, which made train_50_list_1 (and the
# train_group_1 frame derived from it) a duplicate of train_sigma_1.
train_50_list_1 = train_50_list.copy()

In [None]:
# Main training frame with the engineered time and street features.
train_group = time_group(train_sigma_1) # Main_train_group

In [None]:
# Same features for the test set. time_group() mutates its argument, so
# `test` itself also gains these columns and test_group is the same object.
test_group = time_group(test) # Main_Test

In [None]:
# Shapes after feature engineering.
train_group.shape, test_group.shape

In [None]:
# Engineered features for the second (train_50_list_1) training frame.
train_group_1 = time_group(train_50_list_1)

------
**Check the data after conversion.**
- Obviously, there is not much data for 2015, because the data only exists up to the middle period as described above.

In [None]:
# Incident count per year for each of the two engineered training frames
# (one-column frames indexed by Year).
year_data = train_group.groupby('Year')['Category'].count().to_frame()
year_data_1 = train_group_1.groupby('Year')['Category'].count().to_frame()

In [None]:
# Turn the Year index into a regular column.
year_data = year_data.reset_index()

In [None]:
# Yearly counts shown as a wide table (one column per year), shaded by value.
year_data.T.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Remove the incomplete final year (2015). Filtering on the Year value
# instead of dropping the hard-coded row label 12 does not depend on row
# positions and keeps the cell safe to re-run (a second drop(12) raised
# KeyError).
year_data = year_data.loc[year_data['Year'] != 2015]

In [None]:
# Yearly counts again, now without the partial final year.
year_data.T.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

--------
**Visualization with new features**

In [None]:
# Non-null counts of every column for each (Category, HourGroup) pair;
# as_index=False keeps the two keys as ordinary columns.
hour_vs_cate = train_group.groupby(['Category', 'HourGroup'], as_index=False).count()

In [None]:
# Display the grouped counts, shaded by value.
hour_vs_cate.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Reshape to an HourGroup x Category table; the count of the 'Dates' column
# is used as the number of incidents in each cell.
hour_vs_cate_pv = hour_vs_cate.pivot(index='HourGroup', columns='Category', values='Dates') #.fillna(0)

In [None]:
# Display the pivoted counts, shaded by value.
hour_vs_cate_pv.style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

In [None]:
# Share of each category's incidents that falls into each hour group (every
# column sums to 1). DataFrame.sum() skips missing cells, whereas the builtin
# sum() used before turned a whole column into NaN as soon as one hour group
# had no incidents for that category.
hour_share = hour_vs_cate_pv.div(hour_vs_cate_pv.sum(axis=0), axis=1)
fig, ax = plt.subplots(figsize=(50, 5))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(hour_share, square=True, annot=True, ax=ax)

In [None]:
# Incident counts per (weekday/weekend, hour), drawn as one continuous line.
de_time = train_group.groupby(['Week_cat', 'Hour'])['Category'].count()
with plt.style.context('fivethirtyeight'):
    de_time.plot(figsize=(19, 6))

- Number of Crime Occurrences on Weekend,day
- The probability of a Crime occurring at 12:00 increases. This means that in order to form a group, a situation where such a situation can inevitably occur is created.
- The second visualization also shows a temporary increase in the occurrence of Crime at 12:00.

In [None]:
# Hourly incident counts, one line for weekdays and one for weekends.
de_time = train_group.groupby(['Week_cat', 'Hour'])['Category'].count()
with plt.style.context('fivethirtyeight'):
    for week_part, legend_label in (('Weekday', 'Week_Day'), ('Weekend', 'Week_End')):
        de_time.loc[week_part].plot(figsize=(19, 6), label=legend_label)
    plt.title('Week, Day or End Crimes')
    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

In [None]:
# Incidents per hour of day, one line per category (missing combinations count as 0).
with plt.style.context('fivethirtyeight'):
    pt = train_group[['Hour', 'Category']].pivot_table(
        index="Hour", columns="Category", aggfunc=len, fill_value=0,
    )
    pt.plot(figsize=(30, 10))
    plt.title('Hour Crimes')
    # Legend outside the axes so it does not cover the lines.
    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

In [None]:
# Incidents per year, one line per category (missing combinations count as 0).
with plt.style.context('fivethirtyeight'):
    pt = train_group[['Year', 'Category']].pivot_table(
        index='Year', columns='Category', aggfunc=len, fill_value=0,
    )
    pt.plot(figsize=(30, 10))
    # Legend outside the axes so it does not cover the lines.
    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

In [None]:
# Total incidents per year as a single line.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(figsize=(30, 10))
    sns.lineplot(data=year_data, x='Year', y='Category', ax=ax)
    ax.set_title('Year Crimes')

In [None]:
# Incidents per calendar month, one line per category (missing combinations count as 0).
with plt.style.context('fivethirtyeight'):
    pt = train_group[['Month', 'Category']].pivot_table(
        index='Month', columns='Category', aggfunc=len, fill_value=0,
    )
    pt.plot(figsize=(30, 10))
    # Legend outside the axes so it does not cover the lines.
    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')

In [None]:
# Pair each incident's Year/Category with its coordinates by joining on the
# original row label ('index' after reset_index), then index the result by Year.
test_1 = (
    train_group[['Year', 'Category']]
    .reset_index()
    .merge(train_group[['Y', 'X']].reset_index(), on=['index'])
    .set_index('Year')
)

In [None]:
# Non-null counts of every column for each (Category, Year) pair.
test_g = train_group.groupby(["Category", "Year"]).count()

In [None]:
# Category x Year table of incident counts (taken from the 'Dates' count), shaded by value.
test_g.unstack()['Dates'].style.background_gradient(cmap='mako_r', text_color_threshold=0.02)

-----
<h1 style="background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;">Folium [2]</h1>

* Perform dynamic visualization with Folium.

In [None]:
# Every incident's year in ascending order (one list entry per row of train_group).
time_series = list(np.sort(train_group['Year']))

In [None]:
# Latitude (Y), longitude (X) and year of every ASSAULT incident.
train_query_move = train_group.query("Category=='ASSAULT'").loc[:, ['Y', 'X', 'Year']]

In [None]:
# Index by Year so one year's rows can be selected with .loc;
# drop=False keeps Year as a column as well.
train_query_move = train_query_move.set_index('Year', drop=False)

In [None]:
# Distinct years with ASSAULT incidents, in ascending order.
test_11 = np.sort(train_query_move.index.unique())

In [None]:
# One list of [lat, lon] points per year, in ascending year order.
# Only Y and X are kept: HeatMapWithTime reads a third value as the point's
# weight, and the Year column (kept by set_index(drop=False)) is not a weight.
# The list selector .loc[[year]] keeps the result two-dimensional even when a
# year has a single incident.
train_move_list = [
    train_query_move.loc[[year], ['Y', 'X']].values.tolist()
    for year in np.sort(train_query_move.index.unique())
]

In [None]:
# Coordinates of ASSAULT incidents resolved as 'ARREST, BOOKED', taken from
# the unfiltered training frame.
train_query = train.query("Category=='ASSAULT' & Resolution=='ARREST, BOOKED'").loc[: ,['Y','X']]

In [None]:
# Plain list of [lat, lon] pairs, the point format the Folium layers below take.
train_query_geo_list = train_query.values.tolist() # default feat map array

In [None]:
# Frame labels for the animation: the distinct years, in the same ascending
# order used to build train_move_list.
index_time = np.sort(train_query_move.index.unique()).tolist()

In [None]:
# Named reference points in San Francisco, keyed by location name. A single
# dict replaces one variable per location, so the points can be drawn in bulk.
# Each value is a one-element list holding a [latitude, longitude] pair, which
# is the shape geo_polygon() unpacks.
sanfrancisco_location = {
    'down_town': [[37.77493, -122.419416]],
    'union_square': [[37.786163522, -122.404498382]],
    'fishermans_wharf': [[37.80499678, -122.409331696]],
    'china_town': [[37.790163506, -122.404331716]],
    'soma': [[37.777311, -122.411083]],
    'oak_street': [[37.77412, -122.431384]],
    'twin_peaks': [[37.751586275, -122.447721511]],
    'southeast': [[37.7369444, -122.3941667]],
    'bernal_heights': [[37.744385, -122.417046]],
    'mission_district': [[37.76, -122.42]],
    'noe_valley': [[37.7502, -122.4337]],
    # Key corrected from the misspelt 'height_ashbury'; it is shown in the map popup.
    'haight_ashbury': [[37.770015, -122.446937]],
    'presidio': [[37.798085, -122.466538]],
}

In [None]:
def geo_polygon(lat_lon):
    '''Return the corners of a small square centred on a single point.

    ``lat_lon`` must hold exactly one ``[latitude, longitude]`` pair. The
    corners lie 0.001 degrees from the centre in each direction and are
    returned as ``(lat, lon)`` tuples in ring order: SW, SE, NE, NW.
    '''
    (lat, lon), = lat_lon  # ValueError unless there is exactly one pair
    offset = pow(10, -3)
    south, north = lat + -1 * offset, lat + 1 * offset
    west, east = lon + -1 * offset, lon + 1 * offset
    return [(south, west), (south, east), (north, east), (north, west)]

In [None]:
def _add_location_polygons(location, target_map):
    '''Draw one red square per named location on ``target_map``; the name is the popup.'''
    for location_name, lat_lon in location.items():
        lat_lon_locate = geo_polygon(lat_lon)
        folium.Polygon(locations=lat_lon_locate, color="red", weight=1, fill=True, fill_opacity=0.1, popup=location_name).add_to(target_map)


def location_point(location, target_map=None):
    '''Display the location points in bulk from a ``{name: [[lat, lon]]}`` dict.

    Parameters
    ----------
    location : dict
        Maps a location name to a one-element list holding ``[lat, lon]``.
    target_map : folium.Map, optional
        Map to draw on. Defaults to the notebook-level map ``m``.
    '''
    _add_location_polygons(location, m if target_map is None else target_map)


def location_point_1(location, target_map=None):
    '''Same as ``location_point`` but defaults to the notebook-level map ``m_1``.'''
    _add_location_polygons(location, m_1 if target_map is None else target_map)

In [None]:
# Dark-themed base map for the animated heat map.
m = folium.Map(location=[37.774599, -122.425892], zoom_start=13, tiles='CartoDB dark_matter')

In [None]:
# Draw the named location squares on `m`.
location_point(sanfrancisco_location)

In [None]:
# Animated heat map of ASSAULT incidents: one frame per year, labelled by index_time.
# The commented-out call below is an earlier variant without a time index.
#HeatMapWithTime(train_query_geo_list,auto_play=False,radius=40,max_opacity=1,gradient={0.1: 'blue', 0.25: 'lime', 0.5:'yellow',0.75: 'red'}).add_to(m)
HeatMapWithTime(train_move_list, index=index_time, auto_play=False, radius=1 , max_opacity=1, gradient={0.1: 'blue', 0.25: 'lime', 0.5:'yellow',0.75: 'red'}).add_to(m)

## Scatterplot of ASSAULT from 2003-2015.
- Dynamically displayed with the play button. fps uses the default of 10, but can be increased by raising the slide bar.
- The red frame is the location point, and clicking on it will display the location name. The reason for the red frame is just to show that this kind of display is possible. The pin is probably the most effective.

In [None]:
# Render the animated map.
m

In [None]:
# Second dark-themed base map, used for the clustered marker view.
m_1 = folium.Map(location=[37.774599, -122.425892], zoom_start=13, tiles='CartoDB dark_matter')

In [None]:
# Cluster layer on m_1; the markers are added to this layer, not to the map directly.
marker_cluster = MarkerCluster().add_to(m_1)

In [None]:
# Draw the named location squares on `m_1`.
location_point_1(sanfrancisco_location)

In [None]:
# One marker (popup text 'A') per arrested-and-booked assault, added to the cluster layer.
for lat_lon in train_query_geo_list:
    folium.Marker(lat_lon, popup='A').add_to(marker_cluster)

## Clustering view of ASSAULT from 2003 to 2015.
- You can visualize the clusters either by scrolling the map or by clicking on the target parcel. Because the latitude and longitude are accurately recorded, you can clearly pinpoint where the incident is occurring when you scroll down to the smallest unit. Due to the performance problem of the PC I created it on, the pins are all marked with 'A', but it is possible to display the details of the incident itself, and since there is a popup at the minimum unit marker that can display what it is, the details of the Crime can be displayed there.
- The details of the crime can be displayed in the popup. To do this, the ID is linked to the lat_lon management, and the details of the crime are extracted from the linked item and displayed in the popup.

In [None]:
# Render the clustered marker map.
m_1

----
**Visualize what time of the week each Crime is happening.**
- This will give you a clearer idea of the nature of the case and the background of the case as knowledge.

In [None]:
# Display order of the days on the weekly plots: Saturday first, Friday last.
sorted_map = {
    day_name: position
    for position, day_name in enumerate(
        ['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
    )
}

In [None]:
def dayOfWeek_plot(data, string='ASSAULT'):
    """Draw a line plot of how many incidents of one category fall on each weekday.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the 'Category' and 'DayOfWeek' columns.
    string : str, default 'ASSAULT'
        Value of 'Category' to select before counting.

    Returns
    -------
    None
        The figure is created as a side effect.

    Notes
    -----
    Weekdays are ordered with the module-level ``sorted_map``.
    """
    with plt.style.context('fivethirtyeight'):
        week_data = pd.DataFrame(data[data['Category'] == string].groupby(by=['DayOfWeek'])['Category'].count()).reset_index()
        week_data['SortedDayOfWeek'] = week_data['DayOfWeek'].map(sorted_map)
        week_data = week_data.sort_values('SortedDayOfWeek').drop('SortedDayOfWeek', axis=1)

        fig, ax = plt.subplots(1, 1, figsize = (19, 6))
        # sort=False keeps the rows in the weekday order established above instead of
        # letting seaborn re-sort them; ax=ax draws on the figure created here explicitly.
        sns.lineplot(x=week_data['DayOfWeek'], y=week_data['Category'], sort=False, ax=ax)
        ax.set_title('{} Crimes. Week'.format(string))
        # After the groupby/count the 'Category' column holds counts, not category names.
        ax.set_ylabel('Number of incidents')

In [None]:
# Single-category example: weekday profile of ASSAULT incidents.
dayOfWeek_plot(train_group, 'ASSAULT')

In [None]:
# Scratch check of the distinct category names; the trailing ';' suppresses the cell output.
list(train_group['Category'].unique()); # test code

In [None]:
# Apply dayOfWeek_plot to every crime category: one figure per category.
for category_name in train_group['Category'].unique():
    dayOfWeek_plot(train_group, category_name)

**Holiday**

In [None]:
# Contingency table: rows are crime categories, columns are the Holiday flag, cells are incident counts.
ct_holiday_cat = pd.crosstab(index=train_group['Category'], columns=train_group['Holiday'])
# Shown transposed so the many categories run across the page.
ct_holiday_cat.T

In [None]:
# Reshape the crosstab to long form: one row per (Category, Holiday) pair with the count in 'value'.
stacked = ct_holiday_cat.stack().reset_index().rename(columns={0:'value'})

In [None]:
# Turn counts into proportions: each count is divided by the number of training rows
# that carry the same Holiday flag. Done as a single float assignment so no float
# values are written in place into the integer count column.
# NOTE: like the original, re-running this cell divides the already-normalised values again.
holiday_totals = train_group['Holiday'].value_counts()
stacked['value'] = stacked['value'].astype(float) / stacked['Holiday'].map(holiday_totals)

In [None]:
# Show the normalised long-form table, transposed for width.
stacked.T

- Holiday Plot

In [None]:
# Grouped bars: per-category share of incidents, split by the Holiday flag.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(figsize=(19, 6))
    bar = sns.barplot(
        data=stacked,
        x='Category',
        y='value',
        hue='Holiday',
        palette=['#682F2F', '#F3AB60'],
        ax=ax,
    )
    ax.set_title('Proportions of crimes during regular days vs holidays')
    # Category names are long; rotate them so they stay readable.
    ax.tick_params(axis='x', rotation=90)

In [None]:
# Contingency table: rows are crime categories, columns are the DworkGroup flag, cells are incident counts.
ct_business_hrs_cat = pd.crosstab(index=train_group['Category'], columns=train_group['DworkGroup'])
# Shown transposed so the many categories run across the page.
ct_business_hrs_cat.T

In [None]:
# Long form of the crosstab: one row per (Category, DworkGroup) pair with the count in 'value'.
stacked = ct_business_hrs_cat.stack().reset_index().rename(columns={0:'value'})
# Turn counts into proportions: each count is divided by the number of training rows
# with the same DworkGroup flag. A single float assignment avoids writing floats
# in place into the integer count column.
dwork_totals = train_group['DworkGroup'].value_counts()
stacked['value'] = stacked['value'].astype(float) / stacked['DworkGroup'].map(dwork_totals)
stacked.T

In [None]:
# Grouped bars: per-category share of incidents, split by the DworkGroup (business hours) flag.
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(1, 1, figsize = (19, 6))
    bar = sns.barplot(x=stacked['Category'], y=stacked['value'], hue=stacked['DworkGroup'], palette=['#682F2F', '#F3AB60'], ax=ax)
    # The title previously repeated the holiday plot's wording; this plot compares DworkGroup values.
    bar.set_title('Proportions of crimes during business hours vs non-business hours')
    # Category names are long; rotate them so they stay readable.
    ax.tick_params(axis='x', rotation=90)

----------
**TimeStamp Conversion**
- It is also important to infer the data by switching the time series data to the display of business days, quarters, etc., but this is only a conversion and will be omitted.

In [None]:
# Peek at the first two rows before the timestamp conversion.
train_group.head(2)

In [None]:
# Work on a copy so the re-indexing below does not touch train_group.
train_group_2 = train_group.copy()

In [None]:
# Move the 'Dates' column into the index so rows can be selected by date.
train_group_2 = train_group_2.set_index('Dates')

In [None]:
# Label-based selection on the 'Dates' index with the string '2014'; first two matching rows.
# NOTE(review): assumes the index is datetime-like so '2014' selects the whole year — confirm.
train_group_2.loc['2014'].head(2)

In [None]:
# Parse the index values as datetimes.
freq_d = pd.to_datetime(train_group_2.index)

In [None]:
# Build a frame from train_group_2 using the parsed datetimes as its index.
# NOTE(review): this variable is not used again in the visible part of the notebook.
train_group_freq_d = pd.DataFrame(train_group_2, index=freq_d)

In [None]:
# Example conversion: express the index as quarterly periods (year ending in December); result is only displayed.
train_group_2.to_period('Q-DEC').head(2)

------
<h1 style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>Main : Pre-Process-Pipeline</b></h1>


- What the next process needs is a process that can accurately categorize.
    - Address, Resolution, and Descript contain values that are not cleanly unique, so they need to be processed.
    
    - Address : Je(street_type)
        - In addition, it may be possible to add the characteristics of which block the street is located in (not done below).
       
- Category : target
- Delete Columns : Address, Dates

- In the pipeline passed to the model, we use the pipeline with StreetType deleted. This is done to avoid the situation where the category does not match when specifying the number of digits because it takes time to train the model depending on the PC environment.

In [None]:
# Training frame before the unused columns are dropped.
train_group.head(2)

In [None]:
# Test frame before the unused columns are dropped.
test_group.head(2)

In [None]:
# Remove the columns that are not fed to the model (raw timestamps, free text, address, street type).
train_group = train_group.drop(columns=['Dates', 'Descript', 'Resolution', 'Address', 'Date', 'Street_type']) # Street_type

In [None]:
# Same clean-up for the test frame, which additionally carries an 'Id' column.
test_group = test_group.drop(columns=['Id','Dates', 'Address', 'Date', 'Street_type']) # Street_type

In [None]:
# Target is the crime category; everything else is a feature.
y = train_group['Category']
X = train_group.drop(columns='Category')

In [None]:
# Target values as a NumPy array, in the same row order as X.
data_labels = y.to_numpy()

In [None]:
# Whatever remains after removing the categorical columns is treated as numeric.
data_num = X.drop(columns=['DayOfWeek', 'PdDistrict', 'Week_cat', 'DworkGroup','Holiday']) # 'Street_type'

In [None]:
# Column names of the numeric features (iterating a DataFrame yields its column labels).
number_attribs = list(data_num)

In [None]:
# Columns to one-hot encode; the same set that was removed from X to build data_num.
category_attribs = ['DayOfWeek', 'PdDistrict', 'Week_cat', 'Holiday','DworkGroup'] # 'Street_type'

In [None]:
# Numeric columns are standardised.
number_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

# Column-wise preprocessing: scale the numeric columns, one-hot encode the categorical ones.
main_pipeline = ColumnTransformer(transformers=[
    ('number', number_pipeline, number_attribs),
    ('categorie', OneHotEncoder(), category_attribs),
])

In [None]:
# Feature columns of the training frame, for comparison with the test frame's columns.
X.columns

In [None]:
# Columns of the test frame, for comparison with X.columns.
test_group.columns

-----
<p style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>Default Transform</b></p>

- The original Pipeline Transform.
- Due to the training time of the model, this is not used in this notebook.
- Clustering is not included in the transformation.

In [None]:
# Dtypes and non-null counts of the features before transforming.
X.info()

In [None]:
# Missing values per feature column.
X.isnull().sum()

In [None]:
# Fit the preprocessing on the full training features and transform them.
main_x = main_pipeline.fit_transform(X)

In [None]:
# Transform the test features with the preprocessing fitted on the training data (no refit).
main_test = main_pipeline.transform(test_group)

In [None]:
# Stratified 75/25 train/validation split of the fully transformed data.
# random_state is fixed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(main_x, data_labels, test_size=0.25, stratify=data_labels, shuffle=True, random_state=42)

In [None]:
# Sanity check: shapes of the four split parts.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

------
<h1 style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>Clustering</b></h1>

In [None]:
# K-Means with k-means++ initialisation and 10 restarts per fit.
model = KMeans(init='k-means++', n_init=10)
# Elbow visualiser wrapping the model; the candidate range for k is given as (1, 20).
visualizer = KElbowVisualizer(model, k=(1, 20))

In [None]:
# Run the elbow search on the transformed training features.
visualize = visualizer.fit(main_x)

In [None]:
# Draw the elbow curve; the trailing ';' hides the return value.
visualizer.show();

In [None]:
main_cluster_data = KMeans(n_clusters=visualizer.elbow_value_).fit_predict(main_x) # 6 or 5

In [None]:
main_cluster_test = KMeans(n_clusters=visualizer.elbow_value_).fit_predict(main_test)

In [None]:
# Copy of the (untransformed) training features that will receive the cluster column.
X_cluster = X.copy()

In [None]:
# Copy of the (untransformed) test features that will receive the cluster column.
test_cluster = test_group.copy()

- Assign the cluster you created. This will be transformed again, but it is better to do the Clustering before transforming.

In [None]:
# Attach the K-Means cluster id of each training row as a new 'Cluster' column.
X_cluster['Cluster'] = main_cluster_data

In [None]:
# Attach the K-Means cluster id of each test row as a new 'Cluster' column.
test_cluster['Cluster'] = main_cluster_test

---------------
<p style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>Second Cluster transform</b></p>

- First is practically the final transform, but the number of indices is so large that it takes a long time to process in a normal PC environment. To solve this problem, we can reduce some of the indices before transforming. This method is not the best, but rather a bad idea, but it is beneficial in terms of running the model. It is important to understand that the results obtained from this are not exact.
- The first step is to extract only a part of the data from the dataset with the clusters assigned to it before applying the transformation. I think this can be done with .sample() (since you can extract randomly).
- The number of datasets before extraction is 500,000, so we need to reduce this number to less than 100,000 (this can be adjusted).

In [None]:
#from sklearn.model_selection import StratifiedKFold
#kfold =StratifiedKFold(n_splits=5,shuffle=True,random_state=42)　

In [None]:
# Compare the full frame's shape with that of its first 100000 rows.
X_cluster.shape, X_cluster[:100000].shape

In [None]:
# Scratch check that random sampling works; the result is neither stored nor displayed (trailing ';').
X_cluster.sample(10000);

-----
- The following is not inherently necessary. This is a syntax to make the model run smoothly, not an inherent best practice. It is a worst-case scenario when used for anything other than running the model.
- Because Street is categorized, you may be told that you don't have enough categories unless you have the full sample of X_cluster, but this is not a best practice and you can run it a few times to get them all included.
- In the following, Street is removed.

In [None]:
#X_cluster_test = X_cluster.drop('Street_type', axis=1)

In [None]:
#test_cluster_test = test_cluster.drop('Street_type', axis=1)

In [None]:
# Rebuild the preprocessing for the cluster-augmented frames.
# 'Cluster' is a nominal id, so it is one-hot encoded with the other categorical columns.
# A ColumnTransformer drops every column it is not told about, so 'Cluster' must be listed
# here for the clustering step to reach the model.
category_attribs = ['DayOfWeek', 'PdDistrict', 'DworkGroup', 'Week_cat', 'Holiday', 'Cluster']
data_num = X_cluster.drop(columns=category_attribs) # 'Street_type'
number_attribs = list(data_num)

number_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

main_pipeline = ColumnTransformer([
    ("number", number_pipeline, number_attribs),
    # The pipeline is fitted on a sub-sample that may miss a rare category;
    # handle_unknown='ignore' encodes such values as all zeros instead of raising at transform time.
    ("categorie", OneHotEncoder(handle_unknown='ignore'), category_attribs),
])

In [None]:
main_x_1 = main_pipeline.fit_transform(X_cluster.sample(50000)) # X_cluster_test

In [None]:
main_test_1 = main_pipeline.transform(test_cluster) # test_cluster_test

-----
# Final training and validation sets to be used in this Notebook

In [None]:
X_train, X_val, y_train, y_val = train_test_split(main_x_1, data_labels[:50000], test_size=0.25, shuffle=True) # stratify=data_labels

In [None]:
# Sanity check: shapes of the final train/validation parts.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

--------
<h1 style= "background-color:#000000;font-family:Georgia;color:#FFFFFF;font-size:300%;text-align:center;border-radius:10px 10px;border-style:solid;border-width:3px;border-color:#000000;"><b>Model</b></h1>

- GridSearchCV, RandomizedSearchCV, CV, etc. are not used due to the low spec environment in which this Notebook was created.
- As mentioned above, we have not adjusted the hyperparameters of the Model, so the performance is quite low. I also did not visualize the training, so I cannot tell visually what kind of state it is in.
- For more information on training models, please refer to another Notebook I am working on.

- I have only created a minimum number of Models.
- Category prediction is being done in y_pred_gb. This means that it predicts what the incident will be, which also means that it is possible to predict what the incident will be and prepare to respond to it in advance if it occurs with the characteristics of a feature. It is also possible to predict the probability of an incident occurring. It's like Minority Report starring Tom Cruise.

# XGBC

In [None]:
# Multi-class XGBoost classifier evaluated with multi-class log loss.
# The keyword is `eval_metric`; `metric` is not an XGBClassifier parameter, so passing it
# does not select the log-loss metric.
xgb_model = XGBClassifier(use_label_encoder=True, eval_metric='mlogloss')

In [None]:
%timeit xgb_model.fit(X_train, y_train)

In [None]:
# Predicted category for every validation row.
y_pred_xgb = xgb_model.predict(X_val)

In [None]:
# Micro-averaged precision and recall, followed by the per-class report.
precision_xgb = precision_score(y_val, y_pred_xgb, average='micro')
recall_xgb = recall_score(y_val, y_pred_xgb, average='micro')
print(f'Precision : {precision_xgb} / Recall : {recall_xgb}')
print(classification_report(y_val, y_pred_xgb))

In [None]:
# Sorted union of true and predicted labels: the ordering confusion_matrix uses by default.
# Passing it explicitly lets the axes show category names instead of bare indices.
labels_xgb = np.union1d(y_val, y_pred_xgb)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_val, y_pred_xgb, labels=labels_xgb), display_labels=labels_xgb)
fig, ax = plt.subplots(1, 1, figsize=(19, 19))
# Category names are long; draw the x tick labels vertically so they do not overlap.
disp.plot(ax=ax, xticks_rotation='vertical');

## GradientBoostingClassifier

In [None]:
#gb_model = GradientBoostingClassifier()

In [None]:
#gb_model.get_params().keys()

In [None]:
#gb_param_grid = {'learning_rate':[0.1, 0.01, 0.001], 'max_depth':[5, 10], 'n_estimators':[10, 100, 200, 300]}

In [None]:
#UserWarning: The least populated class in y has only 5 members, which is less than n_splits=10. : stratify=data_labels
#gb_s_model = GridSearchCV(gb_model, gb_param_grid, cv=10, scoring='accuracy')
#gb_s_model.fit(X_train, y_train)

In [None]:
gb_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=5)
%timeit gb_model.fit(X_train, y_train)

In [None]:
# Predicted category for every validation row.
y_pred_gb = gb_model.predict(X_val)

In [None]:
# Micro-averaged precision and recall, followed by the per-class report.
precision_gb = precision_score(y_val, y_pred_gb, average='micro')
recall_gb = recall_score(y_val, y_pred_gb, average='micro')
print(f'Precision : {precision_gb} / Recall : {recall_gb}')
print(classification_report(y_val, y_pred_gb))

In [None]:
# Sorted union of true and predicted labels: the ordering confusion_matrix uses by default.
# Passing it explicitly lets the axes show category names instead of bare indices.
labels_gb = np.union1d(y_val, y_pred_gb)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_val, y_pred_gb, labels=labels_gb), display_labels=labels_gb)
fig, ax = plt.subplots(1, 1, figsize=(19, 19))
# Category names are long; draw the x tick labels vertically so they do not overlap.
disp.plot(ax=ax, xticks_rotation='vertical');

--------
# Other Models Creating

# Thanks for reading all the way through.
# Please give me an UPVOTE if you can. Your UPVOTE will be a great encouragement to me!