# 1. Packages Loading.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster # for clustering the markers
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import fbprophet
from scipy.stats import mode
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 2. Dataset Preparation.

**2.1 Read each dataset.**

In [None]:
# Load the three COBRA exports, in order: 2020 old-RMS, 2020 current, 2009-2019.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/atlanta-crime-data2020/COBRA-2020-OldRMS-09292020 (Corrected 11_25_20)/COBRA-2020-OldRMS-09292020.csv',
        '../input/atlanta-crime-data2020/COBRA-2020 (Updated 12_10_2020)/COBRA-2020.csv',
        '../input/atlanta-crime-data2020/COBRA-2009-2019 (Updated 1_9_2020)/COBRA-2009-2019.csv',
    )
)

**2.2 Drop unmatching and unnecessary columns for consistency.**

In [None]:
# Remove, from each frame, the columns that are not kept for the combined dataset.
df1 = df1.drop(
    ['apartment_office_prefix', 'apartment_number', 'watch', 'location_type', 'UCR_Number'],
    axis='columns',
)
df2 = df2.drop(['ibr_code'], axis='columns')
df3 = df3.drop(
    ['Apartment Office Prefix', 'Apartment Number', 'Shift Occurence', 'Location Type', 'UCR #', 'IBR Code'],
    axis='columns',
)

**2.3 Rename the df3 columns to maintain consistency with df1 and df2.**

In [None]:
# Relabel df3's columns with df1's column names, matched purely by position.
# NOTE(review): this assumes that, after the drops above, df3 has the same
# number of columns as df1 and in the same order — verify against the CSV
# headers, since a reordered column would be silently mislabelled.
df3.columns = df1.columns

**2.4 Drop the special rows that have a 'T' in the 'occur_time' field.**

In [None]:
# Keep only the rows whose 'occur_time', rendered as text, consists solely of
# decimal digits (this discards malformed values and missing ones alike).
has_digit_only_time = df3['occur_time'].map(lambda value: str(value).isdecimal())
# .copy() detaches the filtered result from the original frame, so adding
# columns to df3 afterwards does not raise SettingWithCopyWarning.
df3 = df3[has_digit_only_time].copy()

In [None]:
# def isNotDecimal(x):
#     if str(x).isdecimal():
#         return False
#     else:
#         return True
# df3[df3['occur_hour'].apply(isNotDecimal)]

**2.5 Check 'occur_time' field.**

In [None]:
df3['occur_time'].value_counts()

**2.6 Add necessary columns for visualization purpose.**

In [None]:
# df1 and df2 share one layout: dates are '/'-separated strings whose pieces
# are taken as month, day, year (positions 0, 1, 2), and the hour is whatever
# precedes the first ':' in 'occur_time'. All derived columns stay strings.
for frame in (df1, df2):
    for prefix in ('rpt', 'occur'):
        date_parts = frame[prefix + '_date'].str.split('/')
        frame[prefix + '_month'] = date_parts.str[0]
        frame[prefix + '_day'] = date_parts.str[1]
        frame[prefix + '_year'] = date_parts.str[2]
    frame['occur_hour'] = frame['occur_time'].str.split(':').str[0]

# df3 stores dates as '-'-separated strings whose pieces are taken as
# year, month, day (positions 0, 1, 2). All derived columns stay strings.
for prefix in ('rpt', 'occur'):
    date_parts = df3[prefix + '_date'].str.split('-')
    df3[prefix + '_month'] = date_parts.str[1]
    df3[prefix + '_day'] = date_parts.str[2]
    df3[prefix + '_year'] = date_parts.str[0]

# The hour is everything before the last two characters of 'occur_time'.
# Zero-pad to four characters first: without it, a time shorter than three
# characters (e.g. "30", presumably 00:30 stored without leading zeros —
# confirm against the raw CSV) yields an empty hour and the row is later
# discarded by the numeric check on 'occur_hour'.
df3['occur_hour'] = df3['occur_time'].astype(str).str.zfill(4).str[:-2]

**2.7 Drop null data.**

In [None]:
# Discard every df3 row whose 'occur_hour', rendered as text, is not numeric.
df3 = df3.loc[df3['occur_hour'].map(lambda hour: str(hour).isnumeric())]

In [None]:
# Every df1 column must be non-null for a row to be kept, in all three frames;
# each frame is re-indexed from 0 afterwards.
keys = df1.columns
df1, df2, df3 = (
    frame.dropna(subset=keys).reset_index(drop=True)
    for frame in (df1, df2, df3)
)

**2.8 Concatenate the 3 datasets.**

In [None]:
# Stack the three cleaned frames into one, numbering the rows from 0 again.
df = pd.concat([df1, df2, df3], ignore_index=True)
df

**2.9 Deal with datatype.**

In [None]:
# Cast the beat and the derived date/time parts to 32-bit integers.
int_columns = [
    'beat',
    'occur_year', 'occur_month', 'occur_day', 'occur_hour',
    'rpt_year', 'rpt_month', 'rpt_day',
]
df = df.astype({column: 'int32' for column in int_columns})

# Build the datetime columns from the integer year/month/day parts instead of
# re-parsing the raw strings: those strings mix two layouts ('/'-separated
# from df1/df2 and '-'-separated from df3), which pandas >= 2.0 rejects when
# it infers a single format for the whole column.
for prefix in ('occur', 'rpt'):
    df[prefix + '_date'] = pd.to_datetime(
        pd.DataFrame({
            'year': df[prefix + '_year'],
            'month': df[prefix + '_month'],
            'day': df[prefix + '_day'],
        })
    )

In [None]:
df.info()

**2.10 Save the dataset to a csv file.**

In [None]:
# Write the combined dataset to 'df.csv' in the working directory, leaving out
# the row index.
df.to_csv('df.csv', index=False)

# 3. EDA

In [None]:
df

**Practice:**

In [None]:
# Incidents reported more than four calendar years after they occurred.
reported_late = (df['rpt_year'] - df['occur_year']) > 4
df.loc[reported_late, ['offense_id', 'occur_year', 'rpt_year']]

In [None]:
df.isnull().sum()

In [None]:
mode(df['rpt_month'])

The mode is the most common value of a field, which is why it is widely used as the fill value for `fillna()`.

In [None]:
df.groupby(['rpt_month']).size()

Conditional filter:

In [None]:
df.loc[df['rpt_year'] == 2020, : ]

Crosstab is important for EDA, useful for initial 'feel'.

In [None]:
pd.crosstab(df['rpt_year'], df['UC2_Literal'])

In [None]:
df.groupby(['rpt_year', 'UC2_Literal']).size().unstack(level = 1).plot(figsize=(12,10))

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
print(df)

In [None]:
df['occur_year'].value_counts().sort_index()

In [None]:
# Keep only incidents that occurred from 2009 through 2020 (inclusive).
# reset_index(drop=True) renumbers the rows 0..n-1 and returns a fresh frame:
# later label-based lookups such as df['lat'][i] would otherwise hit the gaps
# left by the removed rows, and later column assignments would write into a
# filtered slice (SettingWithCopyWarning).
df = df[(df['occur_year'] >= 2009) & (df['occur_year'] <= 2020)].reset_index(drop=True)

In [None]:
df['occur_year'].value_counts().sort_index()

In [None]:
df['occur_month'].value_counts().sort_index()

In [None]:
df['occur_day'].value_counts().sort_index()

In [None]:
df['occur_hour'].value_counts().sort_index()

**Deal with occur_hour: 24**

In [None]:
def updateOccurHour(hour):
    """Return 0 when *hour* equals 24; otherwise return *hour* unchanged."""
    return 0 if hour == 24 else hour
# Rewrite every 'occur_hour' value through updateOccurHour, element by element.
df['occur_hour'] = df['occur_hour'].map(updateOccurHour)

**Note: From this point on, 'occur_hour' is 'int64'.**

In [None]:
df['occur_hour'].value_counts().sort_index()

In [None]:
df.info()

In [None]:
df['rpt_month'].value_counts().sort_index()

In [None]:
df['rpt_day'].value_counts().sort_index()

In [None]:
df['rpt_year'].value_counts().sort_index()

In [None]:
df['lat'].describe()

In [None]:
df['long'].describe()

In [None]:
df['offense_id'].isna().sum()

**Q: What is 'beat' ?**

In [None]:
df['beat'].describe()

In [None]:
df['UC2_Literal'].value_counts()

In [None]:
df['neighborhood'].value_counts() 

**Q: What is 'npu' ?**

In [None]:
df['npu'].value_counts().sort_index()

**Use plot functions coming with Pandas.**

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=1).plot.bar(stacked=True)

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=0).plot.barh(stacked=True)

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=0).plot.hist(stacked=True, bins=20)

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=0).plot.box(vert=False, sym='r+')

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=1).plot.area()

In [None]:
df.plot.hexbin(x='occur_year', y='occur_month', gridsize=25)

In [None]:
df[df['occur_year'] == 2020].groupby(['occur_year', 'UC2_Literal']).size().plot.pie()

**Plot the number of incidents over the years**

In [None]:
df.groupby(['occur_year']).size().plot(figsize=(12,6))

The figure above shows that the number of incidents has been going down steadily.

In [None]:
df.groupby(['occur_year', 'UC2_Literal']).size().unstack(level=1).plot(figsize=(12,6))

It looks like the number of incidents has generally been decreasing over the years since 2009, which is a good sign of safer communities. However, one thing to notice is that 'LARCENY-FROM VEHICLE' incidents show a slightly increasing trend. We need to pay attention to that.

Number of incidents by month.

In [None]:
df.groupby(['occur_month']).size().plot(figsize=(12,6))

February seems to be the safest month. This may be because people involved in crime are temporarily in a better financial position, assuming they commit crimes for financial reasons. More concretely, government stipends paid at the end of the year may reduce the number of incidents. Note, however, that February also has fewer days than any other month, which by itself lowers its total.

In [None]:
df.groupby(['occur_month', 'UC2_Literal']).size().unstack(level=1).plot(figsize=(12,6))

Number of incidents over hours.

In [None]:
df.groupby(['occur_hour']).size().plot(figsize=(12,6))

In [None]:
df.groupby(['occur_hour', 'UC2_Literal']).size().unstack(level=1).plot(figsize=(12,6))

Crime peaks at around 8:00 and 12:00, which we can assume corresponds to breakfast and lunch time: incidents may happen while people are out for food.

In [None]:
# Vertical bar chart of the incident count per 'UC2_Literal' category.
ax = df.groupby(['UC2_Literal']).size().plot.bar(figsize=(12,6), title = '# of Incident Over Incident Category')
# plot.bar puts the categories on the x-axis and the counts on the y-axis, so
# the labels must follow that orientation (they were previously swapped).
ax.set_xlabel("Incident Category")
ax.set_ylabel("# of Incident")

In [None]:
df.groupby(['occur_month']).size()

In [None]:
df

**Geocoded location data for visualization.**

In [None]:
# Build an interactive map centred on the mean coordinate of all incidents.
# The object is named crime_map rather than map so the builtin map() stays
# usable, and the zoom is set through zoom_start, the keyword folium.Map
# actually defines (default_zoom_start was passed through and had no effect).
crime_map = folium.Map(location=[df['lat'].mean(), df['long'].mean()], zoom_start=12)
# add a marker for every plotted record, using a clustered view
marker_cluster = MarkerCluster().add_to(crime_map) # create marker clusters
# We can choose any number of incidents to plot on the map. The slice is
# positional, so it works whatever the index labels are and simply yields
# fewer rows when the frame holds fewer than 500.
plotted = df.iloc[:500]
for lat, lon, neighborhood, address in zip(
    plotted['lat'], plotted['long'], plotted['neighborhood'], plotted['location']
):
    tooltip = "Neighborhood: {}<br> Click for more".format(neighborhood)
    popup = """<i>Crime Address: </i> <br> <b>{}</b> <br>""".format(address)
    folium.Marker([lat, lon], popup=popup, tooltip=tooltip).add_to(marker_cluster)
crime_map.save('map.html')
crime_map

# 4. Machine Learning

In [None]:
# One row per year: column 'occur_year' plus the incident count, which
# reset_index() leaves under the integer column label 0.
data = df.groupby('occur_year').size().reset_index()
X = data[['occur_year']].values  # 2-D feature matrix holding the year
y = data[0].values               # 1-D target holding the yearly count
# Hold out 20% of the years for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
reg = LinearRegression()
reg.fit(X_train, y_train)

In [None]:
# Show the fitted intercept and slope, one per line.
print(reg.intercept_, reg.coef_, sep='\n')

In [None]:
# Predict the held-out years and show the predictions next to the true counts.
y_pred = reg.predict(X_test)
r = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
r

**Evaluate the Algorithm**

In [None]:
# Report MAE, MSE and RMSE of the predictions on the held-out years.
mean_abs_err = metrics.mean_absolute_error(y_test, y_pred)
mean_sq_err = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_abs_err)
print('Mean Squared Error:', mean_sq_err)
print('Root Mean Squared Error:', np.sqrt(mean_sq_err))

**Visualize the results.**

In [None]:
# Scatter the yearly counts, then overlay the regression's predictions for the
# training years in red on the same axes.
ax = data.plot(x='occur_year', y=0, style='o')
ax.plot(X_train, reg.predict(X_train), color="r")
ax.set_title('# of Incidents Over Years')
ax.set_xlabel('Years')
ax.set_ylabel('# of Incidents')
plt.show()