In [None]:
import os
import pandas as pd
import plotly.express as px

In [None]:
# Read the raw training and test splits from the local data directory.
train_sf_df = pd.read_csv('data/train.csv')
test_sf_df = pd.read_csv('data/test.csv')
# Display both shapes as the cell output.
(train_sf_df.shape, test_sf_df.shape)

In [None]:
# Preview the first rows of the training frame.
train_sf_df.head()

In [None]:
# Preview the first rows of the test frame.
test_sf_df.head()

In [None]:
# New column labels, applied by position: each list must have one entry per
# column, in the order the columns appear in the loaded frame.
train_cols_renamed = [
    'time',
    'category',
    'description',
    'weekday',
    'police_dept',
    'resolution',
    'address',
    'longitude',
    'latitude',
]
test_cols_renamed = [
    'id',
    'time',
    'weekday',
    'police_dept',
    'address',
    'longitude',
    'latitude',
]

train_sf_df.columns = train_cols_renamed
test_sf_df.columns = test_cols_renamed

In [None]:
# Remove description and resolution because they're not needed for the training.
# The frame is reassigned (not modified in place) and missing labels are ignored,
# so re-running this cell after the columns are already gone does not raise KeyError.
train_sf_df = train_sf_df.drop(columns=['description', 'resolution'], errors='ignore')

In [None]:
# Preview the first rows of the training frame again.
train_sf_df.head()

In [None]:
def extract_date(time):
    """Return the part of a timestamp string that precedes the first space.

    For a ``'<date> <clock>'`` string this is the date portion; a string
    without a space is returned unchanged.
    """
    date_part, _sep, _rest = time.partition(' ')
    return date_part

def extract_year(date):
    """Return the first ``-``-separated field of ``date`` as an integer."""
    year_text = date.partition('-')[0]
    return int(year_text)

def extract_month(date):
    """Return the second ``-``-separated field of ``date`` as an integer."""
    fields = date.split('-')
    month_text = fields[1]
    return int(month_text)

def extract_day(date):
    """Return the third ``-``-separated field of ``date`` as an integer."""
    fields = date.split('-')
    day_text = fields[2]
    return int(day_text)

def extract_hour(time):
    """Return the hour of a ``'<date> <clock>'`` timestamp string as an integer.

    The string must contain exactly one space; the hour is the first
    ``:``-separated field of the clock part.
    """
    _date_part, clock = time.split(' ')
    hour_text = clock.split(':')[0]
    return int(hour_text)

def extract_minute(time):
    """Return the minute of a ``'<date> <clock>'`` timestamp string as an integer.

    The string must contain exactly one space; the minute is the second
    ``:``-separated field of the clock part.
    """
    _date_part, clock = time.split(' ')
    clock_fields = clock.split(':')
    return int(clock_fields[1])

def extract_season(month):
    """Map a month number to a season label.

    Months 4-6 give ``'summer'``, 7-9 give ``'rainy'`` and 10-12 give
    ``'winter'``. Every other value (including 1-3) gives ``'spring'``.
    """
    season_by_months = (
        ((4, 5, 6), 'summer'),
        ((7, 8, 9), 'rainy'),
        ((10, 11, 12), 'winter'),
    )
    for months, season in season_by_months:
        if month in months:
            return season
    # Fallback for months 1-3 and for any value not listed above.
    return 'spring'

def extract_hour_type(hour):
    """Bucket an hour into a coarse part-of-day label.

    ``[4, 12)`` -> ``'morning'``, ``[12, 15)`` -> ``'noon'``,
    ``[15, 18)`` -> ``'evening'``, ``[18, 22)`` -> ``'night'``;
    anything else -> ``'mid-night'``.
    """
    if 4 <= hour < 12:
        return 'morning'
    if 12 <= hour < 15:
        return 'noon'
    if 15 <= hour < 18:
        return 'evening'
    if 18 <= hour < 22:
        return 'night'
    return 'mid-night'

def extract_time_period(hour):
    """Return ``'am'`` when ``hour`` equals one of 0..11, otherwise ``'pm'``."""
    return 'am' if hour in range(12) else 'pm'

In [None]:
def title_text(text):
    """Title-case ``text`` when it is a string; return any other value unchanged."""
    return text.title() if isinstance(text, str) else text

In [None]:
def extract_address_type(addr):
    """Classify an address string by its kind.

    Args:
        addr: Address text.

    Returns:
        ``'Cross'`` when the address contains ``' / '``; otherwise the last
        whitespace-separated token of the address. An empty or
        whitespace-only address yields ``''``.
    """
    if ' / ' in addr:
        return 'Cross'
    # Split on runs of whitespace so leading/trailing or repeated spaces do not
    # produce an empty last token.
    tokens = addr.split()
    return tokens[-1] if tokens else ''

In [None]:
def write_temporal_address_features(df, path):
    """Derive temporal and address features from ``df`` and write them to a CSV.

    The columns ``date``, ``year``, ``month``, ``day``, ``hour``, ``minute``,
    ``season``, ``hour_type``, ``time_period`` and ``address_type`` are added,
    every string cell is title-cased, and the result is written to ``path``
    without the index.

    Args:
        df: DataFrame with at least a ``time`` and an ``address`` column.
            It is not modified.
        path: Destination passed to ``DataFrame.to_csv``.

    Returns:
        True once the file has been written.
    """
    # Work on a copy so the caller's frame is not left half-transformed
    # (new columns added, but not title-cased).
    features = df.copy()

    features['date'] = features['time'].apply(func=extract_date)
    features['year'] = features['date'].apply(func=extract_year)
    features['month'] = features['date'].apply(func=extract_month)
    features['day'] = features['date'].apply(func=extract_day)
    features['hour'] = features['time'].apply(func=extract_hour)
    features['minute'] = features['time'].apply(func=extract_minute)
    features['season'] = features['month'].apply(func=extract_season)
    features['hour_type'] = features['hour'].apply(func=extract_hour_type)
    features['time_period'] = features['hour'].apply(func=extract_time_period)
    features['address_type'] = features['address'].apply(func=extract_address_type)

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. Look the
    # method up on the class (not the instance, where attribute access can
    # resolve to a column) and fall back to applymap on older versions.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    features = elementwise(features, title_text)

    features.to_csv(path_or_buf=path, index=False)
    return True

In [None]:
# Build every engineered-feature CSV that is not on disk yet. Each output is
# checked on its own, so a missing file is regenerated even when the other one
# already exists.
feature_outputs = [
    (train_sf_df, 'data/train_time_address.csv'),  # Training
    (test_sf_df, 'data/test_time_address.csv'),    # Test
]
missing_outputs = [
    (frame, out_path)
    for frame, out_path in feature_outputs
    if not os.path.isfile(out_path)
]

if missing_outputs:
    for frame, out_path in missing_outputs:
        write_temporal_address_features(df=frame, path=out_path)
else:
    print("Data already exists in the directory.")

In [None]:
# Reload both frames from the engineered-feature CSVs on disk.
train_sf_df = pd.read_csv('data/train_time_address.csv')
test_sf_df = pd.read_csv('data/test_time_address.csv')

In [None]:
# Preview the first rows of the reloaded training frame.
train_sf_df.head()

In [None]:
# Preview the first rows of the reloaded test frame.
test_sf_df.head()

In [None]:
# Summary statistics for the two coordinate columns of the training frame.
train_sf_df[['latitude', 'longitude']].describe()

* latitude values mostly lie between about `37.7` and `38`
* however, the max value is `90`, which indicates that some entries are wrong
* the same applies to longitude

In [None]:
def plot_column_distribution(df, column):
    """Show a bar chart of how often each value of ``column`` occurs in ``df``.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column whose values are counted.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # One row per distinct value; the two resulting columns are labelled
    # explicitly so the plot does not rely on pandas' default names.
    counts = (
        df[column]
        .value_counts()
        .to_frame()
        .reset_index()
        .set_axis([column, 'count'], axis=1)
    )

    layout_options = {
        'autosize': True,
        'height': 600,
        'hovermode': 'closest',
        'showlegend': True,
        'margin': {'l': 10, 'r': 10, 't': 30, 'b': 0},
    }

    fig = px.bar(data_frame=counts, x=column, y='count')
    fig.update_layout(**layout_options)
    fig.show()

In [None]:
# Bar chart of the number of training records per `category` value.
plot_column_distribution(df=train_sf_df, column='category')

* `Larceny/Theft` is the most frequent crime `category` across all the years
* we also see that the data is not balanced across categories, which means that stratification is needed

In [None]:
# Bar chart of the number of training records per `address_type` value.
plot_column_distribution(df=train_sf_df, column='address_type')

* we see that most of the crimes occurred on `Streets` and `Crosses`

In [None]:
# Bar chart of the number of training records per `police_dept` value.
plot_column_distribution(df=train_sf_df, column='police_dept')

* the `Southern` police district is most affected

In [None]:
# Bar chart of the number of training records per `year` value.
plot_column_distribution(df=train_sf_df, column='year')

* the year `2015` was either a good year (fewer crimes) or its data is incomplete

In [None]:
# Bar chart of the number of training records per `month` value.
plot_column_distribution(df=train_sf_df, column='month')

* we see that the occurrence of crimes in the months from 6 to 9 is lower than the other months

In [None]:
# Bar chart of the number of training records per `weekday` value.
plot_column_distribution(df=train_sf_df, column='weekday')

* we see that `Friday` has the most crime occurrences
* while `Sunday` has the fewest
* which suggests that on Sundays people are at home or have free time

In [None]:
# Bar chart of the number of training records per `hour` value.
plot_column_distribution(df=train_sf_df, column='hour')

* most crimes happen in the evening
* fewer crimes happen during the morning hours

In [None]:
# Bar chart of the number of training records per `season` value.
plot_column_distribution(df=train_sf_df, column='season')