In [4]:
import os
import pandas as pd
import plotly.express as px

In [6]:
# Load the raw train/test CSVs, then show both (rows, columns) shapes.
train_sf_df, test_sf_df = (
    pd.read_csv(filepath_or_buffer=csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
train_sf_df.shape, test_sf_df.shape

((878049, 9), (884262, 7))

In [7]:
# Peek at the first training row to see the raw column layout.
train_sf_df.head(n=1)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599


In [5]:
# Peek at the first test row to see the raw column layout.
test_sf_df.head(n=1)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051


In [6]:
# renaming columns of train and test data sets
# The raw -> new names are mapped explicitly instead of assigning a positional
# list: the result no longer depends on the column order of the CSV, and
# re-running the cell on an already renamed (or already trimmed) frame is a
# no-op instead of a "Length mismatch" error.
train_col_map = {
    'Dates': 'time',
    'Category': 'category',
    'Descript': 'description',
    'DayOfWeek': 'weekday',
    'PdDistrict': 'police_dept',
    'Resolution': 'resolution',
    'Address': 'address',
    'X': 'longitude',
    'Y': 'latitude',
}
# Kept as a plain list of the new names, in the raw column order.
train_cols_renamed = list(train_col_map.values())
train_sf_df = train_sf_df.rename(columns=train_col_map)
#
test_col_map = {
    'Id': 'id',
    'Dates': 'time',
    'DayOfWeek': 'weekday',
    'PdDistrict': 'police_dept',
    'Address': 'address',
    'X': 'longitude',
    'Y': 'latitude',
}
test_cols_renamed = list(test_col_map.values())
test_sf_df = test_sf_df.rename(columns=test_col_map)

In [7]:
# remove description and resolution because they're not needed for the training
# errors='ignore' keeps this cell re-runnable: once the columns are gone (or the
# frame has been reloaded from the cleaned CSV) a second run is a no-op instead
# of a KeyError. `columns=` already selects the axis, so `axis=1` is not needed.
train_sf_df = train_sf_df.drop(columns=['description', 'resolution'], errors='ignore')

In [8]:
# Confirm the renamed columns and that description/resolution are gone.
train_sf_df.head(n=1)

Unnamed: 0,time,category,weekday,police_dept,address,longitude,latitude
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599


In [9]:
def extract_date(time):
    """Return the date part of a timestamp string.

    The date is everything before the first space, e.g.
    '2015-05-13 23:53:00' -> '2015-05-13'. A string without a space is
    returned unchanged.
    """
    date_part, _, _ = time.partition(' ')
    return date_part


def extract_year(date):
    """Return the year (first dash-separated field) of a date string as an int.

    Example: '2015-05-13' -> 2015.
    """
    year_text, _, _ = date.partition('-')
    return int(year_text)


def extract_month(date):
    """Return the month (second dash-separated field) of a date string as an int.

    Example: '2015-05-13' -> 5.
    """
    fields = date.split('-')
    month_text = fields[1]
    return int(month_text)


def extract_day(date):
    """Return the day (third dash-separated field) of a date string as an int.

    Example: '2015-05-13' -> 13.
    """
    fields = date.split('-')
    day_text = fields[2]
    return int(day_text)


def extract_hour(time):
    """Return the hour of a '<date> <HH:MM:SS>' timestamp string as an int.

    The string must contain exactly one space separating date and clock
    time; otherwise the unpacking raises ValueError.
    """
    _, clock = time.split(' ')
    hour_text = clock.split(':')[0]
    return int(hour_text)


def extract_minute(time):
    """Return the minute of a '<date> <HH:MM:SS>' timestamp string as an int.

    The string must contain exactly one space separating date and clock
    time; otherwise the unpacking raises ValueError.
    """
    _, clock = time.split(' ')
    minute_text = clock.split(':')[1]
    return int(minute_text)


def extract_season(month):
    """Map a month number to this notebook's season label.

    4-6 -> 'summer', 7-9 -> 'rainy', 10-12 -> 'winter'; every other value
    (including 1-3) -> 'spring'.
    """
    season_months = (
        ('summer', (4, 5, 6)),
        ('rainy', (7, 8, 9)),
        ('winter', (10, 11, 12)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'spring'


def extract_hour_type(hour):
    """Map an hour to a part-of-day label.

    [4, 12) -> 'morning', [12, 15) -> 'noon', [15, 18) -> 'evening',
    [18, 22) -> 'night'; anything else -> 'mid-night'.
    """
    if 4 <= hour < 12:
        return 'morning'
    if 12 <= hour < 15:
        return 'noon'
    if 15 <= hour < 18:
        return 'evening'
    if 18 <= hour < 22:
        return 'night'
    return 'mid-night'


def extract_time_period(hour):
    """Return 'am' when the hour equals one of 0..11, otherwise 'pm'."""
    return 'am' if hour in range(12) else 'pm'

In [10]:
def title_text(text):
    """Title-case string values; return any non-string value unchanged."""
    return text.title() if isinstance(text, str) else text

In [11]:
def extract_address_type(addr):
    """Classify an address string.

    An address containing ' / ' is an intersection and yields 'Cross';
    otherwise the text after the last space is returned (e.g. 'AV', 'ST').
    """
    if ' / ' in addr:
        return 'Cross'
    return addr.rsplit(' ', 1)[-1]

In [1]:
def write_temporal_address_features(df, path):
    """Add temporal and address-type features to ``df`` and write it to CSV.

    ``df`` is modified in place: the feature columns are added to it and its
    string values are title-cased, so after the call the caller's frame holds
    the same values that were written to ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column ('<date> <HH:MM:SS>' strings) and an
        ``address`` column (strings).
    path : str
        Destination handed to ``DataFrame.to_csv``.

    Returns
    -------
    bool
        Always ``True``.
    """

    ### Adding temporal features
    df['date'] = df['time'].apply(func=extract_date)
    df['year'] = df['date'].apply(func=extract_year)
    df['month'] = df['date'].apply(func=extract_month)
    df['day'] = df['date'].apply(func=extract_day)
    df['hour'] = df['time'].apply(func=extract_hour)
    df['minute'] = df['time'].apply(func=extract_minute)
    df['season'] = df['month'].apply(func=extract_season)
    df['hour_type'] = df['hour'].apply(func=extract_hour_type)
    df['time_period'] = df['hour'].apply(func=extract_time_period)

    ### Adding address type
    df['address_type'] = df['address'].apply(func=extract_address_type)

    ### Text titling
    # Assign the titled values back column by column. The earlier
    # `df = df.applymap(...)` only rebound the local name, so the CSV was
    # title-cased while the caller's frame kept the raw casing.
    # Numeric columns cannot hold strings, so they are skipped.
    for col in list(df.columns):
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].map(title_text)

    ### Writing
    df.to_csv(path_or_buf=path, index=False)

    return True

In [13]:
# separate features and create new csv

train_cleaned_path = 'data/train_time_address_cleaned.csv'
test_cleaned_path = 'data/test_time_address_cleaned.csv'

if os.path.isfile(train_cleaned_path) and os.path.isfile(test_cleaned_path):
    print("Data already exists in the directory.")
else:
    # Regenerate when EITHER file is missing. The earlier `not A and not B`
    # test sent the "only one file exists" case to the loading branch, where
    # read_csv then failed on the missing file.
    # Training
    write_temporal_address_features(df=train_sf_df, path=train_cleaned_path)
    # Test
    write_temporal_address_features(df=test_sf_df, path=test_cleaned_path)

# Always continue from the files on disk so both frames hold the same values
# and dtypes whether the features were just built or were cached.
train_sf_df = pd.read_csv(filepath_or_buffer=train_cleaned_path)
test_sf_df = pd.read_csv(filepath_or_buffer=test_cleaned_path)


Data already exists in the directory.


In [14]:
# Inspect the engineered feature columns on the first training row.
train_sf_df.head(n=1)

Unnamed: 0,time,category,weekday,police_dept,address,longitude,latitude,date,year,month,day,hour,minute,season,hour_type,time_period,address_type
0,2015-05-13 23:53:00,Warrants,Wednesday,Northern,Oak St / Laguna St,-122.425892,37.774599,2015-05-13,2015,5,13,23,53,Summer,Mid-Night,Pm,Cross


In [35]:
# Inspect the engineered feature columns on the first test row.
test_sf_df.head(n=1)

Unnamed: 0,id,time,weekday,police_dept,address,longitude,latitude,date,year,month,day,hour,minute,season,hour_type,time_period,address_type
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,2015-05-10,2015,5,10,23,59,summer,mid-night,pm,AV


In [34]:
# Summary statistics of the coordinates, to spot out-of-range values.
coordinate_columns = ['latitude', 'longitude']
train_sf_df[coordinate_columns].describe()

Unnamed: 0,latitude,longitude
count,878049.0,878049.0
mean,37.77102,-122.422616
std,0.456893,0.030354
min,37.707879,-122.513642
25%,37.752427,-122.432952
50%,37.775421,-122.41642
75%,37.784369,-122.406959
max,90.0,-120.5


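* latitude is around `37.7` to `38`
* but the max value is `90`, which indicates that there are wrong entries
* the same holds for longitude: the max is `-120.5`, while the min and all quartiles lie between about `-122.51` and `-122.41`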

In [15]:
def plot_column_distribution(df, column):
    """Show a bar chart of how often each value of ``column`` occurs in ``df``.

    The chart is displayed with ``fig.show()``; the function returns None.
    """
    # One row per distinct value, with its number of occurrences.
    value_counts = df[column].value_counts()
    counts_df = value_counts.to_frame().reset_index()
    counts_df.columns = [column, 'count']

    layout_options = {
        'autosize': True,
        'height': 600,
        'hovermode': 'closest',
        'showlegend': True,
        'margin': {'l': 10, 'r': 10, 't': 30, 'b': 0},
    }

    fig = px.bar(data_frame=counts_df, x=column, y='count')
    fig.update_layout(**layout_options)
    fig.show()

In [16]:
# Number of training rows per crime category.
plot_column_distribution(train_sf_df, column='category')

* `Larceny/Theft` is the most frequent crime `category` across all the years
* we also see that the categories are not balanced, which means that stratification is needed

In [17]:
# Number of training rows per address type.
plot_column_distribution(train_sf_df, column='address_type')

* we see that most of the crimes occurred on `Streets` and `Crosses`

In [18]:
# Number of training rows per police district.
plot_column_distribution(train_sf_df, column='police_dept')

* the `Southern` police district is most affected

In [19]:
# Number of training rows per year.
plot_column_distribution(train_sf_df, column='year')

* the year `2015` is either a good year or it is not fully covered by the data

In [20]:
# Number of training rows per month.
plot_column_distribution(train_sf_df, column='month')

* we see that the occurrence of crimes in the months from 6 to 9 is lower than the other months

In [21]:
# Number of training rows per day of the week.
plot_column_distribution(train_sf_df, column='weekday')

* we see that `Friday` has the most crime occurrences
* while `Sunday` has the fewest
* which may be an indication that on Sundays people are at home or have free time

In [22]:
# Number of training rows per hour of the day.
plot_column_distribution(train_sf_df, column='hour')

* most crimes happen in the evenings
* in the morning hours there are fewer crimes

In [23]:
# Number of training rows per season label.
plot_column_distribution(train_sf_df, column='season')