In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Load the cleaned train and test CSVs into DataFrames.
train_sf_df = pd.read_csv('data/train_time_address.csv')
test_sf_df = pd.read_csv('data/test_time_address.csv')
# Cell output: the (rows, columns) shape of each frame.
train_sf_df.shape, test_sf_df.shape

In [None]:
# Preview the first rows of the loaded training frame.
train_sf_df.head()

### One-Hot-Encoding

Extract time-based features via one-hot encoding (OHE).

In [None]:
def split_categories_numericals(df):
    """Split the column names of ``df`` into non-numeric and numeric groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(cate_cols, num_cols)``. Both lists follow the column order of
        ``df``, so the result is the same on every run.
    """
    # NOTE(review): `_get_numeric_data` is a private pandas helper; it is kept
    # so the set of columns treated as numeric stays exactly what it was.
    num_cols = list(df._get_numeric_data().columns)
    numeric_names = set(num_cols)

    # Filter in frame order instead of taking a set difference: set iteration
    # order is arbitrary, which made the order of the categorical columns (and
    # of everything encoded from them) vary between runs.
    # dict.fromkeys drops repeated names while keeping first-seen order.
    cate_cols = list(dict.fromkeys(
        name for name in df.columns if name not in numeric_names
    ))
    return cate_cols, num_cols

In [None]:
# Default list of non-numeric columns that encode_multiple_columns leaves out
# of the one-hot encoding.
ignore_columns = ['category', 'time', 'address', 'date']

def extract_feature_dummies(df, column):
    """Return the indicator columns pandas derives from ``df[column]``.

    The result has one column per distinct value of the selected column,
    as produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(df[column])

def encode_multiple_columns(df, ignore_columns=ignore_columns):
    """One-hot encode the non-numeric columns and join them to the numeric ones.

    The numeric columns of ``df`` come first, followed by the dummy columns of
    every non-numeric column that is not listed in ``ignore_columns``. All
    pieces are concatenated side by side (``axis=1``).
    """
    cate_cols, num_cols = split_categories_numericals(df=df)

    dummy_frames = [
        extract_feature_dummies(df=df, column=name)
        for name in cate_cols
        if name not in ignore_columns
    ]

    return pd.concat([df[num_cols]] + dummy_frames, axis=1)

### Extracting Spatial Distance Features

In [None]:
# Reference locations for the spatial distance features, keyed by name.
# Each value is a [latitude, longitude] pair, the same order in which row
# coordinates are zipped in extract_spatial_distance_feature.
# Note: "sfpd", "kma438 sfpd" and "police commission" share identical
# coordinates, so their distance columns will be identical.
# NOTE(review): the first group presumably lists SFPD stations / police
# facilities - verify the coordinates against an authoritative source.
sf_pstations_tourists = {
    "sfpd"                : [37.7725, -122.3894],
    "ingleside"           : [37.7247, -122.4463],
    "central"             : [37.7986, -122.4101],
    "northern"            : [37.7802, -122.4324],
    "mission"             : [37.7628, -122.4220],
    "tenderloin"          : [37.7838, -122.4129],
    "taraval"             : [37.7437, -122.4815],
    "sfpd park"           : [37.7678, -122.4552],
    "bayview"             : [37.7298, -122.3977],
    "kma438 sfpd"         : [37.7725, -122.3894],
    "richmond"            : [37.7801, -122.4644],
    "police commission"   : [37.7725, -122.3894],
    "juvenile"            : [37.7632, -122.4220],
    "southern"            : [37.6556, -122.4366],
    "sfpd pistol range"   : [37.7200, -122.4996],
    "sfpd public affairs" : [37.7754, -122.4039],
    "broadmoor"           : [37.6927, -122.4748],
    # Second group - presumably tourist destinations (TODO confirm).
    "napa wine country"      : [38.2975, -122.2869],
    "sonoma wine country"    : [38.2919, -122.4580],
    "muir woods"             : [37.8970, -122.5811],
    "golden gate"            : [37.8199, -122.4783],
    "yosemite national park" : [37.865101, -119.538330],
}

In [None]:
from mpu import haversine_distance

In [None]:
def get_distance(ij):
    """Return the haversine distance between the two coordinates in ``ij``.

    ``ij[0]`` is passed to ``haversine_distance`` as the origin and ``ij[1]``
    as the destination.
    """
    return haversine_distance(origin=ij[0], destination=ij[1])

def extract_spatial_distance_feature(df, lat_column, lon_column, pname, pcoords):
    """Build a one-column frame with the distance from ``pcoords`` to each row.

    Every row of ``df`` contributes the point
    ``(df[lat_column], df[lon_column])``. The returned frame has a single
    column named ``pname`` and a fresh default index (it does not reuse the
    index of ``df``).
    """
    row_points = zip(df[lat_column].to_list(), df[lon_column].to_list())

    # get_distance takes an (origin, destination) pair; pcoords is always the
    # origin and the row coordinate the destination.
    distances = [get_distance((pcoords, point)) for point in row_points]

    result = pd.DataFrame()
    result[pname] = distances
    return result

In [None]:
def extract_spatial_distance_multi_features(df, lat_column, lon_column, stations=sf_pstations_tourists):
    """Compute one distance column per reference location.

    ``stations`` maps a location name to its coordinates. For each entry a
    distance column named after the location is computed against every row of
    ``df``, and the columns are concatenated side by side (``axis=1``).
    """
    per_location = [
        extract_spatial_distance_feature(df, lat_column, lon_column, place, coords)
        for place, coords in stations.items()
    ]
    return pd.concat(per_location, axis=1)

### Extract Features only based on Latitudes and Longitudes

In [None]:
def lat_lon_sum(ll):
    """Return ``lat + lon`` for a ``(lat, lon)`` pair."""
    return ll[0] + ll[1]

def lat_lon_diff(ll):
    """Return ``lon - lat`` for a ``(lat, lon)`` pair (longitude minus latitude)."""
    return ll[1] - ll[0]

def lat_lon_sum_square(ll):
    """Return ``(lat + lon) ** 2`` for a ``(lat, lon)`` pair."""
    total = ll[0] + ll[1]
    return total ** 2

def lat_lon_diff_square(ll):
    """Return ``(lat - lon) ** 2`` for a ``(lat, lon)`` pair."""
    gap = ll[0] - ll[1]
    return gap ** 2

def lat_lon_sum_sqrt(ll):
    """Return ``sqrt(lat**2 + lon**2)`` for a ``(lat, lon)`` pair.

    This is the square root of the sum of the squared components, not the
    square root of ``lat + lon``.
    """
    squared_total = ll[0] ** 2 + ll[1] ** 2
    return squared_total ** 0.5

def lat_lon_diff_sqrt(ll):
    """Return ``sqrt(lon**2 - lat**2)`` for a ``(lat, lon)`` pair.

    The squared latitude is subtracted from the squared longitude. With plain
    Python numbers a negative difference (``|lat| > |lon|``) yields a complex
    value rather than raising.
    """
    squared_gap = ll[1] ** 2 - ll[0] ** 2
    return squared_gap ** 0.5

In [None]:
def features_by_lat_lon(df, lat_column, lon_column):
    """Derive six arithmetic features from each row's (lat, lon) pair.

    The returned frame has the columns ``lat_lon_sum``, ``lat_lon_diff``,
    ``lat_lon_sum_square``, ``lat_lon_diff_square``, ``lat_lon_sum_sqrt`` and
    ``lat_lon_diff_sqrt`` (in that order), each computed by the helper of the
    same name. It carries a fresh default index.
    """
    pairs = list(zip(df[lat_column].to_list(), df[lon_column].to_list()))

    # (output column, helper) in the order the columns must appear.
    transforms = (
        ('lat_lon_sum', lat_lon_sum),
        ('lat_lon_diff', lat_lon_diff),
        ('lat_lon_sum_square', lat_lon_sum_square),
        ('lat_lon_diff_square', lat_lon_diff_square),
        ('lat_lon_sum_sqrt', lat_lon_sum_sqrt),
        ('lat_lon_diff_sqrt', lat_lon_diff_sqrt),
    )

    df_ll = pd.DataFrame()
    for column_name, helper in transforms:
        df_ll[column_name] = [helper(pair) for pair in pairs]

    return df_ll

### BoW representation for Address

In [None]:
# Module-level cache of the BoW feature indices chosen by SelectKBest.
# create_bow_vectorizer fills it on the first call that performs selection and
# reuses it on later calls; it starts out empty.
best_bow_columns = np.array([])

In [None]:
import pickle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)

In [None]:
def create_bow_vectorizer(df, column, target='category', write_vect=True, kbest=20):
    """Bag-of-words features for a text column, reduced to the k best terms.

    A ``CountVectorizer`` is fitted on ``df[column]`` unless a pickle named
    ``models/vect_bow_<column>.pkl`` already exists, in which case that pickle
    is loaded instead. The k-best selection is likewise fitted only once and
    cached in the module-level ``best_bow_columns``. Call this on the training
    data first so that nothing is fitted on test data (data leakage).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` (and ``target`` when selection is fitted).
    column : str
        Name of the text column to vectorize.
    target : str, default 'category'
        Label column used to fit ``SelectKBest``; only read when no selection
        has been cached yet.
    write_vect : bool, default True
        Whether a newly fitted vectorizer is pickled to ``models/``.
    kbest : int, default 20
        Number of features to keep. A falsy value disables selection.

    Returns
    -------
    pandas.DataFrame or None
        Dense frame of the selected count features whose column labels are the
        selected feature indices; ``None`` when ``kbest`` is falsy.
    """
    global best_bow_columns

    model_name = 'vect_bow_{}.pkl'.format(column)
    model_path = 'models/' + model_name
    print(model_name)
    df_col_val = df[column]

    if not os.path.isfile(model_path):
        vect = CountVectorizer()
        vect.fit(raw_documents=df_col_val)
        if write_vect:
            # The target directory may not exist on a fresh checkout.
            os.makedirs('models', exist_ok=True)
            with open(model_path, "wb") as model_file:
                pickle.dump(vect, model_file)
    else:
        print("Model already exists in the directory.")
        # NOTE(review): unpickling executes arbitrary code - only load model
        # files that this project wrote itself.
        with open(model_path, "rb") as model_file:
            vect = pickle.load(model_file)

    df_col_features = vect.transform(raw_documents=df_col_val)

    if not kbest:
        # No selection requested: the function has never produced a frame in
        # this case, so the (explicit) result stays None.
        return None

    # `.size` rather than `.any()`: a cached selection consisting only of
    # feature index 0 is non-empty but contains no truthy element.
    if best_bow_columns.size:
        selected = df_col_features[:, best_bow_columns]
        return pd.DataFrame(selected.toarray(), columns=best_bow_columns)

    fs = SelectKBest(k=kbest)
    fs.fit(df_col_features, df[target])
    df_col_features = fs.transform(df_col_features)
    best_bow_columns = fs.get_support(indices=True)
    return pd.DataFrame(df_col_features.toarray(), columns=best_bow_columns)

### TfIdf representation for Address

In [None]:
# Module-level cache of the TF-IDF feature indices chosen by SelectKBest.
# create_tfidf_vectorizer fills it on the first call that performs selection
# and reuses it on later calls; it starts out empty.
best_tfidf_cols = np.array([])

In [None]:
def create_tfidf_vectorizer(df, column, target='category', write_vect=True, kbest=20):
    """TF-IDF features for a text column, reduced to the k best terms.

    A ``TfidfVectorizer`` is fitted on ``df[column]`` unless a pickle named
    ``models/vect_tfidf_<column>.pkl`` already exists, in which case that
    pickle is loaded instead. The k-best selection is likewise fitted only once
    and cached in the module-level ``best_tfidf_cols``. Call this on the
    training data first so that nothing is fitted on test data (data leakage).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` (and ``target`` when selection is fitted).
    column : str
        Name of the text column to vectorize.
    target : str, default 'category'
        Label column used to fit ``SelectKBest``; only read when no selection
        has been cached yet.
    write_vect : bool, default True
        Whether a newly fitted vectorizer is pickled to ``models/``.
    kbest : int, default 20
        Number of features to keep. A falsy value disables selection.

    Returns
    -------
    pandas.DataFrame or None
        Dense frame of the selected TF-IDF features whose column labels are
        the selected feature indices; ``None`` when ``kbest`` is falsy.
    """
    global best_tfidf_cols

    model_name = 'vect_tfidf_{}.pkl'.format(column)
    model_path = 'models/' + model_name
    print(model_name)
    df_col_val = df[column]

    if not os.path.isfile(model_path):
        vect = TfidfVectorizer()
        vect.fit(raw_documents=df_col_val)
        if write_vect:
            # The target directory may not exist on a fresh checkout.
            os.makedirs('models', exist_ok=True)
            with open(model_path, "wb") as model_file:
                pickle.dump(vect, model_file)
    else:
        print("Model already exists in the directory.")
        # NOTE(review): unpickling executes arbitrary code - only load model
        # files that this project wrote itself.
        with open(model_path, "rb") as model_file:
            vect = pickle.load(model_file)

    df_col_features = vect.transform(raw_documents=df_col_val)

    if not kbest:
        # No selection requested: the function has never produced a frame in
        # this case, so the (explicit) result stays None.
        return None

    # `.size` rather than `.any()`: a cached selection consisting only of
    # feature index 0 is non-empty but contains no truthy element.
    if best_tfidf_cols.size:
        selected = df_col_features[:, best_tfidf_cols]
        return pd.DataFrame(selected.toarray(), columns=best_tfidf_cols)

    fs = SelectKBest(k=kbest)
    fs.fit(df_col_features, df[target])
    df_col_features = fs.transform(df_col_features)
    best_tfidf_cols = fs.get_support(indices=True)
    return pd.DataFrame(df_col_features.toarray(), columns=best_tfidf_cols)

### Combining the data

* OHE data
* Spatial distance features
* Spatial latitude and longitude features
* Address BoW
* Address TfIdf

In [None]:
# temporal based features have already been written for both train and test datasets

def write_data_features(df, path, write_to_file=True):
    """Build every feature group for ``df`` and optionally save them as a CSV.

    The groups are, in column order: one-hot encoded data, spatial distance
    features, latitude/longitude features, address BoW and address TF-IDF.
    They are concatenated side by side and, when ``write_to_file`` is true,
    written to ``path`` without the index. Always returns ``True``.
    """
    feature_groups = [
        encode_multiple_columns(df=df),
        extract_spatial_distance_multi_features(df=df, lat_column='latitude', lon_column='longitude'),
        features_by_lat_lon(df=df, lat_column='latitude', lon_column='longitude'),
        create_bow_vectorizer(df=df, column='address'),
        create_tfidf_vectorizer(df=df, column='address'),
    ]
    sf_df_featurized = pd.concat(feature_groups, axis=1)

    if not write_to_file:
        return True

    sf_df_featurized.to_csv(path_or_buf=path, index=None)
    return True

In [None]:
# The two feature files are produced together. If either one is missing both
# are regenerated; otherwise a single missing file would be reported as
# "already exists" and the following read_csv would fail.
features_cached = (
    os.path.isfile('data/train_data_features.csv') and
    os.path.isfile('data/test_data_features.csv')
)

if not features_cached:
    # Training goes first: the vectorizers and the k-best selections are
    # fitted on this call and reused for the test data.
    print("Train data")
    write_data_features(df=train_sf_df, path='data/train_data_features.csv')
    print('-' * 30)
    # Test
    print("Test data")
    write_data_features(df=test_sf_df, path='data/test_data_features.csv')
    print('-' * 30)

else:
    print("Data already exists in the directory.")

In [None]:
# Reload the engineered features. From here on both names refer to the
# feature matrices rather than the cleaned input tables.
train_sf_df = pd.read_csv('data/train_data_features.csv')
test_sf_df = pd.read_csv('data/test_data_features.csv')

In [None]:
# Remove the 'Bufano' and 'Ferlinghetti' columns from the training features.
# NOTE(review): presumably these columns have no counterpart in the test
# features - confirm.
# The drop always runs, so the in-memory frame matches the "fix" file whether
# or not that file is already cached. errors='ignore' keeps the cell safe to
# re-run after the columns are gone.
train_sf_df = train_sf_df.drop(columns=['Bufano', 'Ferlinghetti'], errors='ignore')

if not os.path.isfile('data/train_data_features_fix.csv'):
    # Training Fix
    print("Train data fix")
    train_sf_df.to_csv(path_or_buf='data/train_data_features_fix.csv', index=None)

else:
    print("Data already exists in the directory.")

In [None]:
# Preview the first rows of the training feature frame.
train_sf_df.head()

In [None]:
# Preview the first rows of the test feature frame.
test_sf_df.head()