In [23]:
#Import dependencies
import pandas as pd
import sqlite3
import argparse
import json
import pprint
import requests
import sys
import urllib
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.parse import urlencode

In [24]:
#Assign database file to a variable
db_file = "Data/crimedata.db"

#Connect to database file
conn = sqlite3.connect(db_file)



In [25]:
#Query the la crime data table and put into a pandas dataframe
crime_df = pd.read_sql_query("select * from la_crime_1718;", conn)
crime_df.shape

(174766, 26)

In [26]:
#There are too many crime types to put through a model, so let's focus on only those
#crimes where there were 6,000 or more committed.  I chose 6,000 because that gives 
#us the top 11.

crime_types = pd.read_sql_query("select count(*) as cnt, CrimeCode, CrimeCodeDescription from la_crime_1718 group by CrimeCode, CrimeCodeDescription having cnt > 6000 order by cnt desc;", conn)
crime_types

Unnamed: 0,cnt,CrimeCode,CrimeCodeDescription
0,16820,624,BATTERY - SIMPLE ASSAULT
1,14673,330,BURGLARY FROM VEHICLE
2,13046,440,THEFT PLAIN - PETTY ($950 & UNDER)
3,11412,510,VEHICLE - STOLEN
4,10286,310,BURGLARY
5,8903,626,INTIMATE PARTNER - SIMPLE ASSAULT
6,8644,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT"
7,7344,210,ROBBERY
8,6942,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)
9,6670,354,THEFT OF IDENTITY


In [34]:
#In order to train a model, it would make sense to select crimes that greatly
#vary from one another.  Using both "Burglary From Vehicle" and "Burglary"
#would probably not produce good results.  I will only keep crime types that
#I want to focus on for this exercise.

#624 = Battery-Simple Assault
#440 = Theft Plain - Petty ($950 & Under)
#510 = Vehicle - Stolen
#310 = Burglary
#740 = Vandalism-Felony

new_crime_df = pd.read_sql_query("select * from la_crime_1718 where CrimeCode in ('624', '440', '510', '310', '740');", conn)
new_crime_df.shape

(61641, 26)

In [35]:
#Query the la weather history data table and put into a pandas dataframe
weather_df = pd.read_sql_query("select * from LA_Hourly_Weather_2016_to_2018;", conn)
weather_df.head()

Unnamed: 0,dt_iso,hour,farenheit,pressure,humidity,wind_speed,weather_code,weather_main
0,1/1/2016,1,58.982,1018,22,5,800,Clear
1,1/1/2016,3,52.016,1018,21,1,800,Clear
2,1/1/2016,5,50.414,1019,34,2,800,Clear
3,1/1/2016,6,47.93,1019,32,1,800,Clear
4,1/1/2016,7,45.662,1019,32,1,721,Haze


In [36]:
#Merge the weather history with the crime data

#First, strip the first two digits of the time occured column so that we can match it up with the weather

new_crime_df['hour'] = new_crime_df['TimeOccurred'].str.slice(0,2)
new_crime_df.head()


Unnamed: 0,DRNumber,DateReported,DateOccurred,TimeOccurred,AreaID,AreaName,ReportingDistrict,CrimeCode,CrimeCodeDescription,MOCodes,...,StatusCode,StatusDescription,CrimeCode1,CrimeCode2,CrimeCode3,CrimeCode4,Address,CrossStreet,Location,hour
0,170209449,4/22/2017,4/21/2017,1930,2,Rampart,201,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510,,,,ALEXANDRIA,ROMAINE,"(34.0886, -118.2979)",19
1,170206243,2/14/2017,2/11/2017,1700,2,Rampart,275,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510,,,,OLYMPIC,WESTLAKE,"(34.0512, -118.2787)",17
2,170311599,4/25/2017,4/25/2017,745,3,Southwest,325,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510,,,,VERMONT,ADAMS,"(34.0328, -118.2915)",7
3,170408264,4/7/2017,4/7/2017,1,4,Hollenbeck,421,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510,,,,ALHAMBRA,VALLEY,"(34.0676, -118.2202)",0
4,170508584,4/10/2017,4/8/2017,730,5,Harbor,566,510,VEHICLE - STOLEN,,...,IC,Invest Cont,510,,,,10TH,CENTURY,"(33.7347, -118.2842)",7


In [37]:
new_df = pd.merge(new_crime_df, weather_df, left_on=['DateOccurred', 'hour'], right_on=['dt_iso', 'hour'])
new_df.shape

(56757, 34)

In [38]:
#Drop columns from the dataframe that will not be needed in our ML models

model_df = new_df.drop(['DRNumber', 'DateReported', 'DateOccurred', 'AreaName', 'CrimeCodeDescription', 'MOCodes',
                       'PremiseDescription', 'weather_main', 'WeaponUsedCode'],
                     axis=1)
model_df

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,VictimSex,VictimDescent,PremiseCode,WeaponDescription,StatusCode,...,Address,CrossStreet,Location,hour,dt_iso,farenheit,pressure,humidity,wind_speed,weather_code
0,1930,2,201,510,16,,,101,,IC,...,ALEXANDRIA,ROMAINE,"(34.0886, -118.2979)",19,4/21/2017,80.492,1013,24,1,800
1,1915,1,153,440,48,F,W,502,,IC,...,500 S SPRING ST,,"(34.0467, -118.252)",19,4/21/2017,80.492,1013,24,1,800
2,1925,7,713,740,29,F,W,122,,IC,...,7500 BEVERLY BL,,"(34.0782, -118.3573)",19,4/21/2017,80.492,1013,24,1,800
3,1920,7,785,740,61,M,B,122,,IC,...,2000 S CLAUDINA AV,,"(34.0385, -118.341)",19,4/21/2017,80.492,1013,24,1,800
4,1900,8,836,310,64,F,O,502,,IC,...,10300 LA GRANGE AV,,"(34.0573, -118.4206)",19,4/21/2017,80.492,1013,24,1,800
5,1900,8,851,740,25,F,O,122,,IC,...,1500 AMHERST AV,,"(34.0395, -118.4637)",19,4/21/2017,80.492,1013,24,1,800
6,1900,9,932,440,35,F,O,108,,IC,...,6100 SEPULVEDA BL,,"(34.1806, -118.4662)",19,4/21/2017,80.492,1013,24,1,800
7,1930,9,971,440,16,F,W,210,,IC,...,15300 VENTURA BL,,"(34.1564, -118.463)",19,4/21/2017,80.492,1013,24,1,800
8,1700,2,275,510,16,,,101,,IC,...,OLYMPIC,WESTLAKE,"(34.0512, -118.2787)",17,2/11/2017,57.632,1019,87,3,500
9,1740,2,281,740,47,F,H,101,,IC,...,1200 S HOOVER ST,,"(34.0492, -118.2842)",17,2/11/2017,57.632,1019,87,3,500


In [39]:
model_df = model_df.drop(['WeaponDescription', 'VictimSex', 'VictimDescent', 'StatusCode', 'StatusDescription',
                         'Address', 'CrossStreet', 'CrimeCode1', 'CrimeCode2', 'CrimeCode3', 'CrimeCode4','dt_iso'], axis=1)
model_df

Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code
0,1930,2,201,510,16,101,"(34.0886, -118.2979)",19,80.492,1013,24,1,800
1,1915,1,153,440,48,502,"(34.0467, -118.252)",19,80.492,1013,24,1,800
2,1925,7,713,740,29,122,"(34.0782, -118.3573)",19,80.492,1013,24,1,800
3,1920,7,785,740,61,122,"(34.0385, -118.341)",19,80.492,1013,24,1,800
4,1900,8,836,310,64,502,"(34.0573, -118.4206)",19,80.492,1013,24,1,800
5,1900,8,851,740,25,122,"(34.0395, -118.4637)",19,80.492,1013,24,1,800
6,1900,9,932,440,35,108,"(34.1806, -118.4662)",19,80.492,1013,24,1,800
7,1930,9,971,440,16,210,"(34.1564, -118.463)",19,80.492,1013,24,1,800
8,1700,2,275,510,16,101,"(34.0512, -118.2787)",17,57.632,1019,87,3,500
9,1740,2,281,740,47,101,"(34.0492, -118.2842)",17,57.632,1019,87,3,500


In [40]:
#COnvert the datatype of each column to numeric for machine learning
model_df['TimeOccurred'] = model_df['TimeOccurred'].apply(pd.to_numeric, errors='coerce')
model_df['AreaID'] = model_df['AreaID'].apply(pd.to_numeric, errors='coerce')
model_df['CrimeCode'] = model_df['CrimeCode'].apply(pd.to_numeric, errors='coerce')
model_df['ReportingDistrict'] = model_df['ReportingDistrict'].apply(pd.to_numeric, errors='coerce')
model_df['VictimAge'] = model_df['VictimAge'].apply(pd.to_numeric, errors='coerce')
model_df['PremiseCode'] = model_df['PremiseCode'].apply(pd.to_numeric, errors='coerce')
model_df['Location'] = model_df['Location'].apply(pd.to_numeric, errors='coerce')
model_df['hour'] = model_df['hour'].apply(pd.to_numeric, errors='coerce')
model_df['farenheit'] = model_df['farenheit'].apply(pd.to_numeric, errors='coerce')
model_df['pressure'] = model_df['pressure'].apply(pd.to_numeric, errors='coerce')
model_df['humidity'] = model_df['humidity'].apply(pd.to_numeric, errors='coerce')
model_df['wind_speed'] = model_df['wind_speed'].apply(pd.to_numeric, errors='coerce')
model_df['weather_code'] = model_df['weather_code'].apply(pd.to_numeric, errors='coerce')


print(model_df.dtypes)
model_df.head()

TimeOccurred           int64
AreaID                 int64
ReportingDistrict      int64
CrimeCode              int64
VictimAge            float64
PremiseCode            int64
Location             float64
hour                   int64
farenheit            float64
pressure               int64
humidity               int64
wind_speed             int64
weather_code           int64
dtype: object


Unnamed: 0,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code
0,1930,2,201,510,16.0,101,,19,80.492,1013,24,1,800
1,1915,1,153,440,48.0,502,,19,80.492,1013,24,1,800
2,1925,7,713,740,29.0,122,,19,80.492,1013,24,1,800
3,1920,7,785,740,61.0,122,,19,80.492,1013,24,1,800
4,1900,8,836,310,64.0,502,,19,80.492,1013,24,1,800


In [41]:
model_df = model_df.fillna(0)


In [42]:
model_df.reset_index()

Unnamed: 0,index,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode,Location,hour,farenheit,pressure,humidity,wind_speed,weather_code
0,0,1930,2,201,510,16.0,101,0.0,19,80.492,1013,24,1,800
1,1,1915,1,153,440,48.0,502,0.0,19,80.492,1013,24,1,800
2,2,1925,7,713,740,29.0,122,0.0,19,80.492,1013,24,1,800
3,3,1920,7,785,740,61.0,122,0.0,19,80.492,1013,24,1,800
4,4,1900,8,836,310,64.0,502,0.0,19,80.492,1013,24,1,800
5,5,1900,8,851,740,25.0,122,0.0,19,80.492,1013,24,1,800
6,6,1900,9,932,440,35.0,108,0.0,19,80.492,1013,24,1,800
7,7,1930,9,971,440,16.0,210,0.0,19,80.492,1013,24,1,800
8,8,1700,2,275,510,16.0,101,0.0,17,57.632,1019,87,3,500
9,9,1740,2,281,740,47.0,101,0.0,17,57.632,1019,87,3,500


In [43]:
# Assign X (data) and y (target) to put into model
X = model_df.drop("CrimeCode", axis=1)
y = model_df["CrimeCode"]
print(X.shape, y.shape)


(56757, 12) (56757,)


In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [45]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.4539666878098057
Testing Data Score: 0.4508104298801973


In [48]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [510 510 510 624 440 624 310 624 624 510]
First 10 Actual labels: [510, 510, 510, 440, 624, 624, 740, 440, 624, 510]


In [None]:
#Connect to yelp API to get businesses nearby

API_KEY= "txYI2uUI8mAqa2m2ELWwZKi6d_Q9g6ndaKkGtWvjECd6IYH6uJFDaHSe0oJtbrBdLyS5WKKf1o4BxwXsDkNx16O-0bY2YydRCHzB6yBdpoLX55Fu-xU4C6bM8D2bW3Yx"

API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/' 

# Defaults for our simple example.
DEFAULT_TERM = 'dinner'
DEFAULT_LOCATION = 'San Francisco, CA'
SEARCH_LIMIT = 3



In [None]:
def request(host, path, api_key, url_params=None):
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    print(u'Querying {0} ...'.format(url))

    response = requests.request('GET', url, headers=headers, params=url_params)

    return response.json()


In [None]:
def search(api_key, term, location):
    """Query the Search API by a search term and location.
    Args:
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
    Returns:
        dict: The JSON response from the request.
    """

    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': SEARCH_LIMIT
    }
    return request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)



In [None]:
def get_business(api_key, business_id):
    """Query the Business API by a business ID.
    Args:
        business_id (str): The ID of the business to query.
    Returns:
        dict: The JSON response from the request.
    """
    business_path = BUSINESS_PATH + business_id

    return request(API_HOST, business_path, api_key)


In [None]:
def query_api(term, location):
    """Queries the API by the input values from the user.
    Args:
        term (str): The search term to query.
        location (str): The location of the business to query.
    """
    response = search(API_KEY, term, location)

    businesses = response.get('businesses')

    if not businesses:
        print(u'No businesses for {0} in {1} found.'.format(term, location))
        return

    business_id = businesses[0]['id']

    print(u'{0} businesses found, querying business info ' \
        'for the top result "{1}" ...'.format(
            len(businesses), business_id))
    response = get_business(API_KEY, business_id)

    print(u'Result for business "{0}" found:'.format(business_id))
    pprint.pprint(response, indent=2)


In [None]:
def yelp_main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-q', '--term', dest='term', default=DEFAULT_TERM,
                        type=str, help='Search term (default: %(default)s)')
    parser.add_argument('-l', '--location', dest='location',
                        default=DEFAULT_LOCATION, type=str,
                        help='Search location (default: %(default)s)')

    input_values = parser.parse_args()

    try:
        query_api(input_values.term, input_values.location)
    except HTTPError as error:
        sys.exit(
            'Encountered HTTP error {0} on {1}:\n {2}\nAbort program.'.format(
                error.code,
                error.url,
                error.read(),
            )
        )
