# San Francisco Crime data

The data can be downloaded from Kaggle: https://www.kaggle.com/c/sf-crime/data

### Import packages etc

In [1]:
%matplotlib inline
import pandas as pd
from pandas import Series,DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

### Initial preview of data:
For now, we only work with the training data (`train.csv`).

In [2]:
# Read the training CSV and discard the address, resolution and
# description columns in one chained expression, then preview the result.
crime_df = (
    pd.read_csv("/Users/KRich/GitHub/data/SF crime/train.csv")
    .drop(['Address', 'Resolution', 'Descript'], axis=1)
)
crime_df.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,-122.438738,37.771541


In [3]:
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 6 columns):
Dates         878049 non-null object
Category      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(4)
memory usage: 40.2+ MB


## Prediction:

In [4]:
def data_load(path="/Users/KRich/GitHub/data/SF crime/train.csv"):
    """Read the SF crime training CSV and drop the columns not used for prediction.

    Parameters
    ----------
    path : str, optional
        Location of the training CSV. The default is the path that used to be
        hard-coded here, so calling ``data_load()`` with no argument behaves
        as before; pass another path to run the notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the 'Address', 'Resolution' and 'Descript'
        columns.
    """
    # IMPORT DATA --------------------------
    crime_df = pd.read_csv(path)
    # drop address, Resolution, and Descript (not useful for prediction)
    crime_df = crime_df.drop(['Address', 'Resolution', 'Descript'], axis=1)
    return crime_df
def data_split(crime_df, frac=0.8, random_state=1):
    """Split the crime frame into training features, training labels and test features.

    Parameters
    ----------
    crime_df : pandas.DataFrame
        Frame containing a 'Category' column (the label) plus feature columns.
    frac : float, optional
        Fraction of rows sampled into the training set (default 0.8, the
        value previously hard-coded).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the split is repeatable
        (default 1, the value previously hard-coded).

    Returns
    -------
    tuple
        ``(Xtrain, Ytrain, Xtest)``: the sampled rows without 'Category',
        the 'Category' values of those rows, and every remaining row
        without 'Category'. Test labels are not returned.
    """
    # SPLIT DATA:---------------------------
    # train data:
    train_rows = crime_df.sample(frac=frac, random_state=random_state)
    Ytrain = train_rows["Category"]
    Xtrain = train_rows.drop("Category", axis=1)
    # test data (anything whose index label is not in train):
    in_train = crime_df.index.isin(train_rows.index)
    Xtest = crime_df.loc[~in_train].drop("Category", axis=1)
    return Xtrain, Ytrain, Xtest
#-------------------------------
def data_categorize(Xdata):
    """One-hot encode the timestamp, district and weekday columns of a feature frame.

    The 'Dates' column is parsed and expanded into ``hour_*``, ``month_*``
    and ``year_*`` indicator columns; 'PdDistrict' and 'DayOfWeek' are
    expanded into one indicator column per distinct value (unprefixed).
    The three source columns are removed, every other column is kept as is.

    The ``year_2015`` indicator column is removed when present. Note that
    this only removes the indicator: rows dated 2015 stay in the frame and
    simply have all remaining ``year_*`` indicators unset.

    Parameters
    ----------
    Xdata : pandas.DataFrame
        Frame with 'Dates', 'PdDistrict' and 'DayOfWeek' columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the indicator columns appended. The set of columns
        depends on the values present in ``Xdata``, so two frames encoded
        separately may end up with different columns.
    """
    X = Xdata
    # Parse the timestamps once for the whole column instead of once per row
    # for each of hour, month and year.
    dates = pd.to_datetime(X["Dates"])
    # date/time categorical features
    hours  = pd.get_dummies(dates.dt.hour, prefix="hour")
    months = pd.get_dummies(dates.dt.month, prefix="month")
    years  = pd.get_dummies(dates.dt.year, prefix="year")
    # district categorical features
    district = pd.get_dummies(X["PdDistrict"])
    # day of the week categorical features
    day_of_week = pd.get_dummies(X["DayOfWeek"])
    # remove the raw columns that have just been encoded
    X = X.drop(["PdDistrict", "DayOfWeek", "Dates"], axis=1)
    # string together
    X = pd.concat([X, hours, months, years, district, day_of_week], axis=1)
    # drop the 2015 indicator (see: _Viz_SanFranCrime; 2015 not complete).
    # errors='ignore' keeps this from raising when the frame has no 2015 rows.
    X = X.drop('year_2015', axis=1, errors='ignore')
    return X
#-------------------------------
def rand_forest(Xtrain, Ytrain, Xtest, Ytest=None, n_estimators=10, random_state=None):
    """Fit a random forest classifier and report its accuracy.

    Parameters
    ----------
    Xtrain, Ytrain
        Training features and labels used to fit the model.
    Xtest
        Held-out features. Only used when ``Ytest`` is supplied.
    Ytest : optional
        Labels for ``Xtest``. When given, the accuracy on the held-out rows
        is printed as well; the training score alone says little about how
        well the model generalizes.
    n_estimators : int, optional
        Number of trees (default 10, the value previously hard-coded).
    random_state : int or None, optional
        Seed for the forest. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an int for repeatable results.

    Returns
    -------
    float
        Accuracy of the fitted model on the *training* data, as before.
    """
    # model
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    # fit
    clf.fit(Xtrain, Ytrain)
    # score on the data the model was fitted on
    rf_score = clf.score(Xtrain, Ytrain)
    print('Random forest score:' + str(rf_score))
    # optional held-out score
    if Ytest is not None:
        print('Random forest test score:' + str(clf.score(Xtest, Ytest)))
    return rf_score
#-------------------------------
def log_regress(Xtrain, Ytrain, Xtest, Ytest=None):
    """Fit a logistic regression classifier and report its accuracy.

    Parameters
    ----------
    Xtrain, Ytrain
        Training features and labels used to fit the model.
    Xtest
        Held-out features. Only used when ``Ytest`` is supplied.
    Ytest : optional
        Labels for ``Xtest``. When given, the accuracy on the held-out rows
        is printed in addition to the training accuracy.

    Returns
    -------
    float
        Accuracy of the fitted model on the *training* data. (Previously
        nothing was returned; the score is now returned to match
        ``rand_forest``.)
    """
    # model
    LogRegModel = LogisticRegression()
    # fit
    LogRegModel.fit(Xtrain, Ytrain)
    # score on the data the model was fitted on
    log_score = LogRegModel.score(Xtrain, Ytrain)
    print('Logistic regression score:' + str(log_score))
    # optional held-out score
    if Ytest is not None:
        print('Logistic regression test score:' + str(LogRegModel.score(Xtest, Ytest)))
    return log_score
#-------------------------------
def main():
    """Run the whole pipeline: load, split, encode, then fit and score both models."""
    # load data
    df = data_load()
    # split data train/test
    Xtrain, Ytrain, Xtest = data_split(df)
    # process vars
    Xtrain = data_categorize(Xtrain)
    Xtest  = data_categorize(Xtest)
    # The two frames are encoded independently, so a value that occurs in only
    # one of them would leave the frames with different columns. Force the
    # test frame onto the training columns (missing indicators become 0,
    # indicators unknown to the model are discarded).
    Xtest = Xtest.reindex(columns=Xtrain.columns, fill_value=0)
    # random forests, predict and score
    rand_forest(Xtrain, Ytrain, Xtest)
    # logistic regression
    log_regress(Xtrain, Ytrain, Xtest)
#-------------------------------
              
# Entry point: run the full pipeline when this code is executed directly
# (the scores printed below this cell come from this call).
if __name__ == '__main__':
    main()



Random forest score:0.857532113109
Logistic regression score:0.228404174597
