In [45]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV

In [46]:
# Load the SF crime training CSV; the path is relative to this notebook's directory.
sf_crime = pd.read_csv('../../assets/datasets/sf_crime_train.csv')

In [47]:
# Preview the first rows to check the column names and value formats.
sf_crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,5/13/15 23:53,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,5/13/15 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,5/13/15 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [48]:
# (rows, columns) of the frame as loaded.
sf_crime.shape

(18000, 9)

In [49]:
# Prep data: drop rows with missing values and convert the Dates column to datetime.
sf_crime = sf_crime.dropna()
sf_crime = sf_crime.assign(Dates=pd.to_datetime(sf_crime['Dates']))

# Wrap the parsed timestamps in a DatetimeIndex so their components can be read off.
sf_crime_dates = pd.DatetimeIndex(
    sf_crime['Dates'].values,
    dtype='datetime64[ns]',
    freq=None,
)

# Derive the calendar features used later as model predictors.
sf_crime = sf_crime.assign(
    hour=sf_crime_dates.hour,
    month=sf_crime_dates.month,
    year=sf_crime_dates.year,
)

In [50]:
# Re-check the shape after the prep cell added the hour, month and year columns.
sf_crime.shape

(18000, 12)

In [83]:
# sf_crime.head(10)

In [85]:
# Restrict the data to the three crime categories we want to classify.
subset = ['VEHICLE THEFT', 'BURGLARY', 'DRUG/NARCOTIC']

# Use exact membership rather than str.contains('|'.join(subset)): the joined
# string is interpreted as an unescaped regex and matches substrings, so it
# could pick up unintended categories or misbehave on special characters.
sf_crime_sub = sf_crime[sf_crime['Category'].isin(subset)]
sf_crime_sub.head(2)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,hour,month,year
6,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,INGLESIDE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,23,5,2015
7,2015-05-13 23:30:00,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,23,5,2015


In [92]:
#sf_sample = sf_crime_sub.sample(frac=0.50)

# Build the predictor matrix with patsy, treating hour, weekday and police
# district as categorical; the crime category is the target.
X = patsy.dmatrix(
    '~ C(hour) + C(DayOfWeek) + C(PdDistrict)',
    sf_crime_sub,
)
Y = sf_crime_sub['Category'].values

# Hold out 33% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    stratify=Y,
    random_state=77,
)



In [93]:
# Display the patsy design matrix to inspect its shape and column names.
X

DesignMatrix with shape (2195, 39)
  Columns:
    ['C(hour)[0]',
     'C(hour)[1]',
     'C(hour)[2]',
     'C(hour)[3]',
     'C(hour)[4]',
     'C(hour)[5]',
     'C(hour)[6]',
     'C(hour)[7]',
     'C(hour)[8]',
     'C(hour)[9]',
     'C(hour)[10]',
     'C(hour)[11]',
     'C(hour)[12]',
     'C(hour)[13]',
     'C(hour)[14]',
     'C(hour)[15]',
     'C(hour)[16]',
     'C(hour)[17]',
     'C(hour)[18]',
     'C(hour)[19]',
     'C(hour)[20]',
     'C(hour)[21]',
     'C(hour)[22]',
     'C(hour)[23]',
     'C(DayOfWeek)[T.Monday]',
     'C(DayOfWeek)[T.Saturday]',
     'C(DayOfWeek)[T.Sunday]',
     'C(DayOfWeek)[T.Thursday]',
     'C(DayOfWeek)[T.Tuesday]',
     'C(DayOfWeek)[T.Wednesday]',
     'C(PdDistrict)[T.CENTRAL]',
     'C(PdDistrict)[T.INGLESIDE]',
     'C(PdDistrict)[T.MISSION]',
     'C(PdDistrict)[T.NORTHERN]',
     'C(PdDistrict)[T.PARK]',
     'C(PdDistrict)[T.RICHMOND]',
     'C(PdDistrict)[T.SOUTHERN]',
     'C(PdDistrict)[T.TARAVAL]',
     'C(PdDistrict)[T.TE

In [94]:
# Fit a cross-validated logistic regression with lasso (L1) regularization.
#   cv=5  -> five folds
#   Cs=15 -> test a grid of 15 distinct values of C
# Remember: C is the inverse of the regularization strength.
# Note: this line instantiates the estimator class; fitting happens below.
logreg_cv = LogisticRegressionCV(
    Cs=15,
    cv=5,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
)
logreg_cv.fit(X_train, Y_train)

LogisticRegressionCV(Cs=15, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring='accuracy', solver='liblinear', tol=0.0001,
           verbose=0)

In [88]:
# Find the best C per class: pair each class label with the C that
# cross-validation selected for it (classes_ and C_ are read in the same order).
print('best C for class:')
best_C = dict(zip(logreg_cv.classes_, logreg_cv.C_))
print(best_C)

best C for class:
{'BURGLARY': 719.68567300115137, 'VEHICLE THEFT': 1.0, 'DRUG/NARCOTIC': 1.0}


In [80]:
# Fit two regular (non-CV) lasso logistic regressions on the training split.
# They differ only in C: logreg_1 uses the value selected for 'DRUG/NARCOTIC',
# logreg_2 the value selected for 'BURGLARY'.
logreg_1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=best_C['DRUG/NARCOTIC'],
)
logreg_2 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=best_C['BURGLARY'],
)

logreg_1.fit(X_train, Y_train)
logreg_2.fit(X_train, Y_train)
# Disabled alternative (not run):
# mdl = sm.MNLogit(y, x)
# mdl_fit = mdl.fit()
# mdl_fit.summary()


AttributeError: 'function' object has no attribute 'score'

In [81]:
# Build a labelled confusion matrix for each of the two models above.
# Each matrix is ordered by the model's classes_, which also label both axes.
# NOTE(review): predictions are made on the training split, so these matrices
# describe in-sample fit -- confirm whether X_test / Y_test was intended.
Y_1_pred = logreg_1.predict(X_train)
conmat_1 = pd.DataFrame(
    confusion_matrix(Y_train, Y_1_pred, labels=logreg_1.classes_),
    index=logreg_1.classes_,
    columns=logreg_1.classes_,
)

Y_2_pred = logreg_2.predict(X_train)
conmat_2 = pd.DataFrame(
    confusion_matrix(Y_train, Y_2_pred, labels=logreg_2.classes_),
    index=logreg_2.classes_,
    columns=logreg_2.classes_,
)


In [None]:
# print classification reports


In [None]:
# run gridsearch using GridSearchCV and 5 folds
# score on f1_macro; what does this metric tell us?

# Name the solver explicitly: the penalty grid below includes 'l1', which
# liblinear supports, and this matches the solver used by the other models in
# this notebook instead of depending on the library's default.
logreg = LogisticRegression(solver='liblinear')

# Candidate values of C (inverse regularization strength) and penalty types to search.
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1', 'l2']


In [None]:
# find the best parameter


In [None]:
# use this parameter to .fit, .predict, and print a classification_report for our X and Y
