In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings

# NOTE(review): called without a category or module filter, this hides every
# warning, including deprecation warnings from pandas and scikit-learn.
# Consider narrowing the filter to the specific warnings that are known noise.
warnings.filterwarnings('ignore')

In [25]:
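# TODO: evaluate the classifier with a per-class report; classification_report
# needs the true and predicted labels as arguments.
# from sklearn import metrics
# metrics.classification_report()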

In [None]:
# Load the training CSV; the 'Dates' column is parsed into datetimes on read.
TRAIN_CSV_PATH = "/data/datasets/ml/sf-crime/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH, parse_dates=['Dates'])

In [24]:
# Show the year component of every timestamp in the 'Dates' column.
train_data.loc[:, "Dates"].dt.year

0         2015
1         2015
2         2015
3         2015
4         2015
          ... 
878044    2003
878045    2003
878046    2003
878047    2003
878048    2003
Name: Dates, Length: 878049, dtype: int64

In [3]:
# Print the (rows, columns) tuple, then preview the first five records.
print(f"{train_data.shape}")
train_data.head(n=5)

(878049, 9)


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [33]:
# Distinct values of the 'Category' column (the prediction target).
categories = pd.unique(train_data['Category'])
categories

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING',
       'SEX OFFENSES NON FORCIBLE', 'EXTORTION', 'GAMBLING', 'BAD CHECKS',
       'TREA', 'RECOVERED VEHICLE', 'PORNOGRAPHY/OBSCENE MAT'],
      dtype=object)

In [34]:
# Map each category name to a 1-based integer code, following the order of
# `categories`.
category_dict = {name: code for code, name in enumerate(categories, start=1)}
category_dict

{'WARRANTS': 1,
 'OTHER OFFENSES': 2,
 'LARCENY/THEFT': 3,
 'VEHICLE THEFT': 4,
 'VANDALISM': 5,
 'NON-CRIMINAL': 6,
 'ROBBERY': 7,
 'ASSAULT': 8,
 'WEAPON LAWS': 9,
 'BURGLARY': 10,
 'SUSPICIOUS OCC': 11,
 'DRUNKENNESS': 12,
 'FORGERY/COUNTERFEITING': 13,
 'DRUG/NARCOTIC': 14,
 'STOLEN PROPERTY': 15,
 'SECONDARY CODES': 16,
 'TRESPASS': 17,
 'MISSING PERSON': 18,
 'FRAUD': 19,
 'KIDNAPPING': 20,
 'RUNAWAY': 21,
 'DRIVING UNDER THE INFLUENCE': 22,
 'SEX OFFENSES FORCIBLE': 23,
 'PROSTITUTION': 24,
 'DISORDERLY CONDUCT': 25,
 'ARSON': 26,
 'FAMILY OFFENSES': 27,
 'LIQUOR LAWS': 28,
 'BRIBERY': 29,
 'EMBEZZLEMENT': 30,
 'SUICIDE': 31,
 'LOITERING': 32,
 'SEX OFFENSES NON FORCIBLE': 33,
 'EXTORTION': 34,
 'GAMBLING': 35,
 'BAD CHECKS': 36,
 'TREA': 37,
 'RECOVERED VEHICLE': 38,
 'PORNOGRAPHY/OBSCENE MAT': 39}

In [32]:
# Distinct values of the 'PdDistrict' column.
district = pd.unique(train_data["PdDistrict"])
district

array(['NORTHERN', 'PARK', 'INGLESIDE', 'BAYVIEW', 'RICHMOND', 'CENTRAL',
       'TARAVAL', 'TENDERLOIN', 'MISSION', 'SOUTHERN'], dtype=object)

In [7]:
# Map each district name to a 1-based integer code, following the order of
# `district`.
district_dict = {name: code for code, name in enumerate(district, start=1)}
district_dict

{'NORTHERN': 1,
 'PARK': 2,
 'INGLESIDE': 3,
 'BAYVIEW': 4,
 'RICHMOND': 5,
 'CENTRAL': 6,
 'TARAVAL': 7,
 'TENDERLOIN': 8,
 'MISSION': 9,
 'SOUTHERN': 10}

In [8]:
# Weekday name -> integer code, Monday=1 through Sunday=7.
week_dict = dict(zip(
    ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
    range(1, 8),
))

In [9]:
# Encode the three categorical columns as integers using the lookup tables
# built in the cells above.
#
# The result is assigned back to the column rather than calling
# `train_data[col].replace(..., inplace=True)`: that chained form mutates a
# temporary Series, so with pandas Copy-on-Write enabled `train_data` is left
# unchanged, and the warning pandas raises about it is hidden by the blanket
# `warnings.filterwarnings('ignore')` at the top of the notebook.
#
# `astype("int64")` pins the integer dtype explicitly instead of relying on
# `replace` to downcast the object column, and fails loudly if a value was
# left unmapped. Re-running the cell is a no-op once the columns are integers.
for column, mapping in (
    ("Category", category_dict),
    ("PdDistrict", district_dict),
    ("DayOfWeek", week_dict),
):
    train_data[column] = train_data[column].replace(mapping).astype("int64")

In [10]:
# Make sure 'Dates' is datetime-typed, then split it into calendar features.
train_data['Dates'] = pd.to_datetime(train_data['Dates'])

dates_accessor = train_data['Dates'].dt
for feature_name, part in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
):
    train_data[feature_name] = getattr(dates_accessor, part)

In [11]:
# Remove the raw timestamp and the free-text columns from the frame in place.
# NOTE(review): re-running this cell raises KeyError because the columns are
# already gone.
train_data.drop(
    columns=['Dates', 'Descript', 'Resolution', 'Address'],
    inplace=True,
)

In [12]:
# Preview the first five rows of the encoded frame.
train_data.head(n=5)

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Year,Month,Day,Hour
0,1,3,1,-122.425892,37.774599,2015,5,13,23
1,2,3,1,-122.425892,37.774599,2015,5,13,23
2,2,3,1,-122.424363,37.800414,2015,5,13,23
3,3,3,1,-122.426995,37.800873,2015,5,13,23
4,3,3,2,-122.438738,37.771541,2015,5,13,23


In [13]:
# Separate the label column from the remaining feature columns.
y_train_ = train_data['Category']
X_train_ = train_data.drop(columns='Category')

In [15]:
# First five feature rows alongside their labels.
X_train_.head(5), y_train_.head(5)

(   DayOfWeek  PdDistrict           X          Y  Year  Month  Day  Hour
 0          3           1 -122.425892  37.774599  2015      5   13    23
 1          3           1 -122.425892  37.774599  2015      5   13    23
 2          3           1 -122.424363  37.800414  2015      5   13    23
 3          3           1 -122.426995  37.800873  2015      5   13    23
 4          3           2 -122.438738  37.771541  2015      5   13    23,
 0    1
 1    2
 2    2
 3    3
 4    3
 Name: Category, dtype: int64)

In [16]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_,
    y_train_,
    test_size=0.33,
    random_state=42,
)

In [17]:
# First five rows of the training split and their labels.
X_train.head(5), y_train.head(5)

(        DayOfWeek  PdDistrict           X          Y  Year  Month  Day  Hour
 617510          4           3 -122.441018  37.716459  2006      8   10     0
 451491          2          10 -122.402844  37.785718  2009      1    6     0
 354809          7           9 -122.412734  37.752642  2010      6    6    18
 329161          5           7 -122.476898  37.743915  2010     10   22    19
 798989          6          10 -122.413161  37.777457  2004      1   24     8,
 617510    6
 451491    6
 354809    2
 329161    3
 798989    3
 Name: Category, dtype: int64)

In [None]:
# Keep a handle on the frame's column labels and display them.
columns_train = train_data.columns
columns_train 

In [None]:
# Restrict the features to weekday, district and the two coordinate columns.
# NOTE: this rebinds X_train_ / y_train_, which an earlier cell defined from
# all non-label columns.
feature_columns = ["DayOfWeek", "PdDistrict", "X", "Y"]
X_train_ = train_data.loc[:, feature_columns]
y_train_ = train_data.loc[:, "Category"]

In [None]:
# Redo the 67/33 split on the reduced feature set, with the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_,
    y_train_,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Put training features and labels side by side, then show pairwise correlations.
data = pd.concat([X_train, y_train], axis="columns", join="inner")
data.corr()

In [None]:
# Skewness of each column of the combined training frame.
data.skew(axis=0)

In [None]:
# Split the training portion once more (67/33, same seed) for fitting and checking.
# NOTE(review): this overwrites X_train_ / y_train_, which the earlier split
# cell reads as its input. Re-running that cell after this one would split an
# already-reduced subset.
split_options = {"test_size": 0.33, "random_state": 42}
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_train, y_train, **split_options)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the inner training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_, y=y_train_)

In [None]:
# Predict a category code for every row of the inner held-out split.
predictions = knn.predict(X=X_test_)

In [None]:
# First five training rows/labels next to the first five predictions and the
# corresponding held-out labels.
X_train_.head(5), y_train_.head(5), predictions[:5], y_test_.head(5)