## [San Francisco Crime Classification(Kaggle)](https://www.kaggle.com/c/sf-crime)

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image, display
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

  if 'order' in inspect.getargspec(np.copy)[0]:


#### Load train and test data from csv files.

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

#### Observe top 2 records just as a matter of data preview.

In [None]:
train.head(2)

In [None]:
test.head(2)

#### Get more detailed statistics for all features.

In [None]:
train.describe(include='all')

#### Some features have smaller set of values. Meaning they are categorical by nature so it's better convert them to [pandas category type](https://pandas-docs.github.io/pandas-docs-travis/categorical.html).

In [None]:
train.Category = train.Category.astype('category')
train.DayOfWeek = train.DayOfWeek.astype('category')
train.PdDistrict = train.PdDistrict.astype('category')
train.Resolution = train.Resolution.astype('category')

Y is the Latitude and X - Longitude. Would be better to rename those features to avoid confusion.

In [None]:
train['Longitude'] = train.X
train['Latitude'] = train.Y
train = train.drop(['X', 'Y'], axis = 1)
train.head(2)

#### Describe shows maximum Latitude value is 90. This is suspicious since 90 should be somewhere in the North Pole. Looks like an invalid data entry. Let's how many such entries are there. 

In [None]:
train[train.Latitude == 90].describe(include='all')

#### 67 of them! There are several approaches how to deal with this broken feature. For now, I am going exclude them.

In [None]:
train = train.drop(train[train.Latitude == 90].index)

In [None]:
train.describe(include='all')

In [None]:
ct = pd.crosstab(train.Category, train.DayOfWeek)

### Word cloud of crime categories.

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
plt.figure(figsize=(15,10))
wordcloud = WordCloud(width=900, height=600, background_color='white')\
.generate_from_frequencies(train.groupby(['Category']).count().Dates.iteritems())
img=plt.imshow(wordcloud)
plt.axis("off")
plt.show() 

## Baseline model

It's time to create a baseline model. This should be a simple model used to calculate baseline accuracy. 

Let's take for the baseline model an assumption that crime category doesn't depend on any features and its values is always the most frequent category. The most popular is **LARCENY/THEFT**

In [None]:
train.groupby(['Category']).count().sort_values('Dates', ascending=False).head(1)

In [None]:
test['Category'] = 'LARCENY/THEFT'

In [None]:
test.head(2)

In [None]:
testToWrite = pd.DataFrame(0, index=np.arange(len(test)), columns=sorted(train.Category.astype(str).unique()))
testToWrite.index.name = 'Id'
testToWrite['LARCENY/THEFT'] = 1
testToWrite.head(2)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
trainTest = train[int(-len(train)*0.3):]

In [None]:
accuracy_score(trainTest.Category, ['LARCENY/THEFT' for c in range(0, len(trainTest))])

In [None]:
testToWrite.to_csv('baseline.csv', sep=',', index=True)

Kaggle estimated accuracy of the baseline model as **27.62709** which landed me on **1261** position.
![alt text](https://raw.githubusercontent.com/stormy-ua/Kaggle/master/SanFranciscoCrime/images/BaselineKaggle.png)

## District-based model

Category vs District crosstabulation followed by extracting most frequent crime category in a district produces the following table showing the most popular crime in a district.

In [None]:
train.groupby(['PdDistrict', 'Category']).count().sort_values('Dates', ascending=False)
categoryInDistricts = pd.crosstab(train.Category, train.PdDistrict)\
.apply(lambda c: c.sort_values(ascending=False).index[0], axis=0).to_dict()
categoryInDistricts

In [None]:
test['Category'] = test.PdDistrict.apply(lambda x: categoryInDistricts[x])

In [None]:
accuracy_score(trainTest.Category, trainTest.PdDistrict.apply(lambda x: categoryInDistricts[x]))

In [None]:
mapdf = pd.DataFrame(0, index = sorted(train.Category.unique()), columns = sorted(train.Category.unique()))
np.fill_diagonal(mapdf.values,1)

In [None]:
testToWrite = test.Category.apply(lambda x: mapdf[x])
testToWrite.index.name = 'Id'

In [None]:
testToWrite.to_csv('districtBased.csv', sep=',', index=True)

District based model got **26.88727** score and moved me 35 positions up. Not a big improvement.
![alt text](https://raw.githubusercontent.com/stormy-ua/Kaggle/master/SanFranciscoCrime/images/districtBased.png)

### Geo-based model

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
geoBasedTrain = train.copy()
minMaxScaler = MinMaxScaler(feature_range=(-1, 1))
geoBasedTrain.Latitude = minMaxScaler.fit_transform(train.Latitude)
geoBasedTrain.Longitude = minMaxScaler.fit_transform(train.Longitude)

In [None]:
disticts = geoBasedTrain.PdDistrict.unique()
distictsColorMap = {d: i for i, d in zip(range(0, len(disticts)), disticts)}

In [None]:
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(1,1,1)
colors = geoBasedTrain.PdDistrict.apply(lambda x: distictsColorMap[x])
ax.scatter(geoBasedTrain.Longitude, geoBasedTrain.Latitude, c = colors)

### Datetime-based model

In [None]:
from datetime import datetime
import bisect

In [None]:
train.Dates = train.Dates.apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

In [None]:
datetimeModel = train.copy()

In [None]:
datetimeModel['HoursRange'] = datetimeModel.Dates.apply(lambda x: bisect.bisect(range(0, 24, 2), x.hour))

In [None]:
pd.crosstab(datetimeModel.Category, datetimeModel.HoursRange)

In [None]:
categories = geoBasedTrain.Category.unique()
categoriesColorMap = {d: i for i, d in zip(range(0, len(categories)), categories)}

### Geo-based model (Nearest neighbors)

In [66]:
knnTrain = train.copy()
knnTest = test.copy()

In [46]:
knnTrain = knnTrain.drop(['Descript', 'Resolution', 'Address'], axis=1)

In [None]:
knnTrain.head(2)

In [47]:
knnTest = knnTest.drop(['Address'], axis=1)

In [None]:
knnTest['Longitude'] = knnTest.X
knnTest['Latitude'] = knnTest.Y
knnTest = knnTest.drop(['X', 'Y'], axis=1)

In [None]:
knnTest.head(2)

In [None]:
knnModel = KNeighborsClassifier(n_neighbors=5)

In [None]:
knnModel.fit(knnTrain.iloc[:, [4,5]], knnTrain.iloc[:, 1])

In [None]:
testCategories = knnModel.predict(test.iloc[:, [5,6]])

In [None]:
knnTest['Category'] = testCategories

In [None]:
# move this to a separate function
mapdf = pd.DataFrame(0, index = sorted(train.Category.unique()), columns = sorted(train.Category.unique()))
np.fill_diagonal(mapdf.values,1)

In [None]:
testToWrite = knnTest.Category.apply(lambda x: mapdf[x])
testToWrite.index.name = 'Id'

In [None]:
testToWrite.to_csv('geo_based_knn.csv', sep=',', index=True)

![](https://raw.githubusercontent.com/stormy-ua/Kaggle/master/SanFranciscoCrime/images/geo_based_knn.png)

In [67]:
knnMerged = knnTrain.iloc[:, [0, 3, 4, 6, 7, 8]].copy().append(knnTest.iloc[:, 1:])
TRAIN_RANGE = len(knnTrain)

Convert categorical variable into dummy/indicator variables

In [68]:
minMaxScaler = MinMaxScaler(feature_range=(0, 1))
knnMerged['XNorm'] = minMaxScaler.fit_transform(knnMerged.X)
knnMerged['YNorm'] = minMaxScaler.fit_transform(knnMerged.Y)

In [69]:
knnMerged = knnMerged.merge(pd.get_dummies(knnMerged['DayOfWeek'], prefix='DoW', prefix_sep='_'),\
                left_index=True, right_index=True)

In [None]:
#knnMerged = knnMerged.merge(pd.get_dummies(knnMerged['PdDistrict'], prefix='Distr', prefix_sep='_'),\
#                left_index=True, right_index=True)

In [70]:
knnMerged = knnMerged.drop(['DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y'], axis=1)

In [74]:
knnMerged.iloc[TRAIN_RANGE:, 1:10].head()

Unnamed: 0,XNorm,YNorm,DoW_Friday,DoW_Monday,DoW_Saturday,DoW_Sunday,DoW_Thursday,DoW_Tuesday,DoW_Wednesday
219512,0.051844,0.001406,0,1,0,0,0,0,0
219512,0.052143,0.001273,0,1,0,0,0,0,0
219512,0.052143,0.001273,0,1,0,0,0,0,0
219513,0.035282,0.001175,0,1,0,0,0,0,0
219513,0.035282,0.001175,0,1,0,0,0,0,0


In [9]:
from sklearn.linear_model import LogisticRegression

In [72]:
knnModel = LogisticRegression()

In [73]:
knnModel.fit(knnMerged.iloc[:TRAIN_RANGE, 1:10], knnTrain['Category'])

  args, varargs, kw, default = inspect.getargspec(init)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [76]:
predicted = knnModel.predict(knnMerged.iloc[TRAIN_RANGE:, 1:10])

In [77]:
pd.Series(predicted).unique()

array(['LARCENY/THEFT'], dtype=object)

In [63]:
def saveTestDf(train, test, categories, path):
    predictions = pd.Series(categories)
    mapdf = pd.DataFrame(0, index = sorted(train.Category.unique()), columns = sorted(train.Category.unique()))
    np.fill_diagonal(mapdf.values,1)
    testToWrite = pd.Series(categories).apply(lambda x: mapdf[x])
    testToWrite.index.name = 'Id'
    testToWrite.to_csv(path, sep=',', index=True)

In [64]:
saveTestDf(knnTrain, knnTest, predicted, 'geo_district_based_logReg.csv')