#  MIDS W207 
## Baseline submission
## Authors: Neha Kumar, Suhas Gupta
## Submission Data: 03/16/2019 

In [25]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

We selected the SF Crime competition

https://www.kaggle.com/c/sf-crime/data

In [26]:
## Import CSV files into pandas data frames

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sampleSubmission.csv')


In [27]:
## Lets look at the train data frame
train[:5]

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [28]:
## Look at the test data frame 
test[:5]

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


(from Kaggle)
## Data fields
- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

The variables we can use to generate the model will need to be in both the train and test dataset. This gives us:
- Dates
- DayOfWeek
- PdDistrict
- Address
- X
- Y

In [29]:
## Split data frames to create data and label numpy arrays
'''
    Convert the pandas data frames to numpy array for using with sklearn.
    We will use the following variable names ->
    X_train: training feature data 
    Y_train: training label data
'''
    
## Split the data frame (category is the predicted variable)
data_train_df = train.drop(['Category','Descript','Resolution'],axis=1)
labels_train_df = pd.DataFrame(train, columns = ['Category'])

## Remove ID column from test data
test_data = test.drop(['Id'],axis=1).values

## Convert data frame to numpy arrays
### change X and Y to features & labels

data_train = data_train_df.values
labels_train = labels_train_df.values
###########################

## Store the feature names in a list
train_features_names = list(data_train_df.keys())

print('Shape of training data: {train_shp1}'.format(train_shp1=data_train.shape))
print('Shape of training labels: {train_shp2}'.format(train_shp2=data_train.shape))
print('List of training Features:', train_features_names)
print('Shape of test data:', test_data.shape)

Shape of training data: (878049, 6)
Shape of training labels: (878049, 6)
List of training Features: ['Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']
Shape of test data: (884262, 6)


In [36]:
## Preprocess the categorical data by encoding to numerical values
enc = preprocessing.LabelEncoder()

## Encode the output labels in training data set and store the class names 
labels_train_encoded = enc.fit_transform(np.ravel(labels_train.reshape(-1,1)))
data_classes = enc.classes_

data_train_encoded = data_train
for k in range(data_train.shape[1]):
    data_train_encoded[:,k] = enc.fit_transform(np.ravel(data_train[:,k].reshape(-1,1)))
print(data_train_encoded[:5,:])

[[389256 6 4 19790 18787 23130]
 [389256 6 4 19790 18787 23130]
 [389255 6 4 22697 19317 32784]
 [389254 6 4 4266 18403 32936]
 [389254 6 5 1843 13848 21892]]


In [37]:
# Getting the data ready for analysis. Here, for the baseline submission, we will just train the model on the latitude and logitude coordinates
## Not using the custom pre processing function for baseline submission
'''def preProcess(df):
    result = df[['X','Y']]
    return result.values
'''


# Shuffle the data using the format shared in class - first creating a list of shuffled indices
random_seed = 1

## Shuffle training data by applying shuffled indices
#train_dev_data = train_dev_data[shuffle_index,:]
#train_dev_labels = train_dev_labels[shuffle_index

# Split into training, dev and mini training data sets
train_data, dev_data, train_labels, dev_labels = train_test_split(
        data_train_encoded, labels_train_encoded, test_size=0.20, random_state=random_seed)
mini_train_data, mini_train_labels = train_data[:1000,:], train_labels[:1000]

print(dev_data.shape)
print(train_data.shape)
print(mini_train_data.shape)

(175610, 6)
(702439, 6)
(1000, 6)


In [38]:
# baseline model is just a knn of 1 for just latitude and longitude
neighbors = 1
kn = KNeighborsClassifier(n_neighbors=neighbors)
kn.fit(train_data, train_labels)

# Create predicted values
#preds_dev = kn.predict(dev_data)
score = kn.score(dev_data, dev_labels)
#proba = kn.predict_proba(dev_data)

print('Mean score from KNN with {nn} neighbors: {score}'.format(nn=neighbors, score=score))

Mean score from KNN with 1 neighbors: 0.23330106485963215


In [39]:
## Run predictions on test data (need to encode before using)
test_data_encoded = test_data
for k in range(test_data.shape[1]):
    test_data_encoded[:,k] = enc.fit_transform(np.ravel(test_data[:,k].reshape(-1,1)))
preds = kn.predict(test_data_encoded)

In [40]:
## Print formatted output for submission CSV

# Looking at format of output
print(sample_submission.head())
print('sample submission shape', sample_submission.shape)

# Now put results in a dataframe 
all_ids = test["Id"].values

# From encoded categories, get normal labels
labeled_preds = [data_classes[i] for i in preds]

# Make dateframe
d = {'Id': all_ids, 'Label': labeled_preds}

# Make the results dummy columns
df = pd.DataFrame(data = d)
results = pd.get_dummies(df, prefix=[''],prefix_sep = '')

# To ensure we have all columns, copy data into the schema of sample_submission
submission = pd.DataFrame().reindex_like(sample_submission).fillna(0).astype(int)

col_names = list(results.columns.values)
for col in col_names:
    submission[col] = results[col]

    
print('final shape',submission.shape )
## Save the predictions to submission CSV file
submission.to_csv('baseline_submission.csv', index = False)

#print(submission)

   Id  ARSON  ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0      0        0           0        0         0                   0   
1   1      0        0           0        0         0                   0   
2   2      0        0           0        0         0                   0   
3   3      0        0           0        0         0                   0   
4   4      0        0           0        0         0                   0   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                            0              0            0     ...        
1                            0              0            0     ...        
2                            0              0            0     ...        
3                            0              0            0     ...        
4                            0              0            0     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  SUICIDE  SUSPICIOUS OCC  TREA  \
0           