# Kaggle comp demo

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

We selected the SF Crime competition

https://www.kaggle.com/c/sf-crime/data

In [4]:
# Read the three competition files from the working directory in one pass.
train, test, sample_submission = map(
    pd.read_csv, ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

# Names of all columns present in the training file.
train_features = train.columns.tolist()

# Preview the head of each dataset.
print('Printing the first 5 rows of train data: \n', train.head())
print('Printing the first 5 rows of test data: \n', test.head())

Printing the first 5 rows of train data: 
                  Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  
0 -122.425892  37.774599  
1 -122.425892  37.774599  
2 -122.424363  37.8004

(from Kaggle)
## Data fields
- Dates - timestamp of the crime incident
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
- Descript - detailed description of the crime incident (only in train.csv)
- DayOfWeek - the day of the week
- PdDistrict - name of the Police Department District
- Resolution - how the crime incident was resolved (only in train.csv)
- Address - the approximate street address of the crime incident 
- X - Longitude
- Y - Latitude

The variables we can use to build the model must be present in both the train and test datasets. This gives us:
- Dates
- DayOfWeek
- PdDistrict
- Address
- X
- Y

In [5]:
# Target vector: the Category value of every training row, as a NumPy array.
train_labels = np.array(train.loc[:, 'Category'])

# Feature extraction for the baseline submission: the model is trained on
# nothing but the longitude (X) and latitude (Y) coordinates.
def preProcess(df):
    """Return the X and Y columns of ``df`` as a 2-D NumPy array (one row per record)."""
    coordinate_columns = ['X', 'Y']
    return df.loc[:, coordinate_columns].values
    
# Preprocess the training and test data
train_data = preProcess(train)
test_data = preProcess(test)

# Shuffle the data by first creating a list of shuffled row indices.
# A fixed-seed RandomState makes the split reproducible on a fresh kernel and
# leaves NumPy's global random state untouched.
size = train_data.shape[0]
shuffle_index = list(range(size))
np.random.RandomState(0).shuffle(shuffle_index)

# Apply the same permutation to features and labels so rows stay aligned.
train_data = train_data[shuffle_index,:]
train_labels = train_labels[shuffle_index]

# Hold out the last 20% of the shuffled rows as dev data. The dev slices are
# taken BEFORE the training arrays are truncated: slicing [n_train:] from an
# array that has already been cut down to n_train rows always yields an empty
# dev set.
n_train = int(0.8 * size)
dev_data = train_data[n_train:,:]
dev_labels = train_labels[n_train:]
train_data = train_data[:n_train,:]
train_labels = train_labels[:n_train]

# Mini training set (first 1000 shuffled rows) so models are quick to fit.
mini_train_data = train_data[:1000,:]
mini_train_labels = train_labels[:1000]


In [6]:
# Baseline: a 1-nearest-neighbour classifier on the X/Y coordinates only,
# fitted on the mini training set (fit() returns the estimator itself).
kn = KNeighborsClassifier(n_neighbors=1).fit(mini_train_data, mini_train_labels)

# Predict a category for every test row and show the predictions.
preds = kn.predict(test_data)
print(preds)

['NON-CRIMINAL' 'OTHER OFFENSES' 'LARCENY/THEFT' ... 'VEHICLE THEFT'
 'VANDALISM' 'VEHICLE THEFT']


In [7]:
# Looking at format of output
print(sample_submission.head())

# Pair each test-row Id with its predicted category label.
all_ids = test["Id"].values
d = {'Id': all_ids, 'Label': preds}

# One-hot encode the predicted label: one indicator column per category that
# was actually predicted. The empty prefix/separator makes each column name
# equal to the category name.
df = pd.DataFrame(data = d)
my_results = pd.get_dummies(df, prefix=[''],prefix_sep = '')

# Start from an all-zero frame with the schema of sample_submission so that
# categories the model never predicted are still present as columns.
my_submission = pd.DataFrame().reindex_like(sample_submission).fillna(0).astype(int)

col_names = list(my_results.columns.values)
for col in col_names:
    # get_dummies returns bool indicators on pandas >= 2.0; cast to int so the
    # CSV contains 0/1 rather than True/False.
    my_submission[col] = my_results[col].astype(int)


print('final shape',my_submission.shape )
my_submission.to_csv('my_submission_practice_nk1.csv', index = False)

   Id  ARSON  ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0      0        0           0        0         0                   0   
1   1      0        0           0        0         0                   0   
2   2      0        0           0        0         0                   0   
3   3      0        0           0        0         0                   0   
4   4      0        0           0        0         0                   0   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                            0              0            0     ...        
1                            0              0            0     ...        
2                            0              0            0     ...        
3                            0              0            0     ...        
4                            0              0            0     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  SUICIDE  SUSPICIOUS OCC  TREA  \
0           

In [None]:
# For later, we will consider adding these into our preProcess function
# train['Dates'] = pd.to_datetime(train['Dates'],infer_datetime_format=True)
# train["dow"] = train["Dates"].dt.dayofweek
# train["unix_date"] = train['Dates'].astype(np.int64) // 10 ** 9
# PdDistrict = pd.factorize(train["PdDistrict"])
# pd_ids = PdDistrict[0]
# pd_names = PdDistrict[1]
# print(pd_names)
# print(train.dtypes)
# print(train.head())