In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults: a tall 10x12 inch canvas and larger axes titles.
matplotlib.rcParams.update({
    'figure.figsize': (10.0, 12.0),
    'axes.titlesize': 18,
})


## Preparing the data for training


In [2]:
# Load the raw train and test CSVs; the 'Dates' column is parsed into datetimes
# so that its components can be extracted later via the `.dt` accessor.
train, test = (
    pd.read_csv(csvPath, parse_dates=['Dates'])
    for csvPath in ('input/train.csv', 'input/test.csv')
)

In [3]:
from sklearn.preprocessing import LabelEncoder
# Module-level encoders shared with clean():
#   categoryEncoder -> encodes the 'Category' column (the prediction target)
#   pdDistEncoder   -> encodes the 'PdDistrict' column
categoryEncoder, pdDistEncoder = LabelEncoder(), LabelEncoder()

def clean(df, isTrain=True):
    """Return a cleaned, model-ready copy of a raw data frame.

    The input frame is NOT modified: all work happens on a copy, so calling
    this function repeatedly on the same raw frame (e.g. re-running a notebook
    cell) always yields the same result and never re-fits an encoder on
    already-encoded values.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns 'Dates' (datetime dtype),
        'PdDistrict', 'DayOfWeek' and 'Address'. When ``isTrain`` is true it
        must also contain 'Category', 'Descript' and 'Resolution'.
    isTrain : bool, default True
        The train and test sets are cleaned differently: only the training
        set carries the target ('Category') and the 'Descript'/'Resolution'
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Category' (train only) and 'PdDistrict' label-encoded
        as integers, the date split into 'dayOfMonth', 'dayOfWeek', 'month',
        'year' and 'hour', and the raw text/date columns removed.

    Raises
    ------
    ValueError
        Raised by the encoder when a non-training frame contains a
        'PdDistrict' value that was not seen when the encoder was fitted.

    Notes
    -----
    Uses (and fits) the module-level ``categoryEncoder`` and ``pdDistEncoder``.
    """
    # Work on a copy so the caller's raw frame stays untouched.
    df = df.copy()

    ## transform the variable we will predict into numeric categories for our model.
    if isTrain:
        df['Category'] = categoryEncoder.fit_transform(df['Category'])

    ## parse the Dates variable into component parts so that our model can take
    ## advantage of that information
    dates = df['Dates'].dt
    df['dayOfMonth'] = dates.day
    df['dayOfWeek'] = dates.dayofweek
    df['month'] = dates.month
    df['year'] = dates.year
    df['hour'] = dates.hour

    ## transform the PdDistrict variable into numeric categories.
    ## The encoder is fitted on the training data only and then reused for the
    ## test data, so a given district always maps to the same integer code in
    ## both sets. For backward compatibility it is still fitted on a
    ## non-training frame if it has never been fitted before.
    if isTrain or not hasattr(pdDistEncoder, 'classes_'):
        pdDistEncoder.fit(df['PdDistrict'])
    df['PdDistrict'] = pdDistEncoder.transform(df['PdDistrict'])

    ## DayOfWeek and Dates are now redundant after we parsed the Dates field above, so we drop them.
    ## Address should be perfectly correlated with (X, Y), so we drop it.
    dropCols = ['DayOfWeek', 'Dates', 'Address']
    if isTrain:
        ## Descript and Resolution are only in the training set, so we drop them
        ## because they won't be helpful in modeling
        dropCols = ['Descript', 'Resolution'] + dropCols

    return df.drop(dropCols, axis=1)
    

In [4]:
# Clean the training set, then display the per-column count of missing values.
trainClean = clean(train, isTrain=True)
pd.isnull(trainClean).sum()

Category      0
PdDistrict    0
X             0
Y             0
dayOfMonth    0
dayOfWeek     0
month         0
year          0
hour          0
dtype: int64

In [5]:
# Preview the first five rows of the cleaned training set.
trainClean.head(n=5)

Unnamed: 0,Category,PdDistrict,X,Y,dayOfMonth,dayOfWeek,month,year,hour
0,37,4,-122.425892,37.774599,13,2,5,2015,23
1,21,4,-122.425892,37.774599,13,2,5,2015,23
2,21,4,-122.424363,37.800414,13,2,5,2015,23
3,16,4,-122.426995,37.800873,13,2,5,2015,23
4,16,5,-122.438738,37.771541,13,2,5,2015,23


In [6]:
# Clean the test set (no target column), then display the per-column count of missing values.
testClean = clean(test, False)
pd.isnull(testClean).sum()

Id            0
PdDistrict    0
X             0
Y             0
dayOfMonth    0
dayOfWeek     0
month         0
year          0
hour          0
dtype: int64

In [7]:
# Preview the first five rows of the cleaned test set.
testClean.head(n=5)

Unnamed: 0,Id,PdDistrict,X,Y,dayOfMonth,dayOfWeek,month,year,hour
0,0,0,-122.399588,37.735051,10,6,5,2015,23
1,1,0,-122.391523,37.732432,10,6,5,2015,23
2,2,4,-122.426002,37.792212,10,6,5,2015,23
3,3,2,-122.437394,37.721412,10,6,5,2015,23
4,4,2,-122.437394,37.721412,10,6,5,2015,23


In [8]:
# Persist both cleaned frames as CSV (without the index column).
for cleanedFrame, outPath in (
    (trainClean, 'input/trainClean.csv'),
    (testClean, 'input/testClean.csv'),
):
    cleanedFrame.to_csv(outPath, index=False)