In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, in the order os.walk visits the directories.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test splits; both are zipped CSV files read directly by pandas.
train = pd.read_csv("/kaggle/input/sf-crime/train.csv.zip")
test = pd.read_csv("/kaggle/input/sf-crime/test.csv.zip")

In [None]:
# (rows, columns) of the raw train set.
train.shape

In [None]:
# Column dtypes and non-null counts for the train set.
# Author's observation from this output: the train set has no missing values.
train.info()

In [None]:
# Same overview for the test set, to compare its columns with the train set.
test.info()

In [None]:
#dropping Resolution and Descript columns from train set as they are not present in test set and hence
#cannot be used for analysis
train.drop(['Descript','Resolution'], axis = 1,inplace=True)

In [None]:
train.info()

In [None]:
#check and remove duplicate values if any
train.duplicated().sum()

In [None]:
train.drop_duplicates(inplace=True)

In [None]:
BBox = train.X.min(),train.X.max(),train.Y.min(),train.Y.max()

In [None]:
BBox

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 9))

# Whole-world extent (Miller projection, crude resolution) so that every
# recorded coordinate pair is visible, wherever it falls on the globe.
world_extent = dict(llcrnrlat=-90, urcrnrlat=90, llcrnrlon=-180, urcrnrlon=180)
m = Basemap(projection='mill', resolution='c', **world_extent)

m.drawcoastlines()

# X is passed as the first (x) coordinate and Y as the second; latlon=True lets
# Basemap convert them to map-projection coordinates.
m.scatter(train['X'].tolist(), train['Y'].tolist(), latlon=True, s=300, c='red')

In [None]:
import seaborn as sns


In [None]:
count = train['Category'].value_counts()

In [None]:
#Maximum crimes belong to category : Larceny/Theft
count.plot(kind='bar', figsize = (10,10) , color = 'black')

In [None]:
time=[]
for i in train['Dates']:
    x=i.split()
    time.append(x[1])
time=pd.Series(time)    

In [None]:
#top 20 times when the occurence of crime was the highest
#the plot suggests the maximum crimes occur at 12 noon and 12 midnight
time.value_counts()[:20].plot(kind='bar',figsize=(8,8))

In [None]:
# Incidents per day of the week.
# Author's reading of the chart: the counts differ little between days, with
# Friday the highest.
sns.set()
sns.countplot(x='DayOfWeek', data=train)

In [None]:
def transformDataset(dataset, reference_date=None):
    """Turn a raw crime DataFrame into numeric, model-ready features.

    The input must contain the columns 'Dates' (parseable timestamps),
    'Address' and 'PdDistrict'; every other column is passed through untouched.

    Derived columns:
        n_days      whole days between each row's date and the reference date
        Year, Month, Hour, WeekOfYear (ISO week), DayOfWeek (Monday=0;
        overwrites any existing 'DayOfWeek' column)
        Block       1 if 'Address' contains "block" (case-insensitive), else 0
        PdDistrict_* indicator columns for 'PdDistrict' (first level dropped)

    'Dates' and 'Address' are removed from the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data. It is NOT modified; a transformed copy is returned.
    reference_date : date-like, optional
        Origin for 'n_days'. When omitted, the earliest date found in
        `dataset` is used, which means two frames transformed separately get
        different origins unless the same `reference_date` is passed to both.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    # Work on a copy so the caller's frame is left exactly as it was.
    dataset = dataset.copy()
    dataset['Dates'] = pd.to_datetime(dataset['Dates'])
    stamps = dataset['Dates']

    # Midnight-normalised timestamps give whole-day differences without the
    # slow object-dtype round trip through datetime.date.
    days = stamps.dt.normalize()
    if reference_date is None:
        origin = days.min()
    else:
        origin = pd.Timestamp(reference_date).normalize()
    dataset['n_days'] = (days - origin).dt.days

    dataset['Year'] = stamps.dt.year
    dataset['DayOfWeek'] = stamps.dt.dayofweek  # replaces the textual day name, if present
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # replacement. Cast from nullable UInt32 to plain int64 as before.
    dataset['WeekOfYear'] = stamps.dt.isocalendar().week.astype('int64')
    dataset['Month'] = stamps.dt.month

    dataset['Hour'] = stamps.dt.hour

    # Missing addresses count as "no block" (0), matching the previous mapping.
    has_block = dataset['Address'].str.contains('block', case=False, na=False)
    dataset['Block'] = has_block.astype('int64')

    # Keyword `columns=`: the positional axis argument is gone in pandas 2.0.
    dataset = dataset.drop(columns=['Dates', 'Address'])

    # Explicit uint8 keeps the indicators numeric on every pandas version
    # (pandas 2.x defaults to bool, which turns a mixed frame into an object
    # array when converted to NumPy).
    return pd.get_dummies(
        data=dataset, columns=['PdDistrict'], drop_first=True, dtype=np.uint8
    )

In [None]:
train = transformDataset(train)
test = transformDataset(test)

In [None]:
X = train.drop("Category",axis=1)
y = train["Category"]

In [None]:
Y = pd.get_dummies(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for validation (scikit-learn's default test size
# applies). Fixing random_state makes the split, and every metric computed on
# it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Activation

In [None]:
# Take the layer sizes that depend on the data from the data itself, so the
# network stays in sync with the feature matrix X and the one-hot target Y
# instead of relying on hard-coded counts.
n_features = X.shape[1]
n_classes = Y.shape[1]

model = Sequential()

# Three fully connected hidden layers (100 -> 80 -> 60 units), each followed
# by a ReLU activation.
model.add(Dense(100, input_shape=(n_features,)))
model.add(Activation('relu'))

model.add(Dense(80))
model.add(Activation("relu"))

model.add(Dense(60))
model.add(Activation("relu"))

# Softmax output layer: one probability per target column.
model.add(Dense(n_classes))
model.add(Activation("softmax"))

model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# target Y is one-hot encoded), and accuracy reported as an extra metric.
model.compile(optimizer='adam',
             loss = "categorical_crossentropy",
             metrics=['accuracy'])


In [None]:
# Train on the training split only and validate on the held-out split.
# Fitting on the full (X, Y) while validating on (X_train, Y_train) would score
# the model on rows it was trained on, so the reported validation loss and
# accuracy would say nothing about generalisation.
model.fit(X_train, Y_train,
          batch_size=256,
          epochs=10,
          verbose=2,
          validation_data=(X_test, Y_test))

In [None]:
preds_vals = model.predict(test.drop("Id",axis=1))
preds = pd.DataFrame(data=preds_vals,columns=Y.columns)
preds

In [None]:
new_df = pd.DataFrame(np.where(preds.T == preds.T.max(), 1, 0),index=preds.columns).T
new_df

In [None]:
new_df['Id'] = test["Id"]

In [None]:
cols = list(new_df.columns)
cols = [cols[-1]] + cols[:-1]
new_df = new_df[cols]

In [None]:
new_df

In [None]:
new_df.to_csv('../working/submission.csv', index=False)