In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

#VIPAX-WAAA-VQJS
#VIPAX-ATWO-EPOY
#VIPAX-ANAP-TTGK
#VIPAX-LAAK-TYQM
#VIPAX-FPDS-RFDR

In [None]:
# Load the training data straight from the zipped CSV and preview the first rows.
train_df = pd.read_csv('../input/sf-crime/train.csv.zip')
train_df.head()

In [None]:
# Load the test data straight from the zipped CSV and preview the first rows.
test_df = pd.read_csv('../input/sf-crime/test.csv.zip')
test_df.head()

In [None]:
# Summarise the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Summarise the test frame (columns, dtypes, non-null counts).
test_df.info()

In [None]:
# Per-column flag: does the training frame contain any missing value?
train_df.isna().any()

In [None]:
# Per-column flag: does the test frame contain any missing value?
test_df.isna().any()

In [None]:
# Remove the 'Resolution' column from the training frame, then spot-check one random row.
train_df = train_df.drop(columns='Resolution')
train_df.sample(n=1)

In [None]:
# Use the same, more descriptive timestamp column name in both frames.
date_rename = {'Dates': 'DateTime'}
train_df = train_df.rename(columns=date_rename)
test_df = test_df.rename(columns=date_rename)

In [None]:
# Convert the timestamp column to datetime64 in both frames, then confirm the dtypes.
train_df['DateTime'] = pd.to_datetime(train_df['DateTime'])
test_df['DateTime'] = pd.to_datetime(test_df['DateTime'])
for frame in (train_df, test_df):
    frame.info()

In [None]:
# IsDay is 1 when the incident hour is strictly between 6 and 20 (i.e. 7..19), else 0.
for frame in (train_df, test_df):
    incident_hour = frame.DateTime.dt.hour
    frame['IsDay'] = incident_hour.between(7, 19).astype('int64')


In [None]:
# Derive hour / month / year features from the timestamp in both frames.
# Year is stored as an offset from 2000 purely for convenience.
for frame in (train_df, test_df):
    stamp = frame.DateTime.dt
    frame['Hour'] = stamp.hour
    frame['Month'] = stamp.month
    frame['Year'] = stamp.year - 2000

train_df.sample(n=1)

In [None]:
# Weekday name -> 1..7 (Monday = 1), applied to both frames.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
days_to_int_dic = {name: position for position, name in enumerate(day_names, start=1)}

train_df['DayOfWeek'] = train_df['DayOfWeek'].map(days_to_int_dic)
test_df['DayOfWeek'] = test_df['DayOfWeek'].map(days_to_int_dic)

# Check which encoded values are present in the training data.
train_df.DayOfWeek.unique()

In [None]:
# Re-derive the calendar features with the full calendar year (2003, 2004, ...),
# which the per-year plots below use in their titles.
# This is applied to BOTH frames: previously only train_df was reset here, so the
# model was trained on Year = 2003..2015 but scored on test rows with Year - 2000.
for frame in (train_df, test_df):
    frame['Year'] = frame.DateTime.dt.year
    frame['Month'] = frame.DateTime.dt.month
    frame['Hour'] = frame.DateTime.dt.hour
train_df.info()

In [None]:
# Earliest and latest timestamps present in the training data.
train_df['DateTime'].min(), train_df['DateTime'].max()

In [None]:
# Cyclical encoding of the month (period 12): a point on the unit circle needs
# BOTH the sine and the cosine. The previous code wrote both into 'MonthE',
# so the sine was immediately overwritten and lost.
month_angle = (train_df.Month * 2 * np.pi) / 12
train_df['MonthE_sin'] = np.sin(month_angle)
train_df['MonthE_cos'] = np.cos(month_angle)
# 'MonthE' is kept with the value it ended up with before (the cosine component).
train_df['MonthE'] = train_df['MonthE_cos']

In [None]:
# Cyclical encoding of the weekday (period 7).
# DayOfWeek has already been converted to integers 1..7 in an earlier cell, so it
# is used directly; mapping it through a name->int dict again produced all-NaN.
# Sine and cosine go to separate columns: previously the cosine was computed from
# the sine output and overwrote it.
dow_angle = (train_df.DayOfWeek * 2 * np.pi) / 7
train_df['DayOfWeekE_sin'] = np.sin(dow_angle)
train_df['DayOfWeekE_cos'] = np.cos(dow_angle)
# 'DayOfWeekE' is kept as the cosine component, mirroring the other *E columns.
train_df['DayOfWeekE'] = train_df['DayOfWeekE_cos']

In [None]:
# Cyclical encoding of the hour (period 24): keep BOTH sine and cosine.
# The previous code wrote both into 'HourE', so the sine was overwritten and lost.
hour_angle = (train_df.Hour * 2 * np.pi) / 24
train_df['HourE_sin'] = np.sin(hour_angle)
train_df['HourE_cos'] = np.cos(hour_angle)
# 'HourE' is kept with the value it ended up with before (the cosine component).
train_df['HourE'] = train_df['HourE_cos']

In [None]:
# One-hot encode the police district in both frames (produces PdDistrict_* columns).
one_hot_columns = ['PdDistrict']
train_df = pd.get_dummies(train_df, columns=one_hot_columns)
test_df = pd.get_dummies(test_df, columns=one_hot_columns)
train_df.sample(n=2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the crime category as an integer target; cat_le.classes_ keeps the
# original labels in encoded order.
cat_le = LabelEncoder()
cat_le.fit(train_df.Category)
encoded_category = cat_le.transform(train_df.Category)
train_df['CategoryInt'] = pd.Series(encoded_category)
train_df.sample(n=5)

In [None]:
# InIntersection is 0 when the address text contains 'Block', otherwise 1.
for frame in (train_df, test_df):
    has_block = frame.Address.str.contains('Block')
    frame['InIntersection'] = (~has_block).astype('int64')

In [None]:
# Preview the engineered training frame.
train_df.head()

In [None]:
# List the column names now present in the training frame.
train_df.columns

In [None]:
# Model inputs: coordinates, time features, intersection flag and the one-hot districts.
base_features = ['X', 'Y', 'IsDay', 'DayOfWeek', 'Month', 'Hour', 'Year', 'InIntersection']
district_names = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
                  'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
feature_cols = base_features + [f'PdDistrict_{district}' for district in district_names]
target_col = 'CategoryInt'

# Training matrix and target.
train_x = train_df[feature_cols]
train_y = train_df[target_col]

# Test matrix, plus the row ids for the submission file.
test_ids = test_df['Id']
test_x = test_df[feature_cols]


In [None]:
# Number of incidents (non-null descriptions) recorded in each year.
as_per_year = train_df.groupby('Year').Descript.count()
as_per_year

In [None]:
# Line chart of yearly incident counts, with one tick per year.
year_ticks = as_per_year.index.values
ax = as_per_year.plot.line(xticks=year_ticks)
ax.set_ylabel('Count of Crimes')

In [None]:
# One horizontal bar chart per year: incident count for each day of the week.
# The figure size is passed as `figsize`; the previous `fig=(16,5)` keyword is not
# the size option, so the requested size was never applied.
for y in train_df.Year.unique():
    per_day = train_df[train_df['Year'] == y].groupby('DayOfWeek')['Descript'].count().sort_values()
    per_day.plot(kind='barh', title=f'Count of crimes per Day ({y})', figsize=(16, 5))
    plt.show()

In [None]:
# One horizontal bar chart per year: incident count for each month.
# Fixes: `figsize` instead of the ineffective `fig=` keyword, and a title that
# matches the Month grouping (it previously said "per Day").
for y in train_df.Year.unique():
    per_month = train_df[train_df['Year'] == y].groupby('Month')['Descript'].count().sort_values()
    per_month.plot(kind='barh', title=f'Count of crimes per Month ({y})', figsize=(16, 5))
    plt.show()

In [None]:
# Recompute IsDay from the Hour column with the SAME definition used when the
# feature was first built for both frames (6 < hour < 20). The previous `x > 7`
# made the plotted train_df.IsDay disagree with the model features and test_df.
train_df['IsDay'] = ((train_df.Hour > 6) & (train_df.Hour < 20)).astype('int64')

In [None]:
# One bar chart per year: incident count for night (IsDay = 0) vs day (IsDay = 1).
# Fixes: `figsize` instead of the ineffective `fig=` keyword, and a title that
# describes the day/night grouping (it previously said "per Hour").
for y in train_df.Year.unique():
    per_is_day = train_df[train_df['Year'] == y].groupby('IsDay')['Descript'].count().sort_values()
    ax = per_is_day.plot(kind='bar', title=f'Count of crimes by Day/Night ({y})', figsize=(18, 7))
    plt.show()

In [None]:
# Preview the training frame before the per-category analysis.
train_df.head(5)

In [None]:
# Alphabetically sorted list of distinct crime categories, and how many there are.
cats = np.sort(train_df.Category.unique())
cats, len(cats)

In [None]:
# Share of all incidents (in percent) that falls into each category.
crimes_per_cat = (train_df.groupby('Category')['Descript'].count() / len(train_df)) * 100
# Sanity check with a float-tolerant comparison: int(...) truncates, so a sum of
# 99.9999999 caused by rounding would have failed the old `int(sum) == 100` test.
assert np.isclose(crimes_per_cat.sum(), 100), 'category shares do not add up to 100%'
crimes_per_cat = crimes_per_cat.sort_values()
crimes_per_cat

In [None]:
# Horizontal bar chart of the per-category percentages.
ax = crimes_per_cat.plot.barh(figsize=(12, 12))
ax.set_xlabel('Crime Percentage')

In [None]:
# For each year, chart the number of incidents per category (ascending).
years = train_df.Year.unique()
for y in years:
    in_year = train_df[train_df.Year == y]
    category_counts = in_year.groupby('Category')['Descript'].count().sort_values()
    category_counts.plot.barh(figsize=(12, 12), title=f'Crime Counts by Category for year {y}')
    plt.show()

In [None]:
# For each encoded weekday, chart the number of incidents per category (ascending).
days_of_week = train_df.DayOfWeek.unique()
for d in days_of_week:
    on_day = train_df[train_df.DayOfWeek == d]
    category_counts = on_day.groupby('Category')['Descript'].count().sort_values()
    category_counts.plot.barh(figsize=(13, 13), title=f'Crime Counts by Category for Day {d} (2003 ~ mid 2015)')
    plt.show()

In [None]:
# Category x IsDay contingency table over the whole training set, as grouped bars.
ct = pd.crosstab(index=train_df.Category, columns=train_df.IsDay)
ct.plot.barh(figsize=(14, 14), title='Crime Categories by Day/Night (2003 ~ mid 2015)')

In [None]:
# Same Category x IsDay breakdown, one chart per year.
for y in years:
    in_year = train_df[train_df.Year == y]
    ct = pd.crosstab(in_year.Category, in_year.IsDay)
    ct.plot.barh(figsize=(14, 14), title=f'Crime Categories by Day/Night ({y})')
    plt.show()

In [None]:
# Show the container types of the feature matrix and the target before handing them to XGBoost.
type(train_x), type(train_y)

In [None]:
import xgboost as xgb
# Wrap the feature matrices in XGBoost's DMatrix container; only the training
# matrix carries labels.
train_xgb = xgb.DMatrix(data=train_x, label=train_y)
test_xgb = xgb.DMatrix(data=test_x)

In [None]:
# Booster configuration for the multi-class model.
params = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.3,  # learning rate (step-size shrinkage applied after each boosting round)
    'verbosity': 0,  # quiet logging; replaces the deprecated 'silent' flag
    'objective': 'multi:softprob',  # multi-class objective that outputs one probability per class
    # Derived from the fitted label encoder so it always matches the encoded target
    # instead of relying on a hard-coded class count.
    'num_class': len(cat_le.classes_),
}

In [None]:
# Toggle: run 3-fold cross-validation instead of producing a submission.
CROSS_VAL = False
if CROSS_VAL:
    print('Doing Cross-validation ...')
    cv = xgb.cv(params, train_xgb, nfold=3, early_stopping_rounds=10, metrics='mlogloss', verbose_eval=True)
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the result table is printed explicitly.
    print(cv)

In [None]:
# Train on the full training set and write the submission file, unless the
# cross-validation toggle is on.
SUBMIT = not CROSS_VAL
if SUBMIT:
    print('Fitting Model ...')
    m = xgb.train(params, train_xgb, num_boost_round=10)
    # One row per test sample, one probability column per class.
    res = m.predict(test_xgb)
    # Columns follow the label encoder's class order, which is the order of the
    # integer targets the model was trained on.
    # (Removed the unused `cols = ['Id'] + cat_le.classes_`: list + ndarray is an
    # element-wise operation, not a concatenation, and its result was never used.)
    submission = pd.DataFrame(res, columns=cat_le.classes_)
    submission.insert(0, 'Id', test_ids)
    submission.to_csv('sampleSubmission.csv', index=False)
    print('Done Outputing !')
    print(submission.sample(3))
else:
    print('NOT SUBMITING')