Note: This Notebook is part of the [SF-Crime Analysis & Prediction](https://www.kaggle.com/yannisp/sf-crime-analysis-prediction). Please start there for a full analysis.

# Importing libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import numpy as np

# Loading the data

In [None]:
# Read both competition files, parsing the 'Dates' column as datetimes.
# The test file's 'Id' column becomes its index.
date_columns = ['Dates']
train = pd.read_csv('../input/train.csv', parse_dates=date_columns)
test = pd.read_csv(
    '../input/test.csv',
    index_col='Id',
    parse_dates=date_columns,
)

# Wrangling the dataset

In [None]:
# Remove exact duplicate rows from the training data.
train.drop_duplicates(inplace=True)

# X == -120.5 and Y == 90.0 are turned into missing values so they can be
# imputed below (presumably placeholder coordinates -- confirm against the data).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train.replace({'X': -120.5, 'Y': 90.0}, np.nan, inplace=True)
test.replace({'X': -120.5, 'Y': 90.0}, np.nan, inplace=True)

imp = SimpleImputer(strategy='mean')
coords = ['X', 'Y']

# Fill missing coordinates with the per-district mean. The imputer is fitted
# on the training rows of each district and re-used for the matching test rows.
for district in train['PdDistrict'].unique():
    train_rows = train['PdDistrict'] == district
    test_rows = test['PdDistrict'] == district

    train.loc[train_rows, coords] = imp.fit_transform(
        train.loc[train_rows, coords])

    # transform() rejects 0-row input, so skip districts absent from test.
    if test_rows.any():
        test.loc[test_rows, coords] = imp.transform(
            test.loc[test_rows, coords])

# Feature Engineering

In [None]:
def feature_engineering(data):
    """Derive calendar and address features from 'Dates' and 'Address'.

    The frame is modified in place and also returned.

    Columns added:
        n_days     -- whole days elapsed since the earliest date *in this
                      frame*; two frames with different earliest dates
                      therefore use different origins.
        Day, DayOfWeek, Month, Year, Hour, Minute -- components of 'Dates'
                      ('DayOfWeek' is the integer weekday, Monday == 0, and
                      overwrites any existing column of that name).
        Block      -- whether 'Address' contains "block" (case-insensitive).

    Columns removed: 'Dates', 'Address'.

    Args:
        data: DataFrame with a datetime 'Dates' column and a string
            'Address' column.

    Returns:
        The same DataFrame object, with the columns described above.
    """
    timestamps = data['Dates']

    # Midnight of each timestamp; only needed to count whole days.
    dates = timestamps.dt.normalize()
    # Vectorised day count instead of a per-row Python lambda.
    data['n_days'] = (dates - dates.min()).dt.days

    data['Day'] = timestamps.dt.day
    data['DayOfWeek'] = timestamps.dt.weekday
    data['Month'] = timestamps.dt.month
    data['Year'] = timestamps.dt.year
    data['Hour'] = timestamps.dt.hour
    data['Minute'] = timestamps.dt.minute
    data['Block'] = data['Address'].str.contains('block', case=False)

    # The raw columns are no longer needed once the features are extracted.
    data.drop(columns=['Dates', 'Address'], inplace=True)

    return data

In [None]:
# Build the engineered features for both sets. 'Descript' and 'Resolution'
# are dropped from the training set only.
train = feature_engineering(train).drop(columns=['Descript', 'Resolution'])
test = feature_engineering(test)

# Encoding Categorical Variables

In [None]:
# Integer-encode the police district; the encoder is fitted on the training
# set and the same mapping is applied to the test set.
le1 = LabelEncoder().fit(train['PdDistrict'])
train['PdDistrict'] = le1.transform(train['PdDistrict'])
test['PdDistrict'] = le1.transform(test['PdDistrict'])

# Pull the target column out of the training frame and integer-encode it.
# le2 is kept so the class names can be recovered later.
le2 = LabelEncoder()
target = train.pop('Category')
y = le2.fit_transform(target)

# Creating Base Model  
## Forming the dataset

In [None]:
# Wrap the training frame and encoded labels in a LightGBM Dataset.
# 'PdDistrict' is declared categorical; free_raw_data=False is passed so the
# raw frame is not released after the Dataset is constructed.
categorical_columns = ['PdDistrict']
train_set = lgb.Dataset(
    data=train,
    label=y,
    categorical_feature=categorical_columns,
    free_raw_data=False,
)

## Setting the parameters

In [None]:
# Multiclass objective; the class count is taken from the fitted label
# encoder so it always matches the number of distinct target categories.
params = {
    'objective': 'multiclass',
    'num_class': len(le2.classes_),
}

## Cross-Validating

In [None]:
# Cross-validate with early stopping after 10 rounds without improvement.
# The callback form is used because the `early_stopping_rounds` keyword was
# removed from lgb.cv() in LightGBM 4.0; the callback works on older versions too.
cv_results = lgb.cv(
    params,
    train_set,
    metrics='multi_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

# LightGBM >= 4.0 prefixes result keys with 'valid ' (e.g.
# 'valid multi_logloss-mean'). Alias them to the unprefixed names so code
# reading 'multi_logloss-mean' works on either version.
for key in list(cv_results):
    if key.startswith('valid '):
        cv_results.setdefault(key[len('valid '):], cv_results[key])

In [None]:
# Lowest mean multi-class log-loss reached across the CV rounds.
best_score = min(cv_results['multi_logloss-mean'])
print('Best score: ', best_score)

In [None]:
# The CV history is 0-indexed by boosting round, so the number of rounds that
# produced the best score is the argmin position plus one. Cast to a plain
# int so a NumPy integer is not passed on as the round count.
num_boost_round = int(np.argmin(cv_results['multi_logloss-mean'])) + 1
print('Best epoch: ', num_boost_round)

## Training

In [None]:
# Fit the final booster on the full training set for the number of rounds
# selected from cross-validation.
bst = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=num_boost_round,
)

## Predicting

In [None]:
# Score the engineered test frame with the trained booster.
predictions = bst.predict(data=test)

## Submitting

In [None]:
# One column per original category name (recovered from the label encoder),
# one row per test Id; written out with the index labelled 'Id'.
submission = pd.DataFrame(
    data=predictions,
    index=test.index,
    columns=le2.classes_,
)
submission.to_csv('LGBM - base model.csv', index_label='Id')