In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import eli5
from eli5.sklearn import PermutationImportance
from lightgbm import LGBMClassifier

import shap
from shapely.geometry import  Point

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import geopandas as gpd

import os
import re
import ast
import csv
import pickle
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
     for filename in filenames:
        print(os.path.join(dirname, filename))


from hyperopt import hp, Trials, fmin, tpe, STATUS_OK

# Optional if you want to run it locally and inspect it in real time using Tensorboard
#from tensorboardX import SummaryWriter 

# CSV file that objective() appends one row to per completed evaluation.
out_file = 'LGB.csv'
MAX_EVALS = 5 #This has been set to a small number for demonstration. Increase it!
N_FOLDS = 5  # default number of cross-validation folds used by objective()
# Progress bar; objective() advances it once per successful evaluation.
pbar = tqdm(total=MAX_EVALS, desc="Hyperopt")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Zipped CSV inputs of the sf-crime dataset (read directly by pd.read_csv below).
TRAIN = '/kaggle/input/sf-crime/train.csv.zip'
TEST = '/kaggle/input/sf-crime/test.csv.zip'
SUBMISSION = '/kaggle/input/sf-crime/sampleSubmission.csv.zip'

In [None]:
# Load the sample submission, the training set and the test set (in that order).
submission, df_train, df_test = (
    pd.read_csv(path) for path in (SUBMISSION, TRAIN, TEST)
)

In [None]:
# Report the raw size of the training set.
print('train data shape:{}'.format(df_train.shape))
# Report how many rows are exact duplicates of an earlier row.
print(df_train.duplicated().sum())
# Tag each row with its position in the raw file.
df_train['row_index'] = df_train.reset_index().index

# Drop coordinate outliers (Y >= 60), then drop rows that repeat every
# original column; row_index is deliberately not part of the key.
df_train = (
    df_train.loc[df_train["Y"] < 60]
    .drop_duplicates(
        subset=[
            'Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
            'Resolution', 'Address', 'X', 'Y',
        ]
    )
)
print('train data shape:{}'.format(df_train.shape))

In [None]:
# Map an hour of the day to a morning / daytime / night label.
# The boundaries loosely follow the definition Panasonic uses for its smart
# appliances: https://panasonic.jp/pss/qa/answer167.html
def func_cate(x):
    """Return "朝" (morning), "昼" (daytime) or "夜" (night) for hour ``x``.

    Morning covers 03:00-10:59 and daytime 11:00-17:59; every other value,
    i.e. 18:00 through 02:59 of the next day, is labelled night.
    """
    if 3 <= x < 11:
        return "朝"
    if 11 <= x < 18:
        return "昼"
    return "夜"

# Parse the timestamps, then derive the hour and its time-of-day label.
df_train["Dates"] = pd.to_datetime(df_train['Dates'])
df_train["時刻"] = df_train['Dates'].dt.hour
df_train['朝昼晩区分'] = df_train['時刻'].map(func_cate)
print('train data shape:{}'.format(df_train.shape))
df_train.head()

### ■日付系の特徴量生成
データの最初の日からの日数、日、月、年、時、分、曜日

In [None]:
def feature_engineering_calendar(data, origin=None):
    """Add calendar features derived from the ``Dates`` column, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime64 ``Dates`` column. It is modified in place; an
        existing ``DayOfWeek`` column is overwritten with the integer weekday
        (Monday=0 ... Sunday=6).
    origin : datetime-like, optional
        Day-zero reference for ``n_days``. Defaults to the earliest ``Dates``
        value in ``data`` (the original behaviour). Pass the training set's
        earliest date when processing other frames so that ``n_days`` is
        measured on the same scale everywhere.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``n_days``, ``Day``, ``DayOfWeek``, ``Month``,
        ``Year`` and ``Minute`` columns set.
    """
    dates = data['Dates']
    origin = dates.min() if origin is None else pd.Timestamp(origin)

    # Vectorized day difference; avoids calling a Python lambda per row.
    data['n_days'] = (dates - origin).dt.days
    data['Day'] = dates.dt.day
    data['DayOfWeek'] = dates.dt.weekday
    data['Month'] = dates.dt.month
    data['Year'] = dates.dt.year
    data['Minute'] = dates.dt.minute

    return data

In [None]:
# Add the calendar features to the training frame and preview the result.
df_train = feature_engineering_calendar(df_train)
print('train data shape:{}'.format(df_train.shape))
df_train.head()

In [None]:
def feature_engineering_place(data):
    """Flag addresses that mention "block" (case-insensitive).

    Writes the flag to a ``Block`` column of ``data`` in place and returns the
    same frame.
    """
    mentions_block = data['Address'].str.contains('block', case=False)
    data['Block'] = mentions_block
    return data

In [None]:
# Add the address-based feature to the training frame and preview the result.
df_train = feature_engineering_place(df_train)
print('train data shape:{}'.format(df_train.shape))
df_train.head()

In [None]:
# Attach grid-cell ids to each row and count crimes per category in each cell.
def add_crime_hist(data, x_range, y_range):
    """Bin rows onto an X/Y grid and count crimes per (Year, cell, Category).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``X``, ``Y``, ``Year`` and ``Category`` columns. Modified in
        place: ``x_group``, ``y_group`` (left edge of the cell, as labelled by
        ``pd.cut``) and a constant ``count`` column are added.
    x_range, y_range : sequence of two floats
        ``[min, max]`` extent of the grid along each axis; the extent is split
        into 249 equal steps, so the maximum itself falls into a 250th cell.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``data`` itself, and a wide table with one row per
        (Year, x_group, y_group) and one count column per category
        (0 where a category never occurs in that cell).
    """
    n_cells = 250
    x_div = (x_range[1] - x_range[0]) / (n_cells - 1)
    y_div = (y_range[1] - y_range[0]) / (n_cells - 1)

    # Each axis uses its OWN step for the stop value (the Y axis previously
    # used x_div). The extra half step keeps the final edge from being dropped
    # by float rounding, which would leave the maximum coordinate unbinned.
    x_edges = np.arange(x_range[0], x_range[1] + 1.5 * x_div, x_div)
    y_edges = np.arange(y_range[0], y_range[1] + 1.5 * y_div, y_div)

    # pd.cut yields NaN for values outside the edges; map those to NaN
    # instead of failing on `.left`.
    data["x_group"] = [
        cell.left if isinstance(cell, pd.Interval) else np.nan
        for cell in pd.cut(data["X"], x_edges, right=False)
    ]
    data["y_group"] = [
        cell.left if isinstance(cell, pd.Interval) else np.nan
        for cell in pd.cut(data["Y"], y_edges, right=False)
    ]

    # Helper column so that a plain sum yields the number of rows per group.
    data["count"] = 1
    agg = data.groupby(
        ["Year", "Category", "x_group", "y_group"],
        as_index=False
    ).agg(
        {"count": "sum"}
    )
    # Long -> wide: one column per category, missing combinations become 0.
    agg = agg.pivot(
        index=["Year", 'x_group', 'y_group'], columns='Category', values='count'
    ).fillna(0).reset_index()

    return data, agg

In [None]:
# Build the grid over the training coordinates and count crimes per cell and
# category. Series.min()/max() replace the builtin min()/max(), which iterate
# element by element in Python and are order-dependent if a NaN is present.
x_range = [df_train["X"].min(), df_train["X"].max()]
y_range = [df_train["Y"].min(), df_train["Y"].max()]
df_train, xy_group = add_crime_hist(df_train, x_range, y_range)
xy_group.head()

In [None]:
# Turn the per-cell counts into per-category shares (each row sums to 1).
# A single vectorized division replaces the row-by-row Python apply.
wariai = xy_group.drop(["Year", 'x_group', 'y_group'], axis=1)
wariai = wariai.div(wariai.sum(axis=1), axis=0)
# Put the (Year, cell) keys back in front of the share columns.
wariai = pd.concat([xy_group[["Year", 'x_group', 'y_group']], wariai], axis=1)
wariai.head(3)

In [None]:
# Join the per-cell category shares onto each row, keyed by year and grid cell.
# NOTE(review): the shares are computed from these same rows' Category labels,
# so each row's own target contributes to its features - confirm this is intended.
df_train = pd.merge(df_train, wariai, on=["Year", 'x_group', 'y_group'], how='left')
print('df_train data shape:{}'.format(df_train.shape))
# fillna returns a new frame; the result was previously discarded.
df_train = df_train.fillna(0)
df_train.head(3)

## train作成

In [None]:
# Show which columns are available before choosing the model features.
df_train.columns

In [None]:
# Category count columns only; their names are used as the feature list below.
crime_hist = xy_group.drop(["Year", 'x_group', 'y_group'],axis=1)
# Target plus the features in use. Commented-out names are candidate features
# that are currently disabled. `.copy()` makes `train` an independent frame so
# that later column assignments do not write into a slice of df_train.
train = df_train[[
    "Category",
    'PdDistrict',
    '時刻',
    #'朝昼晩区分',
    #'n_days',
    #'Day',
    #'DayOfWeek',
    # 'Month',
    #'Year',
    #'Minute',
    'Block',
    *crime_hist.columns,
]].copy()
train.dtypes

In [None]:
# Encode the categorical feature. `assign` builds a new frame instead of
# writing into `train`, which may be a slice of df_train.
le1 = LabelEncoder()
train = train.assign(PdDistrict=le1.fit_transform(train['PdDistrict']))

# Encode the target; le2 is kept to map class ids back to category names.
le2 = LabelEncoder()
X = train.drop(columns=['Category'])
y = le2.fit_transform(train['Category'])

# Dataset used by the cross-validated hyperparameter search.
train_set = lgb.Dataset(X, label=y, categorical_feature=['PdDistrict'], free_raw_data=False)
train.head(3)

In [None]:
def param_flatten(d, params=None):
    """Flatten a dictionary that may contain nested dictionaries.

    Parameters
    ----------
    d : dict
        Possibly nested mapping. Keys whose value is a dict are dropped and
        replaced by that dict's (recursively flattened) entries.
    params : dict, optional
        Accumulator to write into. A new dict is created when omitted, so
        separate calls never share state (the previous ``params={}`` default
        was one dict reused by every call).

    Returns
    -------
    dict
        ``params`` updated with the flattened entries; a later key overwrites
        an earlier one with the same name.
    """
    if params is None:
        params = {}

    for key, value in d.items():
        if isinstance(value, dict):
            param_flatten(value, params)
        else:
            params[key] = value

    return params

In [None]:
def objective(params, n_folds=N_FOLDS):
    """Objective function for LightGBM hyperparameter optimization.

    Cross-validates one sampled parameter set on the module-level ``train_set``,
    appends the outcome to ``out_file`` and returns a hyperopt result dict.

    Parameters
    ----------
    params : dict
        Sampled point of the search space; may contain nested dicts.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    dict
        On success: ``loss`` (best mean multi-logloss), ``params``,
        ``iteration``, ``epochs`` (boosting rounds at the best loss) and
        ``status``. On any exception: ``{'status': 'fail'}``.

    Notes
    -----
    Increments the module-level ``ITERATION`` counter, which must already
    exist when this function is called.
    """
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # Flatten into a fresh dict so that the results of different evaluations
    # never end up referencing one shared params object.
    params = param_flatten(params, {})

    # hyperopt samples these as floats; LightGBM expects integers.
    for key in ('num_leaves', 'min_data_in_leaf'):
        if key in params:
            params[key] = int(params[key])

    print(params)

    # Perform n_folds cross validation.
    # If you download this notebook you can add callbacks=[logspy] to use Tensorboard
    # NOTE(review): newer LightGBM releases reportedly move early stopping to
    # callbacks and rename the result keys - confirm against the installed version.
    try:
        cv_results = lgb.cv(
            params,
            train_set,
            num_boost_round=100,
            nfold=n_folds,
            early_stopping_rounds=10,
            metrics='multi_logloss')

        mean_losses = cv_results['multi_logloss-mean']

        # Best (lowest) mean validation loss
        loss = min(mean_losses)
        print('loss: ',loss)

        # Number of boosting rounds at which that loss was reached
        epochs = np.argmin(mean_losses) + 1

        # Append the outcome; the context manager closes the handle, which was
        # previously left open on every evaluation.
        with open(out_file, 'a', newline='') as of_connection:
            csv.writer(of_connection).writerow([loss, params, ITERATION, epochs])

        pbar.update()

        # Dictionary with information for evaluation
        return {
            'loss': loss,
            'params': params,
            'iteration': ITERATION,
            'epochs': epochs,
            'status': STATUS_OK
        }
    except Exception as e:
        # Best effort: report the failure to hyperopt instead of aborting the search.
        print('EXCEPTION\n')
        print(e)
        return {'status': 'fail'}

In [None]:
# Search space: pick a boosting type, then sample that booster's own
# (separately labelled) tree-shape parameters.
gbdt_space = {
    'boosting': 'gbdt',
    'max_delta_step': hp.quniform('gbdt_max_delta_step', 0, 2, 0.1),
    'min_data_in_leaf': hp.quniform('gbdt_min_data_in_leaf', 10, 30, 1),
    'num_leaves': hp.quniform('gbdt_num_leaves', 20, 40, 1),
}
dart_space = {
    'boosting': 'dart',
    'max_delta_step': hp.quniform('dart_max_delta_step', 0, 2, 0.1),
    'min_data_in_leaf': hp.quniform('dart_min_data_in_leaf', 10, 30, 1),
    'num_leaves': hp.quniform('dart_num_leaves', 20, 40, 1),
}
space = {
    'boosting': hp.choice('boosting', [gbdt_space, dart_space]),
    'objective': 'multiclass',
    'num_class': 39,
}

In [None]:
def run_trials():
    """Run one more batch of trials and persist the Trials object.

    Saving after every batch is useful in case you need to interrupt the
    execution and continue from where you left off: a later call reloads
    "LGB.hyperopt" and extends the search by ``trials_step`` evaluations.
    """

    trials_step = 1  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    # SECURITY: pickle.load executes arbitrary code; only load a trials file
    # that this notebook wrote itself.
    try:  # try to load an already saved trials object, and increase the max
        with open("LGB.hyperopt", "rb") as f:
            trials = pickle.load(f)
    except FileNotFoundError:  # nothing saved yet: start a new search
        trials = Trials()
    except Exception as exc:  # unreadable file: start over, but say so
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        print("Could not load saved Trials ({}); starting a new search.".format(exc))
        trials = Trials()
    else:
        print("Found saved Trials! Loading...")
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(
            len(trials.trials), max_trials, trials_step))

    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=max_trials,
        trials=trials)

    print("Best:", best)

    # save the trials object
    with open("LGB.hyperopt", "wb") as f:
        pickle.dump(trials, f)

In [None]:
# #File to save first results

# of_connection = open(out_file, 'w')
# writer = csv.writer(of_connection)

# # Write the headers to the file
# writer.writerow(
#     ['loss', 'params', 'iteration', 'epochs'])
# of_connection.close()

In [None]:
# ITERATION = 0
# while ITERATION <= MAX_EVALS:
#     run_trials()
# pbar.close()

In [None]:
# Show the working directory, where relative output paths such as out_file resolve.
os.getcwd()

In [None]:
# trials = pickle.load(open("../input/sf-crime-prediction/LGB.hyperopt", "rb"))
# results = pd.DataFrame(trials.results)

# bayes_params = pd.DataFrame(columns = list(results.loc[0, 'params'].keys()),
#                             index = list(range(len(results))))

# # Add the results with each parameter a different column
# for i, params in enumerate(results['params']):
#     bayes_params.loc[i, :] = list(params.values())
    
# bayes_params['loss'] = results['loss']
# bayes_params['iteration'] = results['iteration']
# bayes_params.sort_values('loss', inplace=True)

# bayes_params.head()

In [None]:
# Fit the final booster on the full training matrix with fixed parameters.
params = dict(
    boosting='gbdt',
    objective='multiclass',
    num_class=39,
    max_delta_step=0.6,
    min_data_in_leaf=18,
    learning_rate=0.4,
    max_bin=465,
    num_leaves=20,
)

train_data = lgb.Dataset(X, label=y, categorical_feature=['PdDistrict'])
bst = lgb.train(params, train_data, num_boost_round=100)

In [None]:
# Inspect the feature importances of the fitted booster.
lgb.plot_importance(bst, height=0.5, figsize=(8, 16))

# Visualise the splits of a single decision tree;
# the second argument is the index of the tree to draw.
lgb.create_tree_digraph(bst, tree_index=1)

In [None]:
# Attach the grid-cell ids used for the crime-share lookup to unlabeled data.
def add_crime_hist_for_test(data, x_range, y_range):
    """Bin rows onto the X/Y grid defined by ``x_range`` / ``y_range``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``X`` and ``Y`` columns. Modified in place: ``x_group`` and
        ``y_group`` receive the left edge of the cell as labelled by
        ``pd.cut``, or NaN when the coordinate lies outside the grid.
    x_range, y_range : sequence of two floats
        ``[min, max]`` extent of the grid; pass the same values that were used
        to build the lookup table so that the cell ids match.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself.
    """
    n_cells = 250
    x_div = (x_range[1] - x_range[0]) / (n_cells - 1)
    y_div = (y_range[1] - y_range[0]) / (n_cells - 1)

    # Each axis uses its OWN step for the stop value (the Y axis previously
    # used x_div). The extra half step keeps the final edge from being dropped
    # by float rounding, which would leave the maximum coordinate unbinned.
    x_edges = np.arange(x_range[0], x_range[1] + 1.5 * x_div, x_div)
    y_edges = np.arange(y_range[0], y_range[1] + 1.5 * y_div, y_div)

    # Coordinates outside the grid come back from pd.cut as NaN; keep them as
    # NaN (no lookup match) instead of failing on `.left`.
    data["x_group"] = [
        cell.left if isinstance(cell, pd.Interval) else np.nan
        for cell in pd.cut(data["X"], x_edges, right=False)
    ]
    data["y_group"] = [
        cell.left if isinstance(cell, pd.Interval) else np.nan
        for cell in pd.cut(data["Y"], y_edges, right=False)
    ]

    return data

In [None]:
df_test = pd.read_csv(TEST)

# Outlier handling: blank out sentinel coordinates so they can be imputed later.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_test = df_test.replace({'X': -120.5, 'Y': 90.0}, np.nan)
# Series.max() skips the NaNs introduced above; the builtin max() is
# order-dependent when NaN is present.
# NOTE(review): this also blanks the largest remaining Y value - confirm intended.
df_test = df_test.replace({'X': -122.364751, 'Y': df_test["Y"].max()}, np.nan)

# Mean imputer; it is fitted per district in the next cell.
imp = SimpleImputer(strategy='mean')

print('test data shape:{}'.format(df_test.shape))
df_test.head()

In [None]:
# For every police district: fit the mean imputer on the training coordinates,
# then use it to fill the blanked-out coordinates of the test rows.
for district in df_train['PdDistrict'].unique():
    train_rows = df_train['PdDistrict'] == district
    test_rows = df_test['PdDistrict'] == district
    df_train.loc[train_rows, ['X', 'Y']] = imp.fit_transform(df_train.loc[train_rows, ['X', 'Y']])
    df_test.loc[test_rows, ['X', 'Y']] = imp.transform(df_test.loc[test_rows, ['X', 'Y']])

print('test data shape:{}'.format(df_test.shape))
df_test.head()

In [None]:
# ## 緯度経度の外れ値確認
# def create_gdf(df):
#     gdf = df.copy()
#     gdf['Coordinates'] = list(zip(gdf.X, gdf.Y))
#     gdf.Coordinates = gdf.Coordinates.apply(Point)
#     gdf = gpd.GeoDataFrame(
#         gdf, geometry='Coordinates', crs={'init': 'epsg:4326'})
#     return gdf

# train_gdf = create_gdf(df_test)

# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# ax = world.plot(color='white', edgecolor='black')
# train_gdf.plot(ax=ax, color='red')
# plt.show()

In [None]:
# Rebuild on the test set the same features that were built for training.
df_test["Dates"] = pd.to_datetime(df_test['Dates'])
df_test["時刻"] = df_test['Dates'].dt.hour
df_test['朝昼晩区分'] = df_test['時刻'].map(func_cate)
df_test = (
    df_test
    .pipe(feature_engineering_calendar)
    .pipe(feature_engineering_place)
    .pipe(add_crime_hist_for_test, x_range, y_range)
)

In [None]:
# Look up the per-cell category shares for each test row (year + grid cell).
df_test = df_test.merge(wariai, how='left', on=["Year", 'x_group', 'y_group'])
print('test data shape:{}'.format(df_test.shape))
df_test.head()

In [None]:
# Reuse the encoder fitted on the training districts. Refitting it here
# (fit_transform) could silently map districts to different integer codes
# than the ones the model was trained on.
df_test['PdDistrict'] = le1.transform(df_test['PdDistrict'])

# Rows whose 'VANDALISM' share is missing (presumably rows that found no
# match in the share lookup - confirm).
df_test_null = df_test[df_test['VANDALISM'].isnull()]

# Same feature columns, in the same order, as the training matrix. `.copy()`
# makes the selection an independent frame rather than a slice.
df_test = df_test[[
#     "Id",
    'PdDistrict',
    '時刻',
    #'朝昼晩区分',
    #'n_days',
    # 'Day',
    #'DayOfWeek',
    # 'Month',
    #'Year',
    #'Minute',
    'Block',
    *crime_hist.columns,
]].copy()
df_test.dtypes

In [None]:
print('---test data null number---')
print(df_test.isnull().sum())

# Replace missing values with zero. fillna returns a new frame, so the result
# must be assigned back (it was previously discarded, leaving the NaNs in place).
df_test = df_test.fillna(0)
print('---test data null number---')
print(df_test.isnull().sum())

print('df_test data shape:{}'.format(df_test.shape))
df_test.head()

In [None]:
# Predict class probabilities for every test row.
predictions = bst.predict(df_test)

# Wrap them in a frame whose columns are the category names of class ids 0..38.
predict_result = pd.DataFrame(
    predictions,
    index=df_test.index,
    columns=le2.inverse_transform(np.arange(39, dtype='int16')),
)

In [None]:
# Keep an untouched copy of the sample submission.
submission_raw = submission.copy()
# Replace every column after the first with the predicted probabilities.
# Assigning whole columns from a plain array is purely positional (no index
# alignment) and lets the columns take the float dtype, instead of writing
# floats into the sample file's placeholder columns through .iloc.
submission[submission.columns[1:]] = predict_result.to_numpy()
print('submission data shape:{}'.format(submission.shape))

In [None]:
# # ゼロ置換するやつ
# submission_all_zero = pd.merge(submission_raw, df_test_null[["Id"]], on="Id", how='inner')
# # anti join
# merged = pd.merge(submission,df_test_null[["Id"]], how='outer', indicator=True)
# merged = merged[merged['_merge'] == 'left_only']
# print('merged data shape:{}'.format(merged.shape))
# print('submission_all_zero data shape:{}'.format(submission_all_zero.shape))
# merged.head()

In [None]:
## ユニオン
# submission_final = submission_all_zero.append(merged.drop('_merge', axis=1), ignore_index=True)
# print('submission_final data shape:{}'.format(submission_final.shape))
# submission_final.head()

In [None]:
# Write the submission to CSV in the working directory (no index column).
submission.to_csv('LGBM_final.csv', index=False, header=True)