In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set();

import datetime
import re

import umap

In [None]:
# Locations of the competition archives inside the Kaggle input directory.
TRAIN = '/kaggle/input/sf-crime/train.csv.zip'
TEST = '/kaggle/input/sf-crime/test.csv.zip'

# Load both splits, train first and test second.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

In [None]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = df_train.shape, df_test.shape
print('train data shape:{}'.format(train_shape))
print('test data shape:{}'.format(test_shape))

In [None]:
# Show the column index of each split so the differences are visible side by side.
train_cols, test_cols = df_train.columns, df_test.columns
print('train data columns:{}'.format(train_cols))
print('test data columns:{}'.format(test_cols))

In [None]:
# Per-column count of missing values, first for train and then for test.
for banner, frame in (('---train data null number---', df_train),
                      ('---test data null number---', df_test)):
    print(banner)
    print(frame.isnull().sum())

In [None]:
# Column dtypes of the training frame as loaded from CSV.
df_train.dtypes

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Peek at the first rows of the training frame.
df_train.head()

In [None]:
# Names of the object-dtype columns, excluding the raw 'Dates' column.
is_object_col = df_train.dtypes == 'object'
cols_cat = df_train.columns[is_object_col].drop('Dates')

In [None]:
# Frequency table of every object-dtype column, each under its own banner line.
for col in cols_cat:
    banner = "------------" + col + "------------"
    counts = df_train[col].value_counts()
    print(banner)
    print(counts)

In [None]:
# Bar chart of incident counts per Category, most frequent category first.
plt.figure(figsize=(20, 10))

category_order = df_train['Category'].value_counts().index
ax = sns.countplot(x='Category', data=df_train, order=category_order)

# Tilt the category labels so they do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=60, ha="right")

plt.tight_layout()
plt.show()

In [None]:
def date_split(df):
    """Expand the ``Dates`` column into calendar and time-of-day features.

    ``Dates`` is parsed with ``pd.to_datetime`` and the columns ``Date``,
    ``Year``, ``Month``, ``Day``, ``Hour``, ``Minute``, ``Second`` and
    ``TimeGroup`` are added.

    ``TimeGroup`` buckets the hour of day:

    * 0 -- morning, 03:00 to 10:59
    * 1 -- daytime, 11:00 to 17:59
    * 2 -- night, every other hour (18:00 to 02:59)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Dates`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place.
    """
    df["Dates"] = pd.to_datetime(df["Dates"])

    stamps = df["Dates"].dt
    df["Date"] = stamps.date
    df["Year"] = stamps.year
    df["Month"] = stamps.month
    df["Day"] = stamps.day
    df["Hour"] = stamps.hour
    df["Minute"] = stamps.minute
    df["Second"] = stamps.second

    # Vectorised bucketing instead of a per-row Python callback.
    hour = df["Hour"]
    is_morning = (hour >= 3) & (hour < 11)
    is_daytime = (hour >= 11) & (hour < 18)
    # Anything that is neither morning nor daytime falls through to night (2).
    df["TimeGroup"] = np.select([is_morning, is_daytime], [0, 1], default=2).astype("int64")

    return df

In [None]:
# Add the date/time feature columns to both splits.
df_train, df_test = date_split(df_train), date_split(df_test)

In [None]:
# Column dtypes again, now including the columns added by date_split.
df_train.dtypes

In [None]:
# Non-null counts of every column for each (Category, TimeGroup) pair.
group_keys = ['Category', 'TimeGroup']
time_vs_cat = df_train.groupby(group_keys, as_index=False).count()

In [None]:
# Display the per-(Category, TimeGroup) count table.
time_vs_cat

In [None]:
# Reshape to TimeGroup rows x Category columns, using the 'Dates' count as the cell value;
# combinations that never occur become 0.
time_vs_cat_pv = (
    time_vs_cat
    .pivot(index='TimeGroup', columns='Category', values='Dates')
    .fillna(0)
)

In [None]:
# Heatmap of each category's share of incidents per TimeGroup (every column sums to 1).
fig, ax = plt.subplots(figsize=(50, 5))
category_share = time_vs_cat_pv.apply(lambda counts: counts / sum(counts), axis=0)
sns.heatmap(category_share, square=True, annot=True)

In [None]:
# Incident counts per hour of day, one line per category.
hour_category = df_train.loc[:, ['Hour', 'Category']]
pt = pd.pivot_table(
    hour_category,
    index="Hour",
    columns="Category",
    aggfunc=len,
    fill_value=0,
)
pt.plot(figsize=(30, 10))
# Put the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.01, 1.0), loc='upper left')

In [None]:
# Same hourly plot, restricted to ten hand-picked categories.
top10_cat_arr = [
    'LARCENY/THEFT', 'OTHER OFFENSES', 'NON-CRIMINAL', 'ASSAULT', 'DRUG/NARCOTIC',
    'VEHICLE THEFT', 'VANDALISM', 'WARRANTS', 'BURGLARY', 'SUSPICIOUS OCC',
]
hour_category = df_train.loc[:, ['Hour', 'Category']]
pt = pd.pivot_table(
    hour_category,
    index="Hour",
    columns="Category",
    aggfunc=len,
    fill_value=0,
)
pt.loc[:, top10_cat_arr].plot(figsize=(30, 10))
# Put the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.01, 1.0), loc='upper left')

In [None]:
# Density of the per-date row count (count of the first column within each Date group).
plt.figure(figsize=(20, 10))
incidents_per_day = df_train.groupby('Date').count().iloc[:, 0]
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` - confirm the installed version.
sns.kdeplot(incidents_per_day, shade=True)
plt.xlabel('Incidents')
plt.ylabel('Density')

In [None]:
# First rows of the training frame, including the date/time feature columns.
df_train.head()

In [None]:
# Try Adversarial Validation from this cell
# Format the column list into the message: `str + Index` concatenates element-wise
# and would prefix every single column name with the label.
print("Train columns: {}".format(list(df_train.columns)))
print("Test columns: {}".format(list(df_test.columns)))

In [None]:
# Drop Dates, Date, Descript, Resolution, Address from train;
# the test frame is only asked to drop Dates, Date and Address.
train_drop_cols = ['Dates', 'Date', 'Descript', 'Resolution', 'Address']
test_drop_cols = ['Dates', 'Date', 'Address']
df_train = df_train.drop(columns=train_drop_cols)
df_test = df_test.drop(columns=test_drop_cols)

In [None]:
# Format the column list into the message: `str + Index` concatenates element-wise
# and would prefix every single column name with the label.
print("Train columns: {}".format(list(df_train.columns)))
print("Test columns: {}".format(list(df_test.columns)))

In [None]:
# Separate the label column from the training features,
# and the row identifier from the test features.
TARGET = 'Category'
y_train = df_train[TARGET]
x_train = df_train.drop(columns=[TARGET])

id_test = df_test['Id']
x_test = df_test.drop(columns=['Id'])

In [None]:
cat_cols = ['DayOfWeek', 'PdDistrict'] # Columns to be one-hot-encoded
num_cols = ['X', 'Y'] # Columns to be standardized

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fit the scaler on the training rows only, then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(x_train[num_cols])
for frame in (x_train, x_test):
    frame[num_cols] = scaler.transform(frame[num_cols])

In [None]:
# Adversarial-validation target: 0 marks a training row, 1 marks a test row.
for frame, is_test_flag in ((x_train, 0), (x_test, 1)):
    frame['IsTest'] = is_test_flag

In [None]:
# Combine train data & test data, and one-hot-encode with pandas.get_dummies.
# ignore_index=True gives the stacked frame a fresh unique row index; without it the
# train and test row labels are simply stacked and repeat, which makes any later
# label-based selection or alignment ambiguous.
x_all = pd.concat([x_train, x_test], ignore_index=True)
x_all = pd.get_dummies(x_all, columns=cat_cols)

In [None]:
# Number of rows per IsTest label in the combined frame.
x_all['IsTest'].value_counts()

In [None]:
# Display the combined, one-hot-encoded frame.
x_all

In [None]:
# Pull the adversarial label out of the feature matrix.
adv_label = 'IsTest'
y_all = x_all[adv_label]
x_all = x_all.drop(columns=[adv_label])

In [None]:
# Per-column missing-value counts of the feature matrix, followed by the label series.
missing_per_column = x_all.isna().sum()
print(missing_per_column)
y_all

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hold out 30% of the combined rows to evaluate the train-vs-test classifier.
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(x_all, y_all, test_size=0.3, random_state=42, shuffle=True)

model = lgb.LGBMClassifier(
    n_estimators=1000,
    random_state=42
)

# Log the evaluation metric every 100 boosting rounds.
# LightGBM 4.0 removed the `verbose` argument of `fit`; the `log_evaluation` callback
# (available since 3.3) replaces it. Fall back to `verbose` on older installs.
if hasattr(lgb, 'log_evaluation'):
    log_kwargs = {'callbacks': [lgb.log_evaluation(period=100)]}
else:
    log_kwargs = {'verbose': 100}

model.fit(
    X_train_adv,
    y_train_adv,
    eval_set=[(X_train_adv, y_train_adv), (X_valid_adv, y_valid_adv)],
    eval_names=['train', 'valid'],
    eval_metric='auc',
    **log_kwargs)

In [None]:
# Label balance of the adversarial training and validation splits.
for adv_labels in (y_train_adv, y_valid_adv):
    print(adv_labels.value_counts())

In [None]:
# AUC recorded during fitting for the 'train' and 'valid' evaluation sets.
eval_history = model.evals_result_
ax = lgb.plot_metric(eval_history, metric='auc')
plt.show()

In [None]:
# Compute the feature importances (collected into a DataFrame).
# Feature names are taken from X_train_adv, the frame the current `model` was fitted on,
# so names and importances stay aligned even if this cell is re-run after the model
# has been refitted on a reduced feature set.
feature_imp = pd.DataFrame(
    sorted(zip(model.feature_importances_, X_train_adv.columns)),
    columns=['Value', 'Feature'],
)

# Normalise so that the importances sum to 1.
feature_imp['Value'] = feature_imp['Value'] / feature_imp['Value'].sum()

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# A single model is fitted here (no cross-validation folds), so the title says so.
plt.title('LightGBM feature importance (adversarial validation)')
plt.tight_layout()
plt.show()

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_train['Day'])

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_test['Day'])

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_train['Month'])

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_test['Month'])

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_train['Year'])

In [None]:
# Pass the series as `x=`: recent seaborn treats the first positional argument as `data`.
sns.countplot(x=x_test['Year'])

In [None]:
# Feature matrix without the Day and Month columns.
x_all_2 = x_all.drop(columns=['Day', 'Month'])

In [None]:
# Retry model fitting on the feature matrix without Day and Month
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(x_all_2, y_all, test_size=0.3, random_state=42, shuffle=True)

model = lgb.LGBMClassifier(
    n_estimators=1000,
    random_state=42
)

# Log the evaluation metric every 100 boosting rounds.
# LightGBM 4.0 removed the `verbose` argument of `fit`; the `log_evaluation` callback
# (available since 3.3) replaces it. Fall back to `verbose` on older installs.
if hasattr(lgb, 'log_evaluation'):
    log_kwargs = {'callbacks': [lgb.log_evaluation(period=100)]}
else:
    log_kwargs = {'verbose': 100}

model.fit(
    X_train_adv,
    y_train_adv,
    eval_set=[(X_train_adv, y_train_adv), (X_valid_adv, y_valid_adv)],
    eval_names=['train', 'valid'],
    eval_metric='auc',
    **log_kwargs)

In [None]:
# AUC recorded during the refit for the 'train' and 'valid' evaluation sets.
eval_history = model.evals_result_
ax = lgb.plot_metric(eval_history, metric='auc')
plt.show()

In [None]:
# Split the reduced feature matrix back into its train rows (label 0) and test rows (label 1).
from_train = y_all == 0
from_test = y_all == 1
x_train = x_all_2.loc[from_train]
x_test = x_all_2.loc[from_test]

In [None]:
## Memory error
# um = umap.UMAP()
# um.fit(x_train)

### Ref: Adversarial Validation
- https://www.acceluniverse.com/blog/developers/2020/01/kaggleadversarial-validation.html
- https://qiita.com/shota-imazeki/items/6f48c78edf0ce3b316e1

In [None]:
# # Legacy Address preprocessing experiments
# df_train['Address'].str.contains('block', case=False)

# address = df_train['Address'].value_counts()

# address.index

# add_arr = [re.split('of|/', s.lower()) for s in np.array(address.index)]

# add_arr[0][0]

# target = 'block'
# t_idx = add_arr[0][0].find(target)
# print(t_idx)
# add_arr[0][0][:t_idx]

# target = 'block'
# [[s[:s.find(target)] for s in s_list] for s_list in add_arr]

# target = 'block|st|av|ln'
# [[s[:re.search(target, s).start()] if re.search(target, s) is not None else s for s in s_list] for s_list in add_arr]

# re.search(target, 'aaa') is not None

# # Word2Vec Try
# sentences = []
# for s in df_train["Address"]:
#     sentences.append(s.split(" "))
# address_model = gensim.models.Word2Vec(sentences, min_count=1)
# encoded_address = np.zeros((df_train.shape[0], 100))
# for i in range(len(sentences)):
#     for j in range(len(sentences[i])):
#         encoded_address[i] += address_model.wv[sentences[i][j]]
#     encoded_address[i] /= len(sentences[i])

# encoded_address.shape