# Notebook概要
- 社内研修「AI基礎研修」用ノートブック
- 研修参加者以外への共有はお控えください

# 共通事前準備

In [None]:
# 日本語表示用フォントインストール
!apt-get -qy install fonts-ipafont-gothic
!rm /root/.cache/matplotlib/fontlist-v310.json

# 決定木可視化用ライブラリ
!pip -q install dtreeviz
!apt-get -qy install pdf2svg
!apt-get -qy install graphviz

# 可視化用ライブラリ
!pip -q install mglearn

* 上記セルを実行後ランタイムを再起動してください

In [None]:
# 日本語フォント設定
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'IPAPGothic'

# 一般的な分析用ライブラリ
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import multivariate_normal
import mglearn

# 機械学習用ライブラリ
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# ディープラーニング用ライブラリ
import keras
from keras.utils.vis_utils import model_to_dot
from keras.utils.vis_utils import plot_model
from IPython.display import SVG
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')

# 計算時間計測用
import time

%matplotlib inline

# 回帰

|カラム名（英）|カラム名（日）|
|---|---|
|CRIM|人口 1 人当たりの犯罪発生数|
|ZN|25,000 平方フィート以上の住居区画の占める割合|
|INDUS|小売業以外の商業が占める面積の割合|
|CHAS|チャールズ川によるダミー変数 (1: 川の周辺, 0: それ以外)|
|NOX|NOx の濃度|
|RM|住居の平均部屋数|
|AGE|1940 年より前に建てられた物件の割合|
|DIS|5 つのボストン市の雇用施設からの距離 (重み付け済)|
|RAD|環状高速道路へのアクセスしやすさ|
|TAX|10,000 ドルあたりの不動産税率の総計|
|PTRATIO|町毎の児童と教師の比率|
|B|町毎の黒人 (Bk) の比率を次の式で表したもの。 1000(Bk – 0.63)^2|
|LSTAT|給与の低い職業に従事する人口の割合 (%)|

## データの読み込み

In [None]:
# Load the Boston housing data and append the target as the last column
boston = load_boston()
df_boston = pd.DataFrame(boston.data, columns=boston.feature_names).assign(target=boston.target)

# Japanese display names, in the same order as the DataFrame columns
# (13 features followed by the house price target)
columns_jp = [
    '犯罪発生数',
    '25,000平方f以上住居の割合',
    '小売業以外商業の割合',
    'チャールズ川周辺',
    'NOx濃度',
    '平均部屋数',
    '1940年以前割合',
    '雇用施設の距離',
    '高速道路アクセス',
    '不動産税率',
    '児童教師比率',
    '黒人比率',
    '低所得職業比率',
    '住宅価格',
]
df_boston.columns = columns_jp
print(f'shape: {df_boston.shape}\n')

# Features are every column except the house price; the price is the target
X = df_boston.drop(columns='住宅価格')
y = df_boston['住宅価格']

# Hold out 10% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

df_boston.head(20)

## データの可視化

### 特徴量別に統計量を見てみよう

In [None]:
# Show summary statistics for every column of the housing data
df_boston.describe()

### 特徴量別にデータの分布を見てみよう

In [None]:
# Prepare a 4x4 grid of axes; the panels are used in row-major order
fig, axes = plt.subplots(4, 4, dpi=80, figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5, top=0.92)
panels = axes.flatten()

# One 20-bin histogram per column, all with the same y-range for comparison
for ax, column in zip(panels, df_boston.columns):
    ax.hist(df_boston[column], bins=20)
    ax.set_title(column)
    ax.grid()
    ax.set_ylim(0,500)

# Hide the panels left over after the last column
for ax in panels[df_boston.shape[1]:]:
    ax.axis("off")

plt.show()

### どの特徴量が効いていそうか

In [None]:
# 4x4 grid: each column of df_boston plotted against the house price
fig, axes = plt.subplots(4, 4, dpi=80, figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.6, top=0.92)
panels = axes.flatten()
price = df_boston['住宅価格']

for ax, column in zip(panels, df_boston.columns):
    ax.scatter(df_boston[column], price, marker='.', alpha=0.5)
    ax.set_xlabel(column)
    ax.set_ylabel('住宅価格')
    ax.grid()

# Hide the panels left over after the last column
for ax in panels[df_boston.shape[1]:]:
    ax.axis("off")

plt.show()

### 線形回帰してみる

In [None]:
# Average room count as a single-column feature matrix, house price as target
X_reg = df_boston['平均部屋数'].to_numpy().reshape(-1, 1)
y_reg = df_boston['住宅価格']

# The model is fitted here but not drawn; this cell shows only the raw scatter
reg = LinearRegression()
reg.fit(X_reg, y_reg)

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
ax.scatter(X_reg, y_reg, alpha=0.5)
ax.set_xlabel('平均部屋数')
ax.set_ylabel('住宅価格')
ax.grid()
plt.show()

In [None]:
def boston_linear_regression(feature, target='住宅価格'):
    """Plot one df_boston column against a target with its fitted regression line.

    Fits a LinearRegression on the single feature, then draws the scatter of
    the data and the fitted line on one figure.

    Args:
        feature: Name of the df_boston column used as the explanatory variable.
        target: Name of the df_boston column to predict. Defaults to the house
            price column, so existing one-argument calls behave as before.
    """
    X_reg = df_boston[feature].to_numpy().reshape(-1, 1)
    y_reg = df_boston[target]
    reg = LinearRegression().fit(X_reg, y_reg)

    # A straight line only needs its two end points; using min and max makes
    # the line span the full data range (the old grid excluded the max value)
    line = np.array([[X_reg.min()], [X_reg.max()]])

    fig, ax = plt.subplots(figsize=(10,5), dpi=160)
    ax.scatter(X_reg, y_reg, alpha=0.5)
    ax.plot(line.ravel(), reg.predict(line), label='線形回帰', c='k', alpha=0.5)
    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    ax.legend()
    ax.grid()
    plt.show()

In [None]:
# Regress the house price on the average number of rooms
boston_linear_regression('平均部屋数')

In [None]:
# Regress the house price on the pupil-teacher ratio
boston_linear_regression('児童教師比率')

### 2変数で見てみる（3D）

In [None]:
# Two explanatory variables (x, y) and the house price (z).
# The previous meshgrid/X_plot arrays were never used and only cost memory,
# so they are no longer built.
x_ = np.array(df_boston['低所得職業比率']).reshape(-1,1)
y_ = np.array(df_boston['平均部屋数']).reshape(-1,1)
z_ = df_boston['住宅価格']

fig = plt.figure(dpi=160, figsize=(10,5))
ax = fig.add_subplot(111, projection='3d')
ax.scatter3D(x_, y_, z_, alpha=0.3)
ax.set_title("平均部屋数・低所得職業比率 と 住宅価格の関係")
ax.set_xlabel('低所得職業比率')
ax.set_ylabel('平均部屋数')
ax.set_zlabel('住宅価格')
plt.show()

### 2変数で見てみる

In [None]:
# Bubble chart: position encodes the two features, marker area encodes price
low_income = np.array(df_boston['低所得職業比率']).reshape(-1,1)
rooms = np.array(df_boston['平均部屋数']).reshape(-1,1)
price = df_boston['住宅価格'] * 3  # scaled up so size differences are visible

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
# No `cmap` here: without a `c` argument matplotlib ignores the colormap, so
# the old cmap='winter' had no effect on the figure
ax.scatter(low_income, rooms, s=price, alpha=0.4, label='住宅価格')
ax.set_xlabel('低所得職業比率')
ax.set_ylabel('平均部屋数')
ax.legend()
ax.grid(color='gray', alpha=0.2)
plt.show()

## 決定木回帰で予測

### 決定木回帰のモデル

In [None]:
# Explicit imports instead of wildcard imports, so it is clear which names
# this cell relies on. DecisionTreeRegressor is used directly rather than via
# the `sklearn.tree` module, because a later cell rebinds the name `tree`.
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import dtreeviz
import graphviz  # not referenced below; kept from the original cell

# Load the data once instead of once per depth
boston = load_boston()

# Fit and visualise regression trees of increasing depth (1 to 4)
for depth in range(1, 5):
    regr = DecisionTreeRegressor(max_depth=depth)
    regr.fit(boston.data, boston.target)

    viz = dtreeviz(regr,
                   boston.data,
                   boston.target,
                   target_name='price',
                   feature_names=boston.feature_names)

    print(f'★★★ depth: {depth}\n\n\n')
    display(viz)
    plt.show()
    # Keep an SVG copy of every tree
    viz.save(f"decision_tree_regressor_depth_{depth}.svg")

### 特徴量別重要度

In [None]:
# Fit a regression tree with default settings on the training split
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
feature_importances = tree.feature_importances_

# Importance per feature, sorted ascending so the largest bar ends up on top
df_tree = pd.DataFrame({'重要度': feature_importances}, index=X.columns).sort_values(by='重要度')

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
ax.barh(df_tree.index, df_tree['重要度'])
ax.set_title('特徴量別重要度')

plt.show()

## いろいろなアルゴリズム

### モデル別精度

In [None]:
np.random.seed(0)

# Regressors to compare, keyed by the name shown on the chart
regressors = {
    '線形回帰': LinearRegression(),
    '決定木回帰': DecisionTreeRegressor(random_state=0, max_depth=4, max_features='sqrt'),
    'SVM(rbf)': SVR(kernel='rbf', C=1e3, gamma=0.1),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

# Every label in this notebook calls the score MAE, so score with the mean
# absolute error (previously the mean squared error was computed instead)
scoring = 'neg_mean_absolute_error'

result_list = list()

for regressor_name, regressor in regressors.items():
    # Time the whole 5-fold cross-validation for this model
    start = time.time()
    scores = cross_val_score(regressor, X, y, cv=5, scoring=scoring, n_jobs=-1)
    elapsed_time = time.time() - start
    # The scorer returns negated errors; abs() turns the mean back into an error
    result_list.append([regressor_name, abs(round(np.mean(scores),2)), round(elapsed_time, 2)])

df_result = pd.DataFrame(result_list, columns=['estimator', 'MAE', 'processing_time']).set_index('estimator').sort_values('MAE')

# Score per model, best (lowest error) first
fig, ax = plt.subplots(figsize=(10,5), dpi=160)
index = df_result.index
values = df_result.MAE
ax.bar(index, values)
ax.set_title('モデル別スコア（MAE）')
plt.show()

### 計算コストとスコアのトレードオフ

In [None]:
# Elapsed time (x) against score (y), one point per model
x_ = df_result['processing_time']
y_ = df_result['MAE']
names = df_result.index

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
ax.scatter(x_, y_)

ax.set_title('モデル別スコア（MAE）・計算時間')
ax.set_xlabel('計算時間 [s]')
ax.set_ylabel('スコア（MAE）[$1,000]')

# Label every point with its model name
for elapsed, score, name in zip(x_, y_, names):
    ax.annotate(name, xy=(elapsed, score))

ax.grid()
plt.show()

## グリッドサーチ

In [None]:
np.random.seed(10)

# Candidate tree depths; None lets the tree grow without a depth limit
tuned_parameters = [
    {'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]}
]
clf = GridSearchCV(
    DecisionTreeRegressor(),  # estimator to tune (a regressor)
    tuned_parameters,  # parameter grid to search
    cv=5,  # number of cross-validation folds
    # Mean absolute error, to match the MAE labels on the chart
    # (previously the mean squared error was computed instead)
    scoring='neg_mean_absolute_error')

clf.fit(X_train, y_train)

# One row per candidate: its parameters plus the absolute mean test score
df_params = pd.DataFrame(clf.cv_results_['params'])
df_scores = abs(pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['MAE']))
df_grid_search = pd.concat([df_params, df_scores], sort=False, axis=1).set_index('max_depth')

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
# The label gives ax.legend() an entry to show
ax.plot(df_grid_search.index, df_grid_search['MAE'], marker='o', label='MAE')
ax.set_title('max_depth 別MAE')
ax.set_xlabel('max_depth')
ax.set_ylabel('MAE')
ax.legend()
ax.grid()
plt.show()

# 分類

## データの読み込み

### 各項目の説明

|カラム名（英）|カラム名（日）|
|---|---|
|mean radius|平均半径|
|mean texture|テクスチャをグレースケールにした際の平均|
|mean perimeter|平均外周の長さ|
|mean area|平均面積|
|mean smoothness|平均なめらかさ（半径の分散）|
|mean compactness|外周長さ^2 / 面積 - 1|
|mean concavity|輪郭の凹部の重要度の平均|
|mean concave points|輪郭の凹部の数の平均|
|mean symmetry|対称性|
|mean fractal dimension|フラクタル次元の平均|
|radius error|半径誤差|
|texture error|テクスチャの誤差|
|perimeter error|外周の誤差|
|area error|面積の誤差|
|smoothness error|なめらかさの誤差|
|compactness error|コンパクトさの誤差|
|concavity error|輪郭の凹部の重要度の誤差|
|concave points error|輪郭の凹部の数の誤差|
|symmetry error|対称性の誤差|
|fractal dimension error|フラクタル次元の誤差|
|worst radius|半径最悪値|
|worst texture|テクスチャ最悪値|
|worst perimeter|外周の長さ最悪値|
|worst area|面積の最悪値|
|worst smoothness|なめらかさの最悪値|
|worst compactness|コンパクトさの最悪値|
|worst concavity|輪郭の凹部の重要度の最悪値|
|worst concave points|輪郭の凹部の数の最悪値|
|worst symmetry|対称性の最悪値|
|worst fractal dimension|フラクタル次元の最悪値|

### データの中身確認

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer data: features as a DataFrame, labels as an array
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

# Hold out 10% for testing. The split happens before the columns are renamed,
# so X_train / X_test keep the original English feature names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Copy of the features with the label appended as a 'target' column
df_cancer = X.assign(target=y)

# Japanese display names, in the same order as the feature columns
column_jp = [
    # mean values
    '平均半径',
    'グレースケール時の平均',
    '平均外周の長さ',
    '平均面積',
    '平均なめらかさ（半径の分散）',
    '外周長さ^2 / 面積 - 1',
    '輪郭の凹部の重要度の平均',
    '輪郭の凹部の数の平均',
    '対称性',
    'フラクタル次元の平均',
    # error values
    '半径誤差',
    'テクスチャの誤差',
    '外周の誤差',
    '面積の誤差',
    'なめらかさの誤差',
    'コンパクトさの誤差',
    '輪郭の凹部の重要度の誤差',
    '輪郭の凹部の数の誤差',
    '対称性の誤差',
    'フラクタル次元の誤差',
    # worst values
    '半径最悪値',
    'テクスチャ最悪値',
    '外周の長さ最悪値',
    '面積の最悪値',
    'なめらかさの最悪値',
    'コンパクトさの最悪値',
    '輪郭凹部重要度の最悪値',
    '輪郭の凹部の数の最悪値',
    '対称性の最悪値',
    'フラクタル次元の最悪値',
]

X.columns = column_jp
X.head(10)

### 特徴量別のヒストグラム確認

In [None]:
# Rename df_cancer to the Japanese feature names plus the diagnosis label.
# A new list is built instead of extending column_jp in place, so re-running
# this cell no longer grows the list and breaks the column assignment.
df_cancer.columns = column_jp + ['診断']

fig, axes = plt.subplots(4, 8, dpi=80, figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5, top=0.92)

# One 20-bin histogram per column, all with the same y-range
for i, column in enumerate(df_cancer.columns):
    ax = axes[i//8, i%8]
    ax.hist(df_cancer[column], bins=20)
    ax.set_title(column)
    ax.grid()
    ax.set_ylim(0,400)

# Hide the panels left over after the last column
for i in range(df_cancer.shape[1], axes.flatten().shape[0]):
    ax = axes[i//8, i%8]
    ax.axis("off")

plt.show()

### 特徴量別の診断結果別ヒストグラムの確認

In [None]:
# Bin edges per column, computed over all rows so that both diagnosis groups
# are drawn on identical bins. np.histogram_bin_edges replaces the throwaway
# figure that was previously drawn only to read the bins back.
bins_list = [
    np.histogram_bin_edges(df_cancer[column].to_numpy(), bins=20)
    for column in df_cancer.columns
]

# Split the rows once per diagnosis value instead of once per panel
groups = {diagnosis: df_cancer[df_cancer['診断'] == diagnosis] for diagnosis in [0, 1]}

# Draw the histograms
fig, axes = plt.subplots(4, 8, dpi=80, figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5, top=0.92)
fig.patch.set_facecolor('white')

for i, column in enumerate(df_cancer.columns):
    ax = axes[i//8, i%8]
    # Overlay the two diagnosis groups (0 first, then 1) on the same axes
    for diagnosis, df_plot in groups.items():
        ax.hist(df_plot[column].values.flatten(), alpha=0.5, bins=bins_list[i], label=diagnosis)
    ax.set_xlabel(column)
    ax.legend()

# Hide the panels left over after the last column
for i in range(df_cancer.shape[1], axes.flatten().shape[0]):
    ax = axes[i//8, i%8]
    ax.axis("off")

plt.show()

### 外周の長さ最悪値のヒストグラム

In [None]:
column = '外周の長さ最悪値'

# Bin edges over all rows, so both diagnosis groups share the same bins
# (computed directly instead of drawing and clearing a throwaway figure)
bins = np.histogram_bin_edges(df_cancer[column].to_numpy(), bins=20)

# Draw the histogram
fig, ax = plt.subplots(dpi=160, figsize=(10,5))
fig.patch.set_facecolor('white')

# Overlay the two diagnosis groups on the same axes
for diagnosis in [0, 1]:
    df_plot = df_cancer.query('診断 == @diagnosis')
    ax.hist(df_plot[column].values.flatten(), alpha=0.5, bins=bins, label=diagnosis)

ax.set_xlabel(column)
ax.legend()

plt.show()

### 外周の長さ最悪値と何かを使って...

In [None]:
# Scatter of every feature against the worst perimeter, coloured by diagnosis
base_feature = '外周の長さ最悪値'
skipped = ['診断', base_feature]
all_columns = list(df_cancer.columns)

fig, axes = plt.subplots(4, 8, dpi=80, figsize=(30,15))
fig.subplots_adjust(wspace=0.4, hspace=0.4, top=0.92)
fig.patch.set_facecolor('white')

for position, ax in enumerate(axes.flatten()):
    # Panels past the last column, and the panels for the label and the base
    # feature itself, carry no plot and are hidden
    if position >= len(all_columns) or all_columns[position] in skipped:
        ax.axis("off")
        continue
    column = all_columns[position]
    sns.scatterplot(x="外周の長さ最悪値", y=column, hue="診断", data=df_cancer[[column, '外周の長さ最悪値', '診断']], ax=ax, marker='.', alpha=0.5)

plt.show()

### 2特徴量で分離

In [None]:
# Worst perimeter against worst concave-point count, coloured by diagnosis
pair_data = df_cancer[['輪郭の凹部の数の最悪値', '外周の長さ最悪値', '診断']]

fig, ax = plt.subplots(dpi=160, figsize=(10,5))
fig.patch.set_facecolor('white')
sns.scatterplot(
    x="外周の長さ最悪値",
    y='輪郭の凹部の数の最悪値',
    hue="診断",
    data=pair_data,
    ax=ax,
    marker='.',
    alpha=0.5,
)

plt.show()

## 決定木で予測

### 特徴量別重要度

In [None]:
# Fit a classification tree with default settings on the training split
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
feature_importances = tree.feature_importances_

# Importances are indexed by X.columns, which lists the same features in the
# same order as the training columns; sorted ascending for the bar chart
df_tree = pd.DataFrame({'重要度': feature_importances}, index=X.columns).sort_values(by='重要度')

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
ax.barh(df_tree.index, df_tree['重要度'])
ax.set_title('特徴量別重要度')

plt.show()

### 決定木作成過程

In [None]:
# Imported once, outside the loop
from dtreeviz.trees import dtreeviz

# Fit and visualise classification trees of increasing depth (1 to 5)
for depth in range(1, 6):
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)

    viz = dtreeviz(
        clf,
        X_train,
        y_train,
        target_name='診断',
        feature_names=X_train.columns,
        # Names are listed in class order (class 0 first), so that each class
        # is labelled with its own value; [1, 0] labelled them the wrong way
        # round. NOTE(review): confirm against the installed dtreeviz version.
        class_names=[0, 1],
    )

    display(viz)
    # Keep an SVG copy of every tree
    viz.save(f"decision_tree_classifier_depth_{depth}.svg")

## 様々なアルゴリズム

In [None]:
# Classifier implementations compared in this cell
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

np.random.seed(0)

# Display names and estimators, kept as two parallel lists in the same order
classifier_names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
    "Naive Bayes", "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# Evaluation settings shared by every model: shuffled 10-fold CV scored by recall.
# NOTE(review): 'recall' is computed for the class labelled 1 — confirm that
# this is the class the charts mean by "detection rate".
scoring = 'recall'
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

result_list = []
for classifier_name, classifier in zip(classifier_names, classifiers):
    # The RBF SVM is left out of the comparison
    if classifier_name != "RBF SVM":
        print(f'▼計算中：{classifier_name}')
        start = time.time()
        scores = cross_val_score(classifier, X, y, cv=kfold, scoring=scoring, n_jobs=-1)
        elapsed_time = time.time() - start
        result_list.append([classifier_name, abs(round(np.mean(scores),4)), round(elapsed_time, 2)])
        print(f'{round(elapsed_time, 2)}秒\n')

# Collect the results, sorted by score
df_result = (
    pd.DataFrame(result_list, columns=['estimator', scoring, 'processing_time'])
    .set_index('estimator')
    .sort_values(scoring)
)

# Bar chart of the score per model
fig, ax = plt.subplots(figsize=(20,10), dpi=80)
index = df_result.index
values = df_result[scoring]
ax.bar(index, values)
ax.set_title('モデル別スコア')
ax.set_ylim(0.9,1.0)
ax.grid()
plt.show()

In [None]:
# Processing time (x) against score (y), one point per model
x_ = df_result['processing_time']
y_ = df_result[scoring]
names = df_result.index

fig, ax = plt.subplots(figsize=(10,5), dpi=160)
ax.scatter(x_, y_)
ax.set_title('モデル別処理時間 - スコア（検出率）')
ax.set_xlabel('処理時間 [s]')
ax.set_ylabel('スコア（検出率）')

# Label every point with its model name
for elapsed, score, name in zip(x_, y_, names):
    ax.annotate(name, xy=(elapsed, score))

ax.grid()
plt.show()

# クラスタリング

### データ読み込み

In [None]:
# Credit-card customer data, read from a fork of the original repository
# (original: https://raw.githubusercontent.com/lokesharma92/CreditCard_Segmentation/master/CC%20GENERAL.csv)
CC_URL = 'https://raw.githubusercontent.com/aknr-t/CreditCard_Segmentation/master/CC%20GENERAL.csv'
df_cc = pd.read_csv(CC_URL)

# Replace the English headers with Japanese names (same column order)
df_cc.columns = [
    '顧客ID',
    '残高',
    '残高更新頻度',
    '購入額',
    '一括購入額',
    '分割購入額',
    'キャッシング額',
    '購入頻度',
    '一括購入頻度',
    '分割購入頻度',
    'キャッシング頻度',
    'キャッシングトランザクション数',
    '購入トランザクション数',
    '限度額',
    '返済総額',
    '最小返済額',
    '返済割合',
    '返済期間',
]
print(df_cc.shape)

# Drop rows with missing values, drop the ID column and renumber the rows
df_cc = df_cc.dropna().drop(columns='顧客ID').reset_index(drop=True)
df_cc.head(10)

### データ説明

|カラム名（英）|カラム名（日）|詳細|
|---|---|---|
|CUST_ID |顧客ID| Identification of Credit Card holder (Categorical)|
|BALANCE |残高| Balance amount left in their account to make purchases|
|BALANCE_FREQUENCY |残高更新頻度| How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)|
|PURCHASES |購入額| Amount of purchases made from account|
|ONEOFF_PURCHASES |一括購入額| Maximum purchase amount done in one-go|
|INSTALLMENTS_PURCHASES |分割購入額| Amount of purchase done in installment|
|CASH_ADVANCE |キャッシング額| Cash in advance given by the user|
|PURCHASES_FREQUENCY |購入頻度| How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)|
|ONEOFFPURCHASESFREQUENCY | 一括購入頻度|How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)|
|PURCHASESINSTALLMENTSFREQUENCY |分割購入頻度| How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)|
|CASHADVANCEFREQUENCY | キャッシング頻度|How frequently the cash in advance being paid|
|CASHADVANCETRX | キャッシングトランザクション数|Number of Transactions made with "Cash in Advanced"|
|PURCHASES_TRX | 購入トランザクション数|Number of purchase transactions made|
|CREDIT_LIMIT | 限度額|Limit of Credit Card for user|
|PAYMENTS |返済総額| Amount of Payment done by user|
|MINIMUM_PAYMENTS | 最小返済額|Minimum amount of payments made by user|
|PRCFULLPAYMENT | 返済割合 |Percent of full payment paid by user|
|TENURE| 返済期間 |Tenure of credit card service for user|

### 特徴量別ヒストグラム

In [None]:
# Fill missing values in these two columns with the column mean.
# (Rows with missing values are already dropped when the data is loaded, so
# this is only a safeguard.)
for column in ['最小返済額', '限度額']:
    df_cc[column] = df_cc[column].fillna(df_cc[column].mean())

fig, axes = plt.subplots(4, 5, dpi=120, figsize=(20,10))
fig.subplots_adjust(wspace=0.3, hspace=0.5, top=0.92)
panels = axes.flatten()

# One 20-bin histogram per column, all with the same y-range
for ax, column in zip(panels, df_cc.columns):
    ax.hist(df_cc[column], bins=20)
    ax.set_title(column)
    ax.grid()
    ax.set_ylim(0,10000)

# Hide the panels left over after the last column
for ax in panels[df_cc.shape[1]:]:
    ax.axis("off")

plt.show()

### 散布図行列

In [None]:
# NOTE: this takes a long time to run, so it is explained on the slides instead
# Scatter-plot matrix over all columns of df_cc
sns.pairplot(df_cc)
plt.show()

### KMeans

In [None]:
# Draw mglearn's built-in illustration of the k-means algorithm
mglearn.plots.plot_kmeans_algorithm()

### KMeansを適用

In [None]:
# Standardise every column before clustering
X = np.asarray(df_cc)
scale = StandardScaler()
X = scale.fit_transform(X)

# Within-cluster sum of squares (inertia) for 1 to 29 clusters
n_clusters = 30
cluster_counts = range(1, n_clusters)
cost = []
for k in cluster_counts:
    kmean = KMeans(k, random_state=0)
    kmean.fit(X)
    cost.append(kmean.inertia_)

# Plot against the actual cluster counts; plotting `cost` alone put the
# values at x = 0..28, one less than the number of clusters they belong to
fig, ax = plt.subplots(figsize=(10,5),dpi=200)
ax.plot(cluster_counts, cost, 'bo-')
ax.set_xlabel('クラスタ数')
ax.set_ylabel('最小二乗誤差総和')
plt.show()

### クラスタ数6で分析

In [None]:
# Number of clusters to use for the analysis
n_clusters = 6

# Cluster the standardised data
kmean = KMeans(n_clusters=n_clusters, random_state=0)
kmean.fit(X)
labels = kmean.labels_

# Attach the cluster label to every customer row. All columns of df_cc are
# kept: the ID column is already removed when the data is loaded, so slicing
# off the first column here dropped the balance ('残高') instead.
clusters = pd.concat([df_cc, pd.DataFrame({'cluster': labels})], axis=1)
clusters.head()

### クラスタ分類結果

In [None]:
# Number of customers per cluster (row count of one column)
cluster_sizes = clusters.groupby('cluster')[['残高更新頻度']].count()
cluster_sizes.plot.bar(figsize=(12,6))
plt.show()

### クラスタ毎・属性毎分布

In [None]:
n_columns = n_clusters

# For every column: one histogram per cluster, plus each cluster's mean
# expressed as a fraction of the column maximum (collected in stat_all)
stat_all = list()
for column in clusters.columns:
    values = clusters[column]

    # Shared bin edges over all clusters (10 bins), computed directly instead
    # of drawing and clearing a throwaway figure for every column
    bins = np.histogram_bin_edges(values.to_numpy(), bins=10)
    # The upper edge of the last bin is the column maximum
    bins_max = bins.max()

    fig, axes = plt.subplots(1, n_columns, figsize=(12,2), dpi=200)
    fig.suptitle(column)
    fig.tight_layout()

    stat_cluster = list()
    for cluster in range(n_clusters):
        in_cluster = values[clusters['cluster'] == cluster]

        ax = axes[cluster]
        ax.hist(in_cluster, bins=bins)
        ax.grid(color='gray', alpha=0.5)

        # Cluster mean relative to the column maximum
        stat_cluster.append(in_cluster.mean() / bins_max)

    stat_all.append(stat_cluster)
    plt.show()

In [None]:
# Build the table here before showing it: df_mode is otherwise only created
# in the following cell, so a top-to-bottom run raised NameError at this point.
# Rows are the columns of `clusters`; columns are the clusters.
df_mode = pd.DataFrame(stat_all, index=clusters.columns)
df_mode.columns = ['cluster_' + str(x) for x in df_mode.columns]
df_mode

In [None]:
# Relative per-cluster means: one row per column of `clusters`, one column
# per cluster (named cluster_0, cluster_1, ...)
df_mode = pd.DataFrame(stat_all, index=clusters.columns)
df_mode = df_mode.rename(columns=lambda x: 'cluster_' + str(x))

# Subset of attributes shown on the radar charts
summary_rows = [
    '購入額',
    '限度額',
    '返済総額',
    '一括購入額',
    '残高更新頻度',
    '購入頻度',
    '一括購入頻度',
    '分割購入頻度',
    'キャッシング頻度',
]
df_mode_summary = df_mode.loc[summary_rows]

def plot_polar(labels, values, imgname, cluster):
    """Draw a radar chart, save it to a file and display it.

    Args:
        labels: Axis labels, one per spoke of the chart.
        values: One value per label; the radial axis is fixed to 0..1.
        imgname: File path the figure is saved to.
        cluster: Text used as the chart title.
    """
    n_spokes = len(labels)
    # One angle per label plus a final angle that returns to the start
    angles = np.linspace(0, 2 * np.pi, n_spokes + 1, endpoint=True)
    # Repeat the first value at the end so the outline is a closed polygon
    closed_values = np.append(values, values[0])

    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111, polar=True)
    ax.plot(angles, closed_values, 'o-')  # outline
    ax.fill(angles, closed_values, alpha=0.25)  # filled area
    ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels)  # spoke labels
    ax.set_rlim(0 ,1)
    ax.set_title(cluster, fontsize=14)

    fig.savefig(imgname)
    plt.show()
    plt.close(fig)

# One radar chart per cluster. Each chart gets its own file name; saving all
# of them to "radar.png" left only the last cluster's image on disk.
labels = df_mode_summary.index
for cluster in df_mode_summary.columns:
    values = df_mode_summary[cluster].values
    plot_polar(labels, values, f"radar_{cluster}.png", cluster)
    print('')

# ディープラーニング（分類）

## データ読み込み

### 学習データの確認

In [None]:
from keras.datasets import mnist

# Load MNIST as (images, labels) pairs for training and test
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Show the first 81 training images in a 9x9 grid without tick marks
GRID = 9
fig = plt.figure(figsize=(9, 9))
fig.subplots_adjust(left=0, right=1, bottom=0, top=0.5, hspace=0.05, wspace=0.05)
for position, image in enumerate(train_images[:GRID * GRID], start=1):
    ax = fig.add_subplot(GRID, GRID, position, xticks=[], yticks=[])
    ax.imshow(image.reshape((28, 28)), cmap='gray')

plt.show()

# Number of training samples and the image height / width
print(f'\n学習用サンプル\t：{train_images.shape[0]}\n高さ\t\t\t：{train_images.shape[1]}\n幅\t\t\t：{train_images.shape[2]}\n\n')

# Labels of the images shown above, in the same 9x9 layout
print(f'学習用正解数：{len(train_labels)}')
print('\n',train_labels[:81].reshape(9,9))

### テストデータの確認

In [None]:
# Show the first 81 test images in a 9x9 grid without tick marks
fig = plt.figure(figsize=(9, 9))
fig.subplots_adjust(left=0, right=1, bottom=0, top=0.5, hspace=0.05, wspace=0.05)
for i in range(9*9):
    ax = fig.add_subplot(9, 9, i + 1, xticks=[], yticks=[])
    ax.imshow(test_images[i].reshape((28, 28)), cmap='gray')

plt.show()

# These figures describe the test split, so the label says "test samples"
# (it previously read "training samples")
print(f'\nテスト用サンプル\t：{test_images.shape[0]}\n高さ\t\t\t：{test_images.shape[1]}\n幅\t\t\t：{test_images.shape[2]}\n\n')
print(f'テスト用正解数：{len(test_labels)}')
# Labels of the images shown above, in the same 9x9 layout
print('\n',test_labels[:81].reshape(9,9))

## モデル構築

In [None]:
from keras import models
from keras import layers

# Two-layer fully connected network: 784 inputs -> 512 ReLU units -> 10 softmax outputs
network = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(28 * 28,)),
    layers.Dense(10, activation='softmax'),
])

network.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Render the layer graph as an inline SVG
model_graph = model_to_dot(network, show_shapes=True, show_layer_names=False, dpi=72)
SVG(model_graph.create(prog='dot', format='svg'))

## 前処理

In [None]:
# Flatten every 28x28 image into a vector of 784 values and scale the pixel
# values to [0, 1]. The sample count is taken from the array itself rather
# than hard-coded as 60000 / 10000.
train_images = train_images.reshape((len(train_images), 28 * 28))
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape((len(test_images), 28 * 28))
test_images = test_images.astype('float32') / 255

# One-hot encode the digit labels to match the softmax output layer
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)
print(train_labels[:10])

## 訓練・検証

In [None]:
# Train for 20 epochs with a batch size of 512; the test split is passed as
# validation data and the returned object is kept in `history`
history = network.fit(train_images, train_labels, epochs=20, batch_size=512, validation_data=(test_images, test_labels))

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10,4), dpi=200)

# (training metric, validation metric, function that picks the best
# validation value): lowest loss, highest accuracy
panels = [
    ('loss', 'val_loss', min),
    ('accuracy', 'val_accuracy', max),
]

for ax, (train_key, val_key, pick_best) in zip(axes, panels):
    train = history.history[train_key]
    test = history.history[val_key]

    ax.plot(range(1, len(train)+1), train, label=train_key, marker='o', ms=5)
    ax.plot(range(1, len(test)+1), test, label=val_key, marker='o', ms=5)

    # Mark the best validation score; the line spans the recorded epochs
    # instead of a hard-coded 20
    best_score = pick_best(test)
    ax.hlines(best_score, 0, len(test), color='red', alpha=0.5, linestyle='--', label='best val score')

    ax.set_xlabel('学習回数（epochs）')
    ax.set_ylabel(train_key)
    ax.set_title(f'{train_key} and {val_key}')
    ax.legend()

plt.show()
print('')