In [1]:
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

In [2]:
%load_ext memory_profiler

In [3]:
# train_x: training features, train_y: target variable, test_x: test features.
# They are held as a pandas DataFrame / Series (numpy arrays are sometimes used instead).

train = pd.read_csv('../../input/sample-data/train_preprocessed_onehot.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])
test_x = pd.read_csv('../../input/sample-data/test_preprocessed_onehot.csv')

# Keep untouched copies of the train/test features so later cells can
# restart from the original state.
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
def load_standarized_data():
    """Return standardized versions of the saved train/test features.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to both sets. Both results are rebuilt as DataFrames from the
    scaled arrays, so the original column labels are not carried over.

    Returns:
        tuple: ``(train_x, test_x)`` as pandas DataFrames.
    """
    scaler = StandardScaler()
    # fit + transform on the training copy, then reuse the fitted statistics for test
    scaled_train = scaler.fit_transform(train_x_saved.copy())
    scaled_test = scaler.transform(test_x_saved.copy())
    return pd.DataFrame(scaled_train), pd.DataFrame(scaled_test)


def load_minmax_scaled_data():
    """Return Min-Max scaled versions of the saved train/test features.

    The scaler is fitted on the train and test rows stacked together and then
    applied to each set separately. Both results are rebuilt as DataFrames from
    the scaled arrays, so the original column labels are not carried over.

    Returns:
        tuple: ``(train_x, test_x)`` as pandas DataFrames.
    """
    raw_train, raw_test = train_x_saved.copy(), test_x_saved.copy()

    # Fit on train + test together so the per-column min/max cover both sets.
    # NOTE(review): presumably intentional so that test values also stay inside
    # the scaled range (the NMF / LDA cells need non-negative input) - confirm.
    stacked = pd.concat([raw_train, raw_test], axis=0)
    scaler = MinMaxScaler().fit(stacked)

    return pd.DataFrame(scaler.transform(raw_train)), pd.DataFrame(scaler.transform(raw_test))

In [6]:
# Reload the standardized features and preview the first rows of the training set.
train_x, test_x = load_standarized_data()
train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,0.367868,0.81259,0.058742,0.38811,-0.496404,-0.335182,-0.500312,-0.202792,-0.143584,-0.20492,...,-0.330921,0.501562,-0.660895,-0.230621,-1.009848,-1.29642,-1.449062,-1.526426,-0.649981,0.499687
1,1.19832,-1.230633,-0.165636,-0.402399,2.01449,-0.335182,-0.500312,-0.202792,-0.143584,-0.20492,...,3.021872,0.501562,-0.660895,-0.230621,-1.009848,-0.433942,-0.766514,-1.093119,1.538507,0.499687
2,1.613547,0.81259,0.166873,-0.617178,-0.496404,-0.335182,1.998751,-0.202792,-0.143584,-0.20492,...,-0.330921,0.501562,-0.660895,-0.230621,0.990248,-1.29642,-0.311481,0.206803,-0.649981,0.499687
3,-1.154629,0.81259,1.190962,0.960153,-0.496404,-0.335182,-0.500312,4.931172,-0.143584,-0.20492,...,-0.330921,0.501562,-0.660895,-0.230621,-1.009848,0.141044,-1.107788,-0.804247,-0.649981,-2.001251
4,0.921503,-1.230633,-0.821325,0.409007,-0.496404,2.983453,-0.500312,-0.202792,-0.143584,-0.20492,...,3.021872,0.501562,1.513101,-0.230621,0.990248,0.716029,0.143551,1.217854,-0.649981,0.499687


In [7]:
# Reload the Min-Max scaled features and preview the first rows of the training set.
train_x, test_x = load_minmax_scaled_data()
train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,0.608108,1.0,0.517477,0.430295,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.090909,0.066667,0.043478,0.0,1.0
1,0.851351,0.0,0.487401,0.33159,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.363636,0.266667,0.173913,1.0,1.0
2,0.972973,1.0,0.531971,0.304772,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.090909,0.4,0.565217,0.0,1.0
3,0.162162,1.0,0.669243,0.501723,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.545455,0.166667,0.26087,0.0,0.0
4,0.77027,0.0,0.399511,0.432905,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.727273,0.533333,0.869565,0.0,1.0


## 学習・評価
- 次元圧縮前後での性能の変化を見るために、簡易的な学習・評価用の関数を作成する

In [8]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [9]:
def train_and_eval(train_x, train_y):
    """Return the mean validation accuracy of a small XGBoost classifier.

    The rows are split with a shuffled 4-fold ``StratifiedKFold`` (seed 0);
    for every fold a fresh 20-tree ``XGBClassifier`` is fitted on the training
    part and scored with ``accuracy_score`` on the held-out part.

    Args:
        train_x: feature table; rows are selected positionally via ``.iloc``.
        train_y: labels aligned with ``train_x``; also selected via ``.iloc``.

    Returns:
        The mean of the per-fold accuracies.
    """
    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

    def fold_accuracy(fit_rows, holdout_rows):
        # Train on one split and score on its held-out rows.
        clf = XGBClassifier(n_estimators=20, random_state=0)
        clf.fit(train_x.iloc[fit_rows], train_y.iloc[fit_rows])
        guesses = clf.predict(train_x.iloc[holdout_rows])
        return accuracy_score(train_y.iloc[holdout_rows], guesses)

    return np.mean([
        fold_accuracy(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(train_x, train_y)
    ])

## PCA
- 主成分分析
- **各特徴量が正規分布に従っていると仮定して使用**

In [10]:
from sklearn.decomposition import PCA

In [11]:
# Baseline before compression: reload the standardized features and report
# the column count and cross-validated accuracy.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [12]:
%%time
%%memit
# Fit a 5-component PCA on the training features.
# random_state is pinned like in the NMF / LDA / k-means cells: it is used
# whenever scikit-learn selects a randomized SVD solver for PCA, so fixing it
# keeps the projection (and the accuracy below) reproducible across runs.
pca = PCA(n_components=5, random_state=0)
pca.fit(train_x)

# Apply the transformation to both sets with the components learned on train.
train_x = pd.DataFrame(pca.transform(train_x))
test_x = pd.DataFrame(pca.transform(test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  5
acc:  0.807500024592004
peak memory: 366.08 MiB, increment: 4.48 MiB
CPU times: user 306 ms, sys: 47.5 ms, total: 354 ms
Wall time: 595 ms


⇒ 列数が大幅に減少したが、精度はそこまで落ちていない

## SVD（singular value decomposition）
- 特異値分解
- PCAとほぼ同じ。疎行列を扱える（＝メモリ効率の良い）SVDのほうがよく使われる

In [13]:
from sklearn.decomposition import TruncatedSVD

In [14]:
# Baseline before compression: reload the standardized features and report
# the column count and cross-validated accuracy.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [15]:
%%time
%%memit
# Fit a 5-component truncated SVD on the training features.
# TruncatedSVD defaults to a randomized solver, so the seed is pinned (as in
# the NMF / LDA / k-means cells) to make the result reproducible across runs.
svd = TruncatedSVD(n_components=5, random_state=0)
svd.fit(train_x)

# Apply the transformation to both sets with the components learned on train.
train_x = pd.DataFrame(svd.transform(train_x))
test_x = pd.DataFrame(svd.transform(test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  5
acc:  0.8075001045920167
peak memory: 362.52 MiB, increment: 0.01 MiB
CPU times: user 262 ms, sys: 36.4 ms, total: 299 ms
Wall time: 403 ms


⇒ 確かにほぼPCAと同じ。今回は入力が標準化済みの密な行列で、疎行列を扱えるというSVDの利点が活きないためか、時間もメモリ効率も変わらなかった。

## NMF（non-negative matrix factorization）
- 非負値行列因子分解
- 全ての特徴量は非負である必要がある。

In [16]:
from sklearn.decomposition import NMF

In [17]:
# Baseline before compression: reload the Min-Max scaled (non-negative)
# features and report the column count and cross-validated accuracy.
train_x, test_x = load_minmax_scaled_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [18]:
%%time
%%memit
# Fit a 5-component NMF (random initialisation, fixed seed) on the training features.
nmf = NMF(n_components=5, init='random', random_state=0).fit(train_x)

# Apply the transformation: train first, then test, both with the fitted model.
train_x, test_x = (pd.DataFrame(nmf.transform(part)) for part in (train_x, test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', len(train_x.columns))
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  5
acc:  0.8077000246240039
peak memory: 375.61 MiB, increment: 0.16 MiB
CPU times: user 372 ms, sys: 24.2 ms, total: 396 ms
Wall time: 500 ms


In [19]:
# Inspect the shape of the fitted NMF component matrix (5 components x 59 input columns).
nmf.components_.shape

(5, 59)

⇒ 元の行列が、非負の2つの行列 W（サンプル数×5、`transform` の出力）と H（5×59、`components_`）の積に近似分解された。

## LDA（Latent Dirichlet Allocation）
- 潜在的ディリクレ配分法
- 文章を分類するトピックモデルに使用される

In [20]:
from sklearn.decomposition import LatentDirichletAllocation

In [21]:
# The input is not a count matrix, but the computation still works as long as
# the values are non-negative, so the Min-Max scaled features are used.
train_x, test_x = load_minmax_scaled_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [22]:
%%time
%%memit
# Fit a 5-topic Latent Dirichlet Allocation model (fixed seed) on the training features.
lda = LatentDirichletAllocation(n_components=5, random_state=0).fit(train_x)

# Apply the transformation: train first, then test, both with the fitted model.
train_x, test_x = (pd.DataFrame(lda.transform(part)) for part in (train_x, test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', len(train_x.columns))
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  5
acc:  0.807600024624004
peak memory: 384.98 MiB, increment: 0.44 MiB
CPU times: user 33.9 s, sys: 43.3 ms, total: 33.9 s
Wall time: 34.1 s


⇒ 処理時間結構遅め。

## LDA（Linear Discriminant Analysis）
- 線形判別分析
- 分類タスクについて教師ありで次元削減を行う。

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [24]:
# Use the standardized data; report the column count and cross-validated
# accuracy before compression.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [25]:
%%time
%%memit
# Supervised reduction to a single discriminant axis, fitted with the labels.
# NOTE(review): the model is fitted on every training row together with its
# label, and train_and_eval then cross-validates on those same rows, so the
# validation folds have already influenced the feature. The accuracy printed
# below is therefore optimistic; an unbiased estimate would refit the
# discriminant inside each fold.
lda = LinearDiscriminantAnalysis(n_components=1).fit(train_x, train_y)

# Apply the transformation: train first, then test, both with the fitted model.
train_x, test_x = (pd.DataFrame(lda.transform(part)) for part in (train_x, test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', len(train_x.columns))
print('acc: ', train_and_eval(train_x, train_y))



##### 圧縮後

num col:  1
acc:  0.8563007087361135
peak memory: 396.85 MiB, increment: 0.82 MiB
CPU times: user 263 ms, sys: 52 ms, total: 315 ms
Wall time: 494 ms


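⇒ 教師ありという点で上記の方法と異なる。適切な使い分けが必要。なお、ここでは検証用の行も含む全学習データの目的変数を使ってLDAを学習してから交差検証しているため、上の精度は楽観的に見積もられている点に注意（厳密にはfoldごとにLDAを学習し直す必要がある）。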

## t-SNE
- 2008年に提案された比較的新しい手法
- ２次元平面に圧縮し、可視化の目的で使われることが多い

In [26]:
from sklearn.manifold import TSNE

In [27]:
# Use the standardized data; report the column count and cross-validated
# accuracy before compression.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [28]:
%%time
%%memit
# t-SNE is a stochastic embedding: pin the seed (as in the NMF / LDA / k-means
# cells) so the 2-D layout and the accuracy below are reproducible across runs.
tsne = TSNE(n_components=2, random_state=0)

# Apply the transformation.
# Only the training rows are embedded here (fit_transform); test_x is left
# untouched by this cell, since scikit-learn's TSNE offers no transform() for
# unseen rows.
train_x = pd.DataFrame(tsne.fit_transform(train_x))

display(Markdown('##### 圧縮後'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  2
acc:  0.8073999846079976
peak memory: 436.38 MiB, increment: 36.33 MiB
CPU times: user 1min 18s, sys: 80 ms, total: 1min 18s
Wall time: 1min 18s


⇒ かなり遅い。しかし２次元への圧縮で上記の方法と同程度の精度。

## UMAP
- 2018年に提案された手法。t-SNEよりもこちらのほうが処理時間が速い。2, 3次元を超える圧縮も可能。

In [29]:
import umap

In [30]:
# Use the standardized data; report the column count and cross-validated
# accuracy before compression.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [31]:
%%time
%%memit
# UMAP is a stochastic embedding: pin the seed (as in the NMF / LDA / k-means
# cells) so the embedding and the accuracy below are reproducible across runs.
# NOTE(review): per the UMAP documentation, fixing the seed may limit
# parallelism, so the timing can differ slightly from an unseeded run.
um = umap.UMAP(random_state=0)
um.fit(train_x)

# Apply the transformation to both sets with the embedding learned on train.
train_x = pd.DataFrame(um.transform(train_x))
test_x = pd.DataFrame(um.transform(test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  2
acc:  0.8185002656160426
peak memory: 494.06 MiB, increment: 34.86 MiB
CPU times: user 37.3 s, sys: 96.1 ms, total: 37.4 s
Wall time: 35.6 s


⇒ t-SNE の半分ほどの処理時間。しかも精度も高そうである。

## クラスタリング
- 次元削減とは違う手法だが、クラスタ中心からの距離を特徴量にしていい結果が出ることもあるようである。

In [32]:
from sklearn.cluster import MiniBatchKMeans

In [33]:
# Use the standardized data; report the column count and cross-validated
# accuracy before clustering.
train_x, test_x = load_standarized_data()

display(Markdown('##### 圧縮前'))
print('num col: ', train_x.shape[1])
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮前

num col:  59
acc:  0.842999987295998


In [34]:
%%time
%%memit
# Fit mini-batch k-means with 10 clusters (fixed seed) on the training features.
kmeans = MiniBatchKMeans(n_clusters=10, random_state=0).fit(train_x)

# Apply the transformation: transform() yields one column per cluster (the
# distance to that cluster's centre), train first, then test.
train_x, test_x = (pd.DataFrame(kmeans.transform(part)) for part in (train_x, test_x))

display(Markdown('##### 圧縮後'))
print('num col: ', len(train_x.columns))
print('acc: ', train_and_eval(train_x, train_y))

##### 圧縮後

num col:  10
acc:  0.8146999852159975
peak memory: 498.22 MiB, increment: 0.17 MiB
CPU times: user 468 ms, sys: 36 ms, total: 504 ms
Wall time: 603 ms


⇒ メソッドの返り値の意味に注意
* kmeans.predict
  * 各レコードの属するクラスタを出力
* kmeans.transform
  * 各レコードの各クラスタ中心からの距離を出力