In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Fix the random seed so that every stochastic step below is repeatable.
# `random_state` is reused later wherever an explicit seed is accepted.
random_state = 2021
np.random.seed(seed=random_state)

In [None]:
# Read the competition's training table from the notebook's input directory.
train_csv_path = '../input/shopee-product-matching/train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Report how many distinct label groups the training table contains.
# (The printed label is Japanese for "total number of groups".)
n_label_groups = len(train_df["label_group"].unique())
print("総グループ数: {}".format(n_label_groups))

In [None]:
# Sample extraction: pick `n` distinct label groups at random (uniformly,
# without replacement) and keep only the rows that belong to them.
n = 10
all_label_groups = train_df["label_group"].unique()
sample_groups = np.random.choice(all_label_groups, size=n, replace=False)

# Boolean row mask: True where the row's label group was sampled.
in_sampled_group = train_df["label_group"].isin(sample_groups)
sample_train_df = train_df[in_sampled_group]

In [None]:
# train_df restricted to the rows of the sampled label groups
sample_train_df

In [None]:
# TF-IDF vectorisation of the sampled product titles
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per sampled row: the row's title.
corpus = sample_train_df["title"].values

# Vectoriser with default settings, fitted and applied to the corpus in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Show the dimensions of the resulting document-term matrix.
print(X.shape)

In [None]:
# X is stored in a sparse representation
X

In [None]:
# t-SNE dimensionality reduction of the TF-IDF vectors down to 2-D
from sklearn.manifold import TSNE

# t-SNE needs perplexity < n_samples, and recent scikit-learn releases raise a
# ValueError otherwise. Only a handful of label groups are sampled, so the
# number of rows can be at or below the usual value of 30: cap it.
perplexity = min(30, X.shape[0] - 1)

# t-SNE minimises the KL divergence by gradient descent; n_components is the
# number of dimensions to reduce to.
# - init="random" is passed explicitly because PCA initialisation (the default
#   in newer scikit-learn) is not supported for sparse input such as X.
# - The iteration count is left at its default (1000, the value originally
#   passed) because the keyword was renamed from `n_iter` to `max_iter` in
#   newer releases; omitting it works under both spellings.
tsne = TSNE(n_components=2, random_state=random_state, perplexity=perplexity, init="random")
X_embedded = tsne.fit_transform(X)

In [None]:
# Shape of the dimensionality-reduced vectors
print(X_embedded.shape)
# X_embedded

In [None]:
# Visualise the embedded vectors: one scatter series (and colour) per label group.
# Row i of X_embedded corresponds to row i of sample_train_df, so the index is
# reset before the column-wise concat to align both frames on 0..n-1.
ddf = pd.concat([sample_train_df.reset_index(), pd.DataFrame(X_embedded, columns=['dim1', 'dim2'])], axis=1)
plt.figure(figsize = (30, 30))

for v in sample_groups:
    tmp_df = ddf[ddf.label_group == v]
    # Random RGB triple with each channel in [0, 1).
    color = [np.random.randint(255)/255 for _ in range(3)]
    # Use `color=` rather than `c=`: a bare 3-element sequence given to `c` is
    # treated as per-point values to colour-map whenever the group happens to
    # contain exactly 3 points, and triggers a matplotlib warning otherwise.
    plt.scatter(tmp_df.dim1, tmp_df.dim2, s=200, color=color, label=str(v))

# Label the axes and add a legend so each label group can be identified.
plt.xlabel('dim1')
plt.ylabel('dim2')
plt.legend(title='label_group')
plt.show()