In [None]:
# Install the third-party packages used by the import cell below
# (bayes_opt, kerastuner, catboost, vecstack). No versions are pinned.
!pip install bayesian-optimization
!pip install keras-tuner
!pip install catboost
!pip install vecstack
# gensim is upgraded rather than just installed; presumably because the Word2Vec
# cells pass `vector_size`, the gensim 4.x parameter name — TODO confirm and pin.
!pip install gensim --upgrade

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

# Preprocessing&Feature Engineering
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, RobustScaler, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile
from gensim.models import word2vec
from sklearn.decomposition import PCA

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
import kerastuner as kt

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, ARDRegression, BayesianRidge, Lars, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
import tensorflow as tf
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import mean_squared_error # squared=False시 RMSE
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import pickle

In [None]:
# Load the raw train/test tables and the label table from /content/drive/MyDrive.
train, test = (
    pd.read_csv(csv_path, encoding='UTF-8')
    for csv_path in (
        '/content/drive/MyDrive/D&A_ML_Competition/L.POINT_train.csv',
        '/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv',
    )
)
y_target = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/y_train.csv')

In [None]:
# Remove thousands separators (',') from the three numeric columns and cast to int,
# for train first and then test. int() raises if a value is not an integer literal
# once the commas are removed.
for frame in (train, test):
    for column in ('PD_BUY_AM', 'PD_BUY_CT', 'TOT_SESS_HR_V'):
        frame[column] = frame[column].map(lambda raw: int(str(raw).replace(',', '')))

In [None]:
# Parse SESS_DT (YYYYMMDD) into a datetime column named 'date' on both tables.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['SESS_DT'], format='%Y%m%d')

In [None]:
# AMOUNT = PD_BUY_AM * PD_BUY_CT, computed row-wise on both tables.
for frame in (train, test):
    frame['AMOUNT'] = frame['PD_BUY_AM'] * frame['PD_BUY_CT']

In [None]:
# Accumulators for the per-column PCA feature frames (train / test); the PCA cells
# below append one frame per categorical column.
features_pca, features_te_pca = [], []

In [None]:
train.groupby('PD_C').PD_C.count().sort_values(ascending=False).plot.box()
plt.show()

In [None]:
len(train.groupby('PD_C').PD_C.count().sort_values(ascending=False)[train.groupby('PD_C').PD_C.count().sort_values(ascending=False)>=200])

In [None]:
train.groupby('PD_BRA_NM').PD_BRA_NM.count().sort_values(ascending=False).plot.box()
plt.show()

In [None]:
len(train.groupby('PD_BRA_NM').PD_BRA_NM.count().sort_values(ascending=False)[train.groupby('PD_BRA_NM').PD_BRA_NM.count().sort_values(ascending=False)>=1000])

In [None]:
train.groupby('KWD_NM').KWD_NM.count().sort_values(ascending=False).plot.box()
plt.show()

In [None]:
len(train.groupby('KWD_NM').KWD_NM.count().sort_values(ascending=False)[train.groupby('KWD_NM').KWD_NM.count().sort_values(ascending=False)>=500])

In [None]:
# Per-client PD_C profile: one-hot encode the most frequent PD_C values, average the
# dummies per CLNT_ID, then fit a full PCA to inspect the cumulative explained variance.
df = train
col_name = 'PD_C'

max_seq = 373  # keep at most this many top values (values with >= 200 purchases)
max_d = 373    # upper bound on the number of PCA components
col_count = df.groupby(col_name)[col_name].count()

if len(col_count) > max_seq:
    # Keep only rows whose value is among the max_seq most frequent ones in train.
    tops = col_count.sort_values(ascending=False)[:max_seq].index
    f = df.loc[df[col_name].isin(tops)][['CLNT_ID', col_name]]
    f_te = test.loc[test[col_name].isin(tops)][['CLNT_ID', col_name]]
else:
    tops = col_count.index
    f = df[['CLNT_ID', col_name]]
    f_te = test[['CLNT_ID', col_name]]

# One row per client, in order of first appearance; clients without any retained
# value are filled with zeros.
f = pd.get_dummies(f, columns=[col_name])
f = f.groupby('CLNT_ID').mean()
f = pd.merge(pd.DataFrame({'CLNT_ID':df.CLNT_ID.unique()}), f, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
f_te = pd.get_dummies(f_te, columns=[col_name])
f_te = f_te.groupby('CLNT_ID').mean()
f_te = pd.merge(pd.DataFrame({'CLNT_ID':test.CLNT_ID.unique()}), f_te, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
# Fix: give the test matrix exactly the train dummy columns, in the same order.
# Values that never occur in test become all-zero columns and values unseen in train
# are dropped, so pca.transform(f_te) receives the layout the PCA was fitted on.
f_te = f_te.reindex(columns=f.columns, fill_value=0)

if len(tops) < max_d:
    max_d = len(tops)
pca = PCA(max_d)
pca.fit(f)
cumsum = np.cumsum(pca.explained_variance_ratio_)

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=np.arange(1,max_d+1), y=cumsum, marker = 'o' )
plt.grid()
plt.xlabel('Number of PCA')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Component count: first position where the cumulative explained variance reaches 99%.
# np.argmax yields 0 both when the first component already qualifies and when no
# component does, so a result of 1 is replaced by max_d.
num_d = np.argmax(cumsum >= .99) + 1
num_d = max_d if num_d == 1 else num_d
pca = PCA(num_d)

# Refit on train with the reduced dimensionality and label rows by client id.
f = pd.DataFrame(pca.fit_transform(f))
f.columns = [f'{col_name}_{col}' for col in f.columns]
f.index = train.CLNT_ID.unique()

# Project test with the PCA fitted on train.
f_te = pd.DataFrame(pca.transform(f_te))
f_te.columns = [f'{col_name}_{col}' for col in f_te.columns]
f_te.index = test.CLNT_ID.unique()

features_pca.append(f)
features_te_pca.append(f_te)

In [None]:
# Per-client PD_BRA_NM profile: one-hot encode the most frequent values, average the
# dummies per CLNT_ID, then fit a full PCA to inspect the cumulative explained variance.
df = train
df_te = test
col_name = 'PD_BRA_NM'

max_seq = 438  # keep at most this many top values (values with >= 1000 purchases)
max_d = 438    # upper bound on the number of PCA components
col_count = df.groupby(col_name)[col_name].count()

if len(col_count) > max_seq:
    # Keep only rows whose value is among the max_seq most frequent ones in train.
    tops = col_count.sort_values(ascending=False)[:max_seq].index
    f = df.loc[df[col_name].isin(tops)][['CLNT_ID', col_name]]
    f_te = test.loc[test[col_name].isin(tops)][['CLNT_ID', col_name]]
else:
    tops = col_count.index
    f = df[['CLNT_ID', col_name]]
    f_te = test[['CLNT_ID', col_name]]

# One row per client, in order of first appearance; clients without any retained
# value are filled with zeros.
f = pd.get_dummies(f, columns=[col_name])
f = f.groupby('CLNT_ID').mean()
f = pd.merge(pd.DataFrame({'CLNT_ID':df.CLNT_ID.unique()}), f, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
f_te = pd.get_dummies(f_te, columns=[col_name])
f_te = f_te.groupby('CLNT_ID').mean()
f_te = pd.merge(pd.DataFrame({'CLNT_ID':test.CLNT_ID.unique()}), f_te, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
# Fix: give the test matrix exactly the train dummy columns, in the same order.
# Values that never occur in test become all-zero columns and values unseen in train
# are dropped, so pca.transform(f_te) receives the layout the PCA was fitted on.
f_te = f_te.reindex(columns=f.columns, fill_value=0)

if len(tops) < max_d:
    max_d = len(tops)
pca = PCA(max_d)
pca.fit(f)
cumsum = np.cumsum(pca.explained_variance_ratio_)

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=np.arange(1,max_d+1), y=cumsum, marker = 'o' )
plt.grid()
plt.xlabel('Number of PCA')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Component count: first position where the cumulative explained variance reaches 99%.
# np.argmax yields 0 both when the first component already qualifies and when no
# component does, so a result of 1 is replaced by max_d.
num_d = np.argmax(cumsum >= .99) + 1
num_d = max_d if num_d == 1 else num_d
pca = PCA(num_d)

# Refit on train with the reduced dimensionality and label rows by client id.
f = pd.DataFrame(pca.fit_transform(f))
f.columns = [f'{col_name}_{col}' for col in f.columns]
f.index = train.CLNT_ID.unique()

# Project test with the PCA fitted on train.
f_te = pd.DataFrame(pca.transform(f_te))
f_te.columns = [f'{col_name}_{col}' for col in f_te.columns]
f_te.index = test.CLNT_ID.unique()

features_pca.append(f)
features_te_pca.append(f_te)

In [None]:
# Per-client KWD_NM profile: one-hot encode the most frequent values, average the
# dummies per CLNT_ID, then fit a full PCA to inspect the cumulative explained variance.
df = train
col_name = 'KWD_NM'

max_seq = 403  # keep at most this many top values (values with >= 500 purchases)
max_d = 403    # upper bound on the number of PCA components
col_count = df.groupby(col_name)[col_name].count()

if len(col_count) > max_seq:
    # Keep only rows whose value is among the max_seq most frequent ones in train.
    tops = col_count.sort_values(ascending=False)[:max_seq].index
    f = df.loc[df[col_name].isin(tops)][['CLNT_ID', col_name]]
    f_te = test.loc[test[col_name].isin(tops)][['CLNT_ID', col_name]]
else:
    tops = col_count.index
    f = df[['CLNT_ID', col_name]]
    f_te = test[['CLNT_ID', col_name]]

# One row per client, in order of first appearance; clients without any retained
# value are filled with zeros.
f = pd.get_dummies(f, columns=[col_name])
f = f.groupby('CLNT_ID').mean()
f = pd.merge(pd.DataFrame({'CLNT_ID':df.CLNT_ID.unique()}), f, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
f_te = pd.get_dummies(f_te, columns=[col_name])
f_te = f_te.groupby('CLNT_ID').mean()
f_te = pd.merge(pd.DataFrame({'CLNT_ID':test.CLNT_ID.unique()}), f_te, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
# Fix: give the test matrix exactly the train dummy columns, in the same order.
# Values that never occur in test become all-zero columns and values unseen in train
# are dropped, so pca.transform(f_te) receives the layout the PCA was fitted on.
f_te = f_te.reindex(columns=f.columns, fill_value=0)

if len(tops) < max_d:
    max_d = len(tops)
pca = PCA(max_d)
pca.fit(f)
cumsum = np.cumsum(pca.explained_variance_ratio_)

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=np.arange(1,max_d+1), y=cumsum, marker = 'o' )
plt.grid()
plt.xlabel('Number of PCA')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Component count: first position where the cumulative explained variance reaches 99%.
# np.argmax yields 0 both when the first component already qualifies and when no
# component does, so a result of 1 is replaced by max_d.
num_d = np.argmax(cumsum >= .99) + 1
num_d = max_d if num_d == 1 else num_d
pca = PCA(num_d)

# Refit on train with the reduced dimensionality and label rows by client id.
f = pd.DataFrame(pca.fit_transform(f))
f.columns = [f'{col_name}_{col}' for col in f.columns]
f.index = train.CLNT_ID.unique()

# Project test with the PCA fitted on train.
f_te = pd.DataFrame(pca.transform(f_te))
f_te.columns = [f'{col_name}_{col}' for col in f_te.columns]
f_te.index = test.CLNT_ID.unique()

features_pca.append(f)
features_te_pca.append(f_te)

In [None]:
# Per-client CLAC3_NM profile: one-hot encode the most frequent values, average the
# dummies per CLNT_ID, then fit a full PCA to inspect the cumulative explained variance.
df = train
col_name = 'CLAC3_NM'

max_seq = 683  # keep at most this many top values (values with >= 100 purchases)
max_d = 683    # upper bound on the number of PCA components
col_count = df.groupby(col_name)[col_name].count()

if len(col_count) > max_seq:
    # Keep only rows whose value is among the max_seq most frequent ones in train.
    tops = col_count.sort_values(ascending=False)[:max_seq].index
    f = df.loc[df[col_name].isin(tops)][['CLNT_ID', col_name]]
    f_te = test.loc[test[col_name].isin(tops)][['CLNT_ID', col_name]]
else:
    tops = col_count.index
    f = df[['CLNT_ID', col_name]]
    f_te = test[['CLNT_ID', col_name]]

# One row per client, in order of first appearance; clients without any retained
# value are filled with zeros.
f = pd.get_dummies(f, columns=[col_name])
f = f.groupby('CLNT_ID').mean()
f = pd.merge(pd.DataFrame({'CLNT_ID':df.CLNT_ID.unique()}), f, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
f_te = pd.get_dummies(f_te, columns=[col_name])
f_te = f_te.groupby('CLNT_ID').mean()
f_te = pd.merge(pd.DataFrame({'CLNT_ID':test.CLNT_ID.unique()}), f_te, on='CLNT_ID', how='left').fillna(0).set_index('CLNT_ID')
# Fix: give the test matrix exactly the train dummy columns, in the same order.
# Values that never occur in test become all-zero columns and values unseen in train
# are dropped, so pca.transform(f_te) receives the layout the PCA was fitted on.
f_te = f_te.reindex(columns=f.columns, fill_value=0)

if len(tops) < max_d:
    max_d = len(tops)
pca = PCA(max_d)
pca.fit(f)
cumsum = np.cumsum(pca.explained_variance_ratio_)

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.lineplot(x=np.arange(1,max_d+1), y=cumsum, marker = 'o' )
plt.grid()
plt.xlabel('Number of PCA')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Component count: first position where the cumulative explained variance reaches 99%.
# np.argmax yields 0 both when the first component already qualifies and when no
# component does, so a result of 1 is replaced by max_d.
num_d = np.argmax(cumsum >= .99) + 1
num_d = max_d if num_d == 1 else num_d
pca = PCA(num_d)

# Refit on train with the reduced dimensionality and label rows by client id.
f = pd.DataFrame(pca.fit_transform(f))
f.columns = [f'{col_name}_{col}' for col in f.columns]
f.index = train.CLNT_ID.unique()

# Project test with the PCA fitted on train.
f_te = pd.DataFrame(pca.transform(f_te))
f_te.columns = [f'{col_name}_{col}' for col in f_te.columns]
f_te.index = test.CLNT_ID.unique()

features_pca.append(f)
features_te_pca.append(f_te)

In [None]:
# Join the per-column PCA frames side by side, then move the client-id index into a
# leading 'CLNT_ID' column.
train_pcas = pd.concat(features_pca, axis=1)
train_pcas = train_pcas.reset_index().rename(columns={'index':'CLNT_ID'})
test_pcas = pd.concat(features_te_pca, axis=1)
test_pcas = test_pcas.reset_index().rename(columns={'index':'CLNT_ID'})

In [None]:
# 4-fold stratified CV of a default LogisticRegression on every column of train_pcas
# except the first one.
# 'neg_log_loss' is scikit-learn's built-in negated log-loss scorer. It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated and rejected by recent scikit-learn releases.
# NOTE(review): assumes y_target rows are in the same client order as train_pcas — confirm.
myscore = 'neg_log_loss'
model = LogisticRegression()
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
score = cross_val_score(model, train_pcas.iloc[:,1:], y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
score
# array([-1.28671977, -1.28739031, -1.28400246, -1.28178781])

In [None]:
len([x for x in train_pcas.columns if x.startswith('PD_C_')])

In [None]:
len([x for x in train_pcas.columns if x.startswith('PD_BRA_')])

In [None]:
len([x for x in train_pcas.columns if x.startswith('KWD_')])

In [None]:
len([x for x in train_pcas.columns if x.startswith('CLAC3')])

In [None]:
# 4-fold stratified CV of a default LogisticRegression on columns 1..356 of train_pcas
# (presumably the PD_C_* components — TODO confirm against the prefix counts above).
# 'neg_log_loss' is scikit-learn's built-in negated log-loss scorer. It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated and rejected by recent scikit-learn releases.
myscore = 'neg_log_loss'
model = LogisticRegression()
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
score = cross_val_score(model, train_pcas.iloc[:,1:357], y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
score
# array([-1.36475292, -1.36774562, -1.36553693, -1.36572444])

In [None]:
# 4-fold stratified CV of a default LogisticRegression on columns 357..775 of train_pcas
# (presumably the PD_BRA_NM_* components — TODO confirm against the prefix counts above).
# 'neg_log_loss' is scikit-learn's built-in negated log-loss scorer. It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated and rejected by recent scikit-learn releases.
myscore = 'neg_log_loss'
model = LogisticRegression()
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
score = cross_val_score(model, train_pcas.iloc[:,357:776], y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
score
# array([-1.30864686, -1.31179117, -1.30761357, -1.30856556])

In [None]:
# 4-fold stratified CV of a default LogisticRegression on the column block that ends
# where the last 553 columns begin (presumably the KWD_NM_* components — TODO confirm).
# Fix: the slice was `-941:553`, mixing a negative start with a positive stop. The stop
# is now `-553`, so the block ends exactly where the `-553:` slice of the next cell starts.
# 'neg_log_loss' is scikit-learn's built-in negated log-loss scorer. It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated and rejected by recent scikit-learn releases.
myscore = 'neg_log_loss'
model = LogisticRegression()
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
score = cross_val_score(model, train_pcas.iloc[:,-941:-553], y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
score
# array([-1.34043121, -1.34368841, -1.3420203 , -1.33860857])

In [None]:
# 4-fold stratified CV of a default LogisticRegression on the last 553 columns of
# train_pcas (presumably the CLAC3_NM_* components — TODO confirm).
# 'neg_log_loss' is scikit-learn's built-in negated log-loss scorer. It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated and rejected by recent scikit-learn releases.
myscore = 'neg_log_loss'
model = LogisticRegression()
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
score = cross_val_score(model, train_pcas.iloc[:,-553:], y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
score

In [None]:
# Write the PCA feature tables to CSV. to_csv is called with its defaults, so the
# positional row index is written as well.
for frame, csv_path in (
    (train_pcas, '/content/drive/MyDrive/D&A_ML_Competition/data_pca.csv'),
    (test_pcas, '/content/drive/MyDrive/D&A_ML_Competition/data_te_pca.csv'),
):
    frame.to_csv(csv_path)

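#### Word2Vec

Each entry below reads `(feature-matrix shape) mean CV score [per-fold scores]`. The scores are presumably the negated log loss from the same 4-fold cross-validation used for the PCA features above, so values closer to 0 are better.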

- (150000, 10) -1.391584264975139 [-1.39163782 -1.39150907 -1.39152844 -1.39166173]
- (150000, 20) -1.3917360969780401 [-1.3917819  -1.39189615 -1.3914813  -1.39178504]
- (150000, 30) -1.3920146535104996 [-1.39185172 -1.39207638 -1.39193946 -1.39219106]
- (150000, 40) -1.3923098039794046 [-1.39201331 -1.39273929 -1.39220208 -1.39228454]
- (150000, 10) -1.391645125830269 [-1.39164684 -1.39168474 -1.39163048 -1.39161845]
- (150000, 20) -1.3918681632940457 [-1.39225561 -1.39174002 -1.39175899 -1.39171803]
- (150000, 30) -1.3920208915545458 [-1.39245037 -1.39162586 -1.39217227 -1.39183507]
- (150000, 40) -1.3923408683722063 [-1.3926594  -1.39197569 -1.39240877 -1.3923196 ]
- (150000, 10) -1.391614548388076 [-1.39169175 -1.39157229 -1.39161577 -1.39157839]
- (150000, 20) -1.391951600203473 [-1.39185483 -1.39186362 -1.39213477 -1.39195318]
- (150000, 30) -1.3921633738912433 [-1.39196909 -1.39223055 -1.39240663 -1.39204723]
- (150000, 40) -1.3923058192577034 [-1.39217331 -1.3922105  -1.39240873 -1.39243073]
- (150000, 10) -1.3916098537362616 [-1.3917189  -1.39165303 -1.39160227 -1.39146521]
- (150000, 20) -1.39178103301549 [-1.39179396 -1.39190022 -1.39157325 -1.3918567 ]
- (150000, 30) -1.3920578542935227 [-1.39231264 -1.39207574 -1.39186597 -1.39197707]
- (150000, 40) -1.3922848321587575 [-1.39246096 -1.39241334 -1.39223379 -1.39203124]


- (150000, 50) -1.3924210514384463 [-1.39207312 -1.39240334 -1.39223486 -1.39297289]
- (150000, 100) -1.3935169715879145 [-1.39254709 -1.39413538 -1.39358272 -1.39380271]
- (150000, 150) -1.394847077617179 [-1.39387935 -1.39477794 -1.39565039 -1.39508064]
- (150000, 200) -1.395884411986339 [-1.39494597 -1.39543975 -1.3966379  -1.39651403]
- (150000, 50) -1.3925069257805474 [-1.39260477 -1.39231705 -1.39277892 -1.39232696]
- (150000, 100) -1.3936529431848728 [-1.39372767 -1.39348871 -1.39424204 -1.39315336]
- (150000, 150) -1.3949489965851951 [-1.39513132 -1.39437674 -1.39493779 -1.39535015]
- (150000, 200) -1.396087491730313 [-1.39578495 -1.39555048 -1.3970575  -1.39595704]
- (150000, 50) -1.3926409675906217 [-1.39229057 -1.39236649 -1.39292014 -1.39298666]
- (150000, 100) -1.3938629304827024 [-1.39357904 -1.39387585 -1.3941306  -1.39386623]
- (150000, 150) -1.3947230901513157 [-1.3941017  -1.39504711 -1.3948626  -1.39488096]
- (150000, 200) -1.3963756566913978 [-1.39671046 -1.39634822 -1.39572195 -1.396722  ]
- (150000, 50) -1.3925967476636045 [-1.39257193 -1.39300356 -1.39246118 -1.39235032]
- (150000, 100) -1.3936814159484527 [-1.39403541 -1.39382537 -1.39366851 -1.39319637]
- (150000, 150) -1.3946445923839317 [-1.39446867 -1.39488788 -1.39482072 -1.39440109]
- (150000, 200) -1.3960656868587407 [-1.39607291 -1.39606308 -1.39668642 -1.39544034]


- (150000, 300) -1.398504529845314 [-1.3978481  -1.39844453 -1.39833775 -1.39938775]
- (150000, 400) -1.4003019768144234 [-1.39918328 -1.40076047 -1.40065739 -1.40060676]
- (150000, 500) -1.4020359024873499 [-1.40122757 -1.40193524 -1.40182285 -1.40315795]
- (150000, 200) -1.3959654155881365 [-1.39570996 -1.39562539 -1.39574183 -1.39678448]
- (150000, 300) -1.398414326709888 [-1.39885853 -1.3974504  -1.39836608 -1.3989823 ]
- (150000, 400) -1.4006730529540263 [-1.4013576  -1.3994311  -1.40084217 -1.40106134]
- (150000, 500) -1.403109583599627 [-1.40330624 -1.40275183 -1.40310661 -1.40327366]
- (150000, 200) -1.3961965957137719 [-1.39518571 -1.39594697 -1.39670219 -1.39695151]
- (150000, 300) -1.3984154185080602 [-1.39780007 -1.39789277 -1.39879586 -1.39917298]
- (150000, 400) -1.4011140732528233 [-1.4000908  -1.40095234 -1.40068126 -1.40273189]
- (150000, 500) -1.4030048403564372 [-1.40271878 -1.4028182  -1.40325208 -1.4032303 ]

In [None]:
# Accumulators for the per-column Word2Vec feature frames (train / test); the
# embedding cells below append one frame per categorical column.
features_w2v, features_te_w2v = [], []

In [None]:
def oversample(x, n):
    """Turn each sequence into one long "sentence" made of n shuffled repetitions.

    For every sequence in ``x`` the tokens are shuffled n times in a row (each
    shuffle continues from the previous order) and the n orderings are concatenated.

    Parameters
    ----------
    x : iterable of sequences
        Token sequences, e.g. the per-client arrays of unique values.
    n : int
        Number of shuffled repetitions per sequence.

    Returns
    -------
    list of list
        One list per input sequence, of length ``n * len(sequence)``.

    Notes
    -----
    The shuffle runs on a private copy, so the caller's sequences keep their order
    (previously they were reordered in place). Uses the global ``random`` state;
    seed it beforehand for reproducible sentences.
    """
    lst = []
    for i in x:
        tokens = list(i)  # private copy: never reorder the caller's data
        tmp = []
        for _ in range(n):
            random.shuffle(tokens)
            tmp += tokens
        lst.append(tmp)
    return lst

In [None]:
# Build one token list per client from the unique PD_C values (as strings), then
# oversample each list into a long shuffled "sentence" for Word2Vec.
level = 'PD_C'
# Fix: sort=False keeps the clients in order of first appearance. The embedding cell
# labels these rows with train.CLNT_ID.unique() / test.CLNT_ID.unique(), which use that
# same order; the default sorted order of the stringified ids would not match it.
tr = list(train[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
te = list(test[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
sentences_tr = oversample(tr, 200)
sentences_te = oversample(te, 200)
print('Succeed in Making a Sentence')

In [None]:
# Train a skip-gram Word2Vec model on the train sentences and represent every client
# by the mean vector of its tokens.
max_features = 10
window_size = 3

w2v = word2vec.Word2Vec(sentences = sentences_tr,
                        vector_size = max_features,
                        window = window_size,
                        min_count =1,
                        sg = 1, workers=4)
print('Succeed in Word2Vec')

# The same averaging is applied to train and test; each result is appended to its list.
# NOTE(review): assumes tr / te are ordered like train.CLNT_ID.unique() /
# test.CLNT_ID.unique(), which are used as the row labels — confirm.
for groups, client_ids, sink in ((tr, train.CLNT_ID.unique(), features_w2v),
                                 (te, test.CLNT_ID.unique(), features_te_w2v)):
    w2v_lst = []
    for words in tqdm(groups):
        tmp = np.zeros(max_features)
        cnt = 0
        for word in words:
            try:
                tmp += w2v.wv[str(word)]
            except KeyError:
                # Token is not in the trained vocabulary: skip it. Only KeyError is
                # caught so that other failures and interrupts are not swallowed.
                continue
            cnt += 1
        if cnt:
            tmp /= cnt
        else:
            # No known token for this client: keep the row as NaN (what the implicit
            # 0/0 division produced before), now stated explicitly.
            tmp[:] = np.nan
        w2v_lst.append(tmp)
    sink.append(pd.DataFrame(np.array(w2v_lst), index=client_ids, columns=[f'{level}_w2v_{i}' for i in range(max_features)]))

In [None]:
# Build one token list per client from the unique PD_BRA_NM values (as strings), then
# oversample each list into a long shuffled "sentence" for Word2Vec.
level = 'PD_BRA_NM'
# Fix: sort=False keeps the clients in order of first appearance. The embedding cell
# labels these rows with train.CLNT_ID.unique() / test.CLNT_ID.unique(), which use that
# same order; the default sorted order of the stringified ids would not match it.
tr = list(train[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
te = list(test[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
sentences_tr = oversample(tr, 200)
sentences_te = oversample(te, 200)
print('Succeed in Making a Sentence')

In [None]:
# Train a skip-gram Word2Vec model on the train sentences and represent every client
# by the mean vector of its tokens.
max_features = 10
window_size = 3

w2v = word2vec.Word2Vec(sentences = sentences_tr,
                        vector_size = max_features,
                        window = window_size,
                        min_count =1,
                        sg = 1, workers=4)
print('Succeed in Word2Vec')

# The same averaging is applied to train and test; each result is appended to its list.
# NOTE(review): assumes tr / te are ordered like train.CLNT_ID.unique() /
# test.CLNT_ID.unique(), which are used as the row labels — confirm.
for groups, client_ids, sink in ((tr, train.CLNT_ID.unique(), features_w2v),
                                 (te, test.CLNT_ID.unique(), features_te_w2v)):
    w2v_lst = []
    for words in tqdm(groups):
        tmp = np.zeros(max_features)
        cnt = 0
        for word in words:
            try:
                tmp += w2v.wv[str(word)]
            except KeyError:
                # Token is not in the trained vocabulary: skip it. Only KeyError is
                # caught so that other failures and interrupts are not swallowed.
                continue
            cnt += 1
        if cnt:
            tmp /= cnt
        else:
            # No known token for this client: keep the row as NaN (what the implicit
            # 0/0 division produced before), now stated explicitly.
            tmp[:] = np.nan
        w2v_lst.append(tmp)
    sink.append(pd.DataFrame(np.array(w2v_lst), index=client_ids, columns=[f'{level}_w2v_{i}' for i in range(max_features)]))

In [None]:
# Build one token list per client from the unique KWD_NM values (as strings), then
# oversample each list into a long shuffled "sentence" for Word2Vec.
level = 'KWD_NM'
# Fix: sort=False keeps the clients in order of first appearance. The embedding cell
# labels these rows with train.CLNT_ID.unique() / test.CLNT_ID.unique(), which use that
# same order; the default sorted order of the stringified ids would not match it.
tr = list(train[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
te = list(test[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
sentences_tr = oversample(tr, 200)
sentences_te = oversample(te, 200)
print('Succeed in Making a Sentence')

In [None]:
# Train a skip-gram Word2Vec model on the train sentences and represent every client
# by the mean vector of its tokens.
max_features = 10
window_size = 3

w2v = word2vec.Word2Vec(sentences = sentences_tr,
                        vector_size = max_features,
                        window = window_size,
                        min_count =1,
                        sg = 1, workers=4)
print('Succeed in Word2Vec')

# The same averaging is applied to train and test; each result is appended to its list.
# NOTE(review): assumes tr / te are ordered like train.CLNT_ID.unique() /
# test.CLNT_ID.unique(), which are used as the row labels — confirm.
for groups, client_ids, sink in ((tr, train.CLNT_ID.unique(), features_w2v),
                                 (te, test.CLNT_ID.unique(), features_te_w2v)):
    w2v_lst = []
    for words in tqdm(groups):
        tmp = np.zeros(max_features)
        cnt = 0
        for word in words:
            try:
                tmp += w2v.wv[str(word)]
            except KeyError:
                # Token is not in the trained vocabulary: skip it. Only KeyError is
                # caught so that other failures and interrupts are not swallowed.
                continue
            cnt += 1
        if cnt:
            tmp /= cnt
        else:
            # No known token for this client: keep the row as NaN (what the implicit
            # 0/0 division produced before), now stated explicitly.
            tmp[:] = np.nan
        w2v_lst.append(tmp)
    sink.append(pd.DataFrame(np.array(w2v_lst), index=client_ids, columns=[f'{level}_w2v_{i}' for i in range(max_features)]))

In [None]:
# Build one token list per client from the unique CLAC3_NM values (as strings), then
# oversample each list into a long shuffled "sentence" for Word2Vec.
level = 'CLAC3_NM'
# Fix: sort=False keeps the clients in order of first appearance. The embedding cell
# labels these rows with train.CLNT_ID.unique() / test.CLNT_ID.unique(), which use that
# same order; the default sorted order of the stringified ids would not match it.
tr = list(train[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
te = list(test[['CLNT_ID',level]].astype('str').groupby('CLNT_ID', sort=False)[level].unique())
sentences_tr = oversample(tr, 200)
sentences_te = oversample(te, 200)
print('Succeed in Making a Sentence')

In [None]:
# Train a skip-gram Word2Vec model on the train sentences and represent every client
# by the mean vector of its tokens.
max_features = 10
window_size = 3

# Fix: this cell referenced `sentences_all`, which is never defined in the notebook
# (NameError on a fresh kernel). It now trains on sentences_tr like the other three
# embedding cells. NOTE(review): if a train+test corpus was intended, pass
# `sentences_tr + sentences_te` here instead.
w2v = word2vec.Word2Vec(sentences = sentences_tr,
                        vector_size = max_features,
                        window = window_size,
                        min_count =1,
                        sg = 1, workers=4)
print('Succeed in Word2Vec')

# The same averaging is applied to train and test; each result is appended to its list.
# NOTE(review): assumes tr / te are ordered like train.CLNT_ID.unique() /
# test.CLNT_ID.unique(), which are used as the row labels — confirm.
for groups, client_ids, sink in ((tr, train.CLNT_ID.unique(), features_w2v),
                                 (te, test.CLNT_ID.unique(), features_te_w2v)):
    w2v_lst = []
    for words in tqdm(groups):
        tmp = np.zeros(max_features)
        cnt = 0
        for word in words:
            try:
                tmp += w2v.wv[str(word)]
            except KeyError:
                # Token is not in the trained vocabulary: skip it. Only KeyError is
                # caught so that other failures and interrupts are not swallowed.
                continue
            cnt += 1
        if cnt:
            tmp /= cnt
        else:
            # No known token for this client: keep the row as NaN (what the implicit
            # 0/0 division produced before), now stated explicitly.
            tmp[:] = np.nan
        w2v_lst.append(tmp)
    sink.append(pd.DataFrame(np.array(w2v_lst), index=client_ids, columns=[f'{level}_w2v_{i}' for i in range(max_features)]))

In [None]:
# Total number of NaN entries in the first test embedding frame (features_te_w2v[0]).
features_te_w2v[0].isna().sum().sum()

In [None]:
# Load W2V feature files from Drive.
# NOTE(review): the same two paths are written by the last cell of this notebook, so
# this cell only works if those files already exist from an earlier run.
data_w2v = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_w2v.csv')
data_te_w2v = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_w2v.csv')

In [None]:
# Shape of the first freshly computed test embedding frame.
features_te_w2v[0].shape

In [None]:
# NaN count in the first column of the loaded test file.
data_te_w2v.iloc[:,0].isna().sum()

In [None]:
# Names of the loaded test-file columns that contain at least one NaN.
data_te_w2v.columns[data_te_w2v.isna().sum() != 0]

In [None]:
# Overwrite the last 10 columns of the loaded train file with features_w2v[0].
# NOTE(review): pandas may align a DataFrame value on index/column labels here —
# verify the result is not all-NaN. data_w2v is rebuilt from features_w2v by the
# concat cell that follows, so this assignment looks like leftover scratch work.
data_w2v.iloc[:,-10:] = features_w2v[0]

In [None]:
# Join the per-column Word2Vec frames side by side (train and test separately).
data_w2v, data_te_w2v = (
    pd.concat(frames, axis=1) for frames in (features_w2v, features_te_w2v)
)

In [None]:
# Write the Word2Vec feature tables to CSV. index=False means the frame index is not
# written to the file.
for frame, csv_path in (
    (data_w2v, '/content/drive/MyDrive/D&A_ML_Competition/data_w2v.csv'),
    (data_te_w2v, '/content/drive/MyDrive/D&A_ML_Competition/data_te_w2v.csv'),
):
    frame.to_csv(csv_path, index=False)