In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# import sklearn
# print("sklearn version: ", sklearn.__version__)
# assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
print("TF version: ", tf.__version__)
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# GPU test
print("GPU installed: ",tf.test.is_built_with_gpu_support())

# To prevent "CUDNN_STATUS_ALLOC_FAILED" error with GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)
    
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "cnn"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")    

In [1]:
import numpy as np
import pandas as pd
import os
import time
# import scoring as scoring
import pickle
import gzip
from pyarrow import csv
import csv
# train_path = "D:/2022AIComp_data/train.csv"
# test_path = "D:/2022AIComp_data/test.csv"

#---------------------- Load Train,Test DF
train_pd = pd.read_csv("D:/2022AIComp_data/train.csv")
test_pd = pd.read_csv("D:/2022AIComp_data/test.csv")

print("Train: %s, Test: %s" %(train_pd.shape, test_pd.shape))
train_pd.head()
print(train_pd.iloc[:,0].unique())
# test_pd.head()

def data_shape(data_li):
    for data in data_li:
        vnames = [name for name in globals() if globals()[name] is data]
        print(vnames[0],'.shape : ',data.shape)
        
        
X_train = np.array(train_pd.iloc[:,1:])
y_train = train_pd.iloc[:,0].replace(['out','in', 'normal', 'other', 'noise'],[0,1,2,3,4])
# y_train = np.argmax(np.array(pd.get_dummies(train_pd.iloc[:,0])),axis=1)

X_test = np.array(test_pd.iloc[:,1:])
# y_test = np.array(pd.get_dummies(test_pd.iloc[:,0]))


data_shape([X_train,y_train,X_test])
y_train.shape

Train: (33600, 514), Test: (7820, 514)
['out' 'in' 'normal' 'other' 'noise']
X_train .shape :  (33600, 513)
y_train .shape :  (33600,)
X_test .shape :  (7820, 513)


(33600,)

In [3]:
from sklearn.manifold import TSNE
X_embedded = TSNE(n_components=2).fit_transform(X_train)



In [24]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps = 2, min_samples = 8, metric = "euclidean")
dbscan_labels = dbscan.fit_predict(X_embedded)
# cluster label 추가


iris_df = pd.DataFrame(X_train)
iris_df["dbscan_cluster"] = dbscan_labels
iris_df["target"] = y_train
iris_df.groupby(["target", "dbscan_cluster"]).size()






target  dbscan_cluster
0       -1                 43
         0                 36
         1                 91
         2                464
         3                272
                         ... 
4        135                8
         136               71
         137               28
         138               11
         139               18
Length: 334, dtype: int64

In [None]:
from sklearn.decomposition import PCA

# pca로 피처 2개만 사용
pca = PCA(n_components=40, random_state=0)
pca_transformed = pca.fit_transform(X_train)

# 데이터 프레임에 주성분 추가
iris_df["ftr1"] = pca_transformed[:,0]
iris_df["ftr2"] = pca_transformed[:,1]

visualize_cluster_plot(dbscan, iris_df, "dbscan_cluster", iscenter=False)

In [19]:
predict

DBSCAN(eps=2.4, min_samples=100)

In [17]:
iris_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,505,506,507,508,509,510,511,512,dbscan_cluster,target
0,0,2,2,0,2,0,2,2,2,2,...,2,2,5,2,2,5,2,5,"DBSCAN(eps=2.4, min_samples=100)",0
1,0,0,0,3,0,3,0,0,0,0,...,3,3,3,3,3,3,3,6,"DBSCAN(eps=2.4, min_samples=100)",0
2,0,4,4,4,4,5,4,4,5,5,...,6,5,6,6,6,5,6,4,"DBSCAN(eps=2.4, min_samples=100)",0
3,0,6,5,5,6,5,6,6,5,5,...,7,7,5,6,5,5,7,7,"DBSCAN(eps=2.4, min_samples=100)",0
4,0,3,0,0,3,0,0,3,3,3,...,3,3,3,3,3,3,3,3,"DBSCAN(eps=2.4, min_samples=100)",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33595,0,5,6,6,23,15,7,10,15,16,...,7,7,7,7,6,7,6,7,"DBSCAN(eps=2.4, min_samples=100)",4
33596,0,3,3,3,2,3,3,3,3,7,...,5,5,5,3,3,3,3,3,"DBSCAN(eps=2.4, min_samples=100)",4
33597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,3,0,0,3,"DBSCAN(eps=2.4, min_samples=100)",4
33598,0,8,7,7,5,8,8,8,8,7,...,8,8,10,10,7,7,7,5,"DBSCAN(eps=2.4, min_samples=100)",4


In [14]:
def visualize_cluster_plot(clusterobj, dataframe, label_name, iscenter=True):
    
    # 군집별 중심 위치: K-Means, Mean Shift 등
    if iscenter:
        centers = clusterobj.cluster_centers_
    
    # Cluster 값 종류
    unique_labels = np.unique(dataframe[label_name].values)
    
    markers=['o', 's', '^', 'x', '*']
    isNoise=False

    for label in unique_labels:
        # 군집별 데이터 프레임
        label_cluster = dataframe[dataframe[label_name]==label]
        
        if label == -1:
            cluster_legend = 'Noise'
            isNoise=True
        else:
            cluster_legend = 'Cluster '+str(label)
        
        # 각 군집 시각화
        plt.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'], s=70,
                    edgecolor='k', marker=markers[label], label=cluster_legend)
        
        # 군집별 중심 위치 시각화
        if iscenter:
            center_x_y = centers[label]
            plt.scatter(x=center_x_y[0], y=center_x_y[1], s=250, color='white',
                        alpha=0.9, edgecolor='k', marker=markers[label])
            plt.scatter(x=center_x_y[0], y=center_x_y[1], s=70, color='k',\
                        edgecolor='k', marker='$%d$' % label)
            
    if isNoise:
        legend_loc='upper center'
    else: 
        legend_loc='upper right'
    
    plt.legend(loc=legend_loc)

In [4]:
import pandas
df["x_component"]=X_embedded[:,0] 
df["y_component"]=X_embedded[:,1]

import plotly.express as px
    
fig = px.scatter(df, x="x_component", y="y_component", hover_name="Country Year", color = "labels", size_max=60)
fig.update_layout(
     height=800)
fig.show()


NameError: name 'df' is not defined

## XGBOOST

In [None]:


# 넘파이 형태의 학습 데이터 세트와 테스트 데이터를 DMatrix로 변환하는 예제
dtrain = xgb.DMatrix(data=X_train, label = y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)