In [1]:
from pyspark.sql import SparkSession

# MySQL JDBC 드라이버 경로
mysql_driver_path = "/home/ubuntu/mysql-connector-j-9.2.0/mysql-connector-j-9.2.0.jar"

# SparkSession 생성
spark = SparkSession.builder \
    .appName("model_test") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.jars", mysql_driver_path) \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

In [2]:
# MySQL 연결 정보
mysql_url = "jdbc:mysql://15.168.145.74:3306/my_db?useUnicode=true&characterEncoding=UTF-8"
mysql_properties = {
    "user": "root",
    "password": "root",
    "driver": "com.mysql.cj.jdbc.Driver"
}


In [3]:
# MySQL 테이블 불러오기
df = spark.read.jdbc(url=mysql_url, table="merge_car_horn", properties=mysql_properties)

# 데이터 확인
df.show(5)
df.printSchema()

+--------------------+--------------------+----------+----------+----------+----------+----------+-----------+----------+----------+-----------+-----------+-----------+-----------+-----------+----------+--------+-----------+-----------+-----------+-------+--------------------+------------+-----------+--------+--------+----------+--------+------------+----------+-------------+---------+-------+------------+--------+---------+--------+--------+--------+---------+--------+--------+------+-------------+----------+-------+
|        fileName_wav|       fileName_json|mfcc_1_wav|mfcc_2_wav|mfcc_3_wav|mfcc_4_wav|mfcc_5_wav| mfcc_6_wav|mfcc_7_wav|mfcc_8_wav| mfcc_9_wav|mfcc_10_wav|mfcc_11_wav|mfcc_12_wav|mfcc_13_wav|area_start|area_end|category_01|category_02|category_03|decibel|           labelName|soundQuality|subCategory| bitRate|duration|fileFormat|fileSize|recodingType|sampleRate|    acqDevice|acqMethod|acqType|     areaUse|dayNight|direction|distance|district|latitude|longitude|micClass|ob

In [4]:
display(df.toPandas())

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,fileName_wav,fileName_json,mfcc_1_wav,mfcc_2_wav,mfcc_3_wav,mfcc_4_wav,mfcc_5_wav,mfcc_6_wav,mfcc_7_wav,mfcc_8_wav,...,distance,district,latitude,longitude,micClass,obstacle,place,recordingTime,urban,weather
0,1.car_horn_30849.wav,1.car_horn_30849.wav,-328.99643,110.927765,13.474552,37.183453,-10.836115,25.585962,0.232516,5.053774,...,10m,영등포동,37.51,126.91,무지향성,없음,도로변,05:15,서울특별시,흐림
1,1.car_horn_30865.wav,1.car_horn_30865.wav,-380.94937,82.016785,-38.369390,10.472818,-25.531940,-0.819976,-21.547447,16.138535,...,10m,영등포동,37.51,126.91,무지향성,없음,도로변,03:39,서울특별시,흐림
2,1.car_horn_30869.wav,1.car_horn_30869.wav,-498.20813,114.061424,1.980113,-2.516794,-12.093688,21.846487,10.806868,14.971516,...,10m,영등포동,37.51,126.91,무지향성,없음,도로변,05:51,서울특별시,흐림
3,1.car_horn_30874.wav,1.car_horn_30874.wav,-456.39767,111.490440,-6.244432,18.060278,-17.298338,28.503597,14.064431,7.452422,...,10m,영등포동,37.51,126.91,무지향성,없음,도로변,06:01,서울특별시,흐림
4,1.car_horn_59.wav,1.car_horn_59.wav,-199.50888,199.641110,-13.681638,21.661055,-9.801945,12.513287,2.843365,8.194932,...,20m,한남동,37.53,127.01,무지향성,없음,도로변,11:58,서울특별시,맑음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,1.car_horn_82.wav,1.car_horn_82.wav,-296.59660,201.823230,-41.995213,38.293407,15.336274,8.951137,2.428992,0.401730,...,10m,혜화동,37.50,127.02,무지향성,없음,도로변,10:32,서울특별시,맑음
1933,1.car_horn_85.wav,1.car_horn_85.wav,-264.83923,197.210750,-77.277880,23.032236,11.380154,7.896886,11.269576,-4.560492,...,10m,혜화동,37.50,127.02,무지향성,없음,도로변,10:31,서울특별시,맑음
1934,1.car_horn_87561.wav,1.car_horn_87561.wav,-228.09323,64.343980,-13.663012,41.246460,-7.827433,21.293640,-13.415158,3.358162,...,1m,은현면,37.86,126.02,무지향성,없음,도로변,17:27,양주시,맑음
1935,1.car_horn_88376.wav,1.car_horn_88376.wav,-426.90427,130.722400,47.745327,8.306962,5.675391,15.449170,5.677291,6.208929,...,1m,은현면,37.86,126.02,무지향성,없음,도로변,11:09,양주시,맑음


## 데이터 전처리

In [5]:
from pyspark.sql.functions import col
import pandas as pd

# 필요한 MFCC 컬럼을 선택하고, 타겟 컬럼을 'micClass'로 설정
features = ['mfcc_1_wav', 'mfcc_2_wav', 'mfcc_3_wav', 'mfcc_4_wav', 'mfcc_5_wav', 'mfcc_6_wav', 'mfcc_7_wav', 'mfcc_8_wav']
target = 'micClass'

# PySpark DataFrame에서 Pandas DataFrame으로 변환
df_pandas = df.select(features + [target]).toPandas()

# 결측치 제거
df_pandas = df_pandas.dropna()

# X (특징)와 y (타겟)으로 분리
X = df_pandas[features].values
y = df_pandas[target].values

# 타겟값을 숫자형으로 변환 (라벨 인코딩)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# 데이터를 훈련 세트와 테스트 세트로 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


## CNN 모델 정의

In [11]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.6.2-cp36-cp36m-manylinux2010_x86_64.whl (458.3 MB)
     |██████████████████████████████▌ | 437.0 MB 76.3 MB/s eta 0:00:01 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



     |████████████████████████████████| 458.3 MB 15 kB/s               
Collecting protobuf>=3.9.2
  Downloading protobuf-3.19.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
     |████████████████████████████████| 1.1 MB 103.6 MB/s            
[?25hCollecting grpcio<2.0,>=1.37.0
  Downloading grpcio-1.48.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
     |████████████████████████████████| 4.6 MB 27.1 MB/s            
[?25hCollecting termcolor~=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting clang~=5.0
  Downloading clang-5.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting h5py~=3.1.0
  Downloading h5py-3.1.0-cp36-cp36m-manylinux1_x86_64.whl (4.0 MB)
     |████████████████████████████████| 4.0 MB 79.9 MB/s            
[?25hCollecting opt-einsum~=3.3.0
  D

      Successfully uninstalled six-1.17.0
Successfully installed absl-py-0.15.0 astunparse-1.6.3 cached-property-1.5.2 cachetools-4.2.4 clang-5.0 flatbuffers-1.12 gast-0.4.0 google-auth-1.35.0 google-auth-oauthlib-0.4.6 google-pasta-0.2.0 grpcio-1.48.2 h5py-3.1.0 keras-2.6.0 keras-preprocessing-1.1.2 markdown-3.3.7 oauthlib-3.2.2 opt-einsum-3.3.0 protobuf-3.19.6 pyasn1-0.5.1 pyasn1-modules-0.3.0 requests-oauthlib-2.0.0 rsa-4.9 six-1.15.0 tensorboard-2.6.0 tensorboard-data-server-0.6.1 tensorboard-plugin-wit-1.8.1 tensorflow-2.6.2 tensorflow-estimator-2.6.0 termcolor-1.1.0 typing-extensions-3.7.4.3 werkzeug-2.0.3 wrapt-1.12.1
Note: you may need to restart the kernel to use updated packages.


In [24]:
import tensorflow as tf
print(tf.__version__)

2.6.2


In [25]:
print(tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [17]:
import tensorflow as tf
from tensorflow.keras import layers, models

# CNN 모델 구축
model = models.Sequential([
    # 첫 번째 Conv2D 레이어 (이미지의 특징 추출)
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    
    # 두 번째 Conv2D 레이어 (더 복잡한 특징 추출)
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    # 세 번째 Conv2D 레이어
    layers.Conv2D(64, (3, 3), activation='relu'),
    
    # 전역 평탄화 (flatten)
    layers.Flatten(),
    
    # 완전 연결층 (Dense layer)
    layers.Dense(64, activation='relu'),
    
    # 출력층 (출력: 10개의 클래스)
    layers.Dense(10, activation='softmax')
])

# 모델 요약
model.summary()

# 모델 컴파일
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 576)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)               

In [18]:
# MNIST 데이터셋 로드
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [19]:
# 데이터 전처리: 크기 변경 및 정규화
train_images = train_images.reshape((train_images.shape[0], 28, 28, 1)).astype('float32') / 255
test_images = test_images.reshape((test_images.shape[0], 28, 28, 1)).astype('float32') / 255

In [29]:
# 모델 학습
history = model.fit(train_images, train_labels, epochs=10, batch_size=8, validation_data=(test_images, test_labels))

Epoch 1/10


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node sequential_1/conv2d/Conv2D (defined at <ipython-input-28-2781c0f43632>:2) ]] [Op:__inference_train_function_2306]

Function call stack:
train_function


In [26]:
import tensorflow as tf
print("사용 가능한 GPU 수: ", len(tf.config.experimental.list_physical_devices('GPU')))


사용 가능한 GPU 수:  1


In [27]:
from tensorflow.keras import backend as K
K.clear_session()