# 데이터 품질 테스트

## 0. Spark Session 생성

In [1]:
pip install pyspark==3.1.2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark
print(pyspark.__version__)

3.1.2


In [3]:
from pyspark.sql import SparkSession

# SparkSession 생성
spark = SparkSession.builder \
    .appName("HDFS File Reading") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .getOrCreate()

25/02/13 17:04:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [18]:
spark.stop()

## 1. 데이터 분포 확인

In [12]:
# HDFS에서 JSON 파일 읽기
file_path = "hdfs://localhost:9000/shared_data/label_data/1.Car/1.horn_of_car"

In [13]:
file_path

'hdfs://localhost:9000/shared_data/label_data/1.Car/1.horn_of_car'

In [14]:
df = spark.read.json(file_path, multiLine= True)

                                                                                

In [15]:
df.printSchema()

root
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- area: struct (nullable = true)
 |    |    |    |-- end: double (nullable = true)
 |    |    |    |-- start: double (nullable = true)
 |    |    |-- categories: struct (nullable = true)
 |    |    |    |-- category_01: string (nullable = true)
 |    |    |    |-- category_02: string (nullable = true)
 |    |    |    |-- category_03: string (nullable = true)
 |    |    |-- decibel: long (nullable = true)
 |    |    |-- labelName: string (nullable = true)
 |    |    |-- soundQuality: string (nullable = true)
 |    |    |-- subCategory: string (nullable = true)
 |-- audio: struct (nullable = true)
 |    |-- bitRate: string (nullable = true)
 |    |-- duration: double (nullable = true)
 |    |-- fileFormat: string (nullable = true)
 |    |-- fileName: string (nullable = true)
 |    |-- fileSize: long (nullable = true)
 |    |-- recodingType: string (nullable = true)
 |    |-- sample

In [16]:
df.show()

+---------------------+--------------------+-----------------------------+--------------------+--------------------+
|          annotations|               audio|                  environment|                info|             license|
+---------------------+--------------------+-----------------------------+--------------------+--------------------+
| [{{10.55, 9.99}, ...|{705kbps, 11.584,...|         {iPHONE 12 mini, ...|{IMR, 2021-09-20,...|{CC 0, https://ww...|
| [{{16.62, 16.05},...|{705kbps, 22.4, w...|         {iPHONE 12 mini, ...|{IMR, 2021-09-20,...|{CC 0, https://ww...|
| [{{7.401, 2.0}, {...|{1411kbps, 9.401,...|  {갤럭시S6Tab, 자연적, 제...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
| [{{4.13, 2.68}, {...|{1411kbps, 7.178,...|  {갤럭시S6Tab, 자연적, 제...|{IMR, 2021-09-04,...|{CC 0, https://ww...|
| [{{12.944, 2.0}, ...|{705kbps, 14.944,...|        {TASCAM DR-05X, 자...|{IMR, 2021-10-26,...|{CC 0, https://ww...|
| [{{5.48, 3.11}, {...|{1411kbps, 8.733,...|{갤럭시탭S6, 자연적, 제작,...|{IMR, 2021-09-

In [5]:
import os

# HDFS 경로 설정
hdfs_base_path = "hdfs://localhost:9000/shared_data"

# 폴더 내 모든 JSON 파일 경로 수집 함수 정의
def get_all_json_files(folder_path):
    json_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".json"):
                json_files.append(os.path.join(root, file))
    return json_files

# 주요 폴더 경로 설정
folder_paths = [
    f"{hdfs_base_path}/label_data",
    f"{hdfs_base_path}/raw_data",
    f"{hdfs_base_path}/test_data"
]

# 모든 JSON 파일 경로 수집
json_files = []
for folder_path in folder_paths:
    json_files.extend(get_all_json_files(folder_path))

In [6]:
# JSON 파일을 읽어와 데이터프레임으로 결합
df_union = None
for path in json_files:
    try:
        df_temp = spark.read.json(path, multiLine=True)
        if df_union is None:
            df_union = df_temp
        else:
            df_union = df_union.union(df_temp)
    except Exception as e:
        print(f"Error reading path {path}: {e}")

In [7]:
# 데이터프레임 내용 확인
if df_union:
    df_union.show()
    
    # Pandas 데이터프레임으로 변환
    pandas_df = df_union.toPandas()
else:
    print("No JSON files found or read.")

No JSON files found or read.
