# Spreadsheet
## Dask

In [1]:
# import numpy as np
# import pandas as pd

In [2]:
import dask.dataframe as dd

# import dask.array as da
# import dask.bag as db

## 데이터

In [3]:
from pathlib import Path

project_dir = Path().resolve().parent  # spreadsheet
data_dir = project_dir.joinpath("data")  # spreadsheet/data

In [4]:
csv_files = {
    "fruits": str(data_dir.joinpath("fruits.csv")),
    "sejongpac_performs": str(data_dir.joinpath("sejongpac_performs.csv")),
}

### 과일

In [5]:
df = dd.read_csv(csv_files["fruits"])

In [6]:
df.head()

Unnamed: 0,id,name,price,count
0,1,apple,3.0,15
1,2,banana,2.7,8
2,3,coconut,9.1,12
3,4,dragonfruit,5.0,2
4,5,eggplant,2.4,13


In [7]:
df.dtypes

id         int64
name      object
price    float64
count      int64
dtype: object

In [8]:
df.describe()

Unnamed: 0_level_0,id,price,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,float64
,...,...,...


### 서울시 세종문화회관 공연 및 전시 정보

In [9]:
df = dd.read_csv(
    csv_files["sejongpac_performs"],
    dtype={
        "공연시작일": "float64",
        "공연종료일": "float64",
        "썸네일이미지": "object",
        "출연진 소개": "object",
        "프로그램 정보": "object",
    },
    blocksize=1e6,
)

In [10]:
df.head(2)

Unnamed: 0,공연코드,공연명,공연시작일,공연종료일,장소,시간,연령,기획사,문의전화,티켓정보,할인정보,유료할인율,장르,게시여부,게시여부명,썸네일이미지,공연개요,프로그램 정보,출연진 소개
0,b1dt2206091514a01,(TEST) 전시연동테스트 공연_세종미술관,20250601.0,20250630.0,세종미술관 1관,,,,,,,,기획전시,Y,공개,,,,
1,dt2206091515a01,(TEST) 전시연동테스트 공연_상상톡톡미술관,20250601.0,20250630.0,꿈의숲 상상톡톡미술관,,,,,,,,기획전시,Y,공개,,,,


In [11]:
len(df)

1799

#### 컬럼

In [12]:
df.dtypes

공연코드        object
공연명         object
공연시작일      float64
공연종료일      float64
장소          object
시간          object
연령          object
기획사         object
문의전화        object
티켓정보        object
할인정보        object
유료할인율      float64
장르          object
게시여부        object
게시여부명       object
썸네일이미지      object
공연개요        object
프로그램 정보     object
출연진 소개      object
dtype: object

In [13]:
df[["공연코드", "공연시작일"]].compute()

Unnamed: 0,공연코드,공연시작일
0,b1dt2206091514a01,20250601.0
1,dt2206091515a01,20250601.0
2,chpi2204141747a01,20221126.0
3,chpi2204141743a01,20220917.0
4,chpi2205022005a01,20220917.0
...,...,...
221,chpi2011051510001,20210103.0
222,chpi2011271017001,20210102.0
223,pi2012211006001,20210102.0
224,pi2012221340001,20210102.0


## CSV 처리 클래스

In [189]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List

import dask.dataframe as dd


@dataclass
class Spreadsheet:
    src: str
    dest: Optional[str] = None
    length: Optional[int] = None

    def __post_init__(self):
        self.df = dd.read_csv(self.src)
        self.length = len(s.raw().index)

    def raw(self):
        return self.df.copy()

    def head(self):
        return self.df.head()

    def columns(self):
        return self.df.columns  # Index([columns...], dtype='object')

    def loc(self, rows=slice(0, None, 1), columns=None):
        return self.df.loc[rows, columns]

    def iloc(self, columns=slice(0, None, 1)):
        return self.df.iloc[:, columns]

In [190]:
s = Spreadsheet(src=csv_files["fruits"])

In [197]:
s.raw().loc[slice(4, 9), ['name', 'count']].compute()

Unnamed: 0,name,count
4,eggplant,13
5,fig,39
6,grapes,21
7,huckleberry,4
8,mango,1
9,orange,16


In [199]:
s.raw().iloc[:, slice(1, 3)].compute()

Unnamed: 0,name,price
0,apple,3.0
1,banana,2.7
2,coconut,9.1
3,dragonfruit,5.0
4,eggplant,2.4
5,fig,11.3
6,grapes,6.7
7,huckleberry,7.1
8,mango,5.4
9,orange,4.8


In [108]:
s.raw().loc[1:4].compute()

Unnamed: 0,id,name,price,count
1,2,banana,2.7,8
2,3,coconut,9.1,12
3,4,dragonfruit,5.0,2
4,5,eggplant,2.4,13


In [177]:
s.iloc().head()

Unnamed: 0,id,name,price,count
0,1,apple,3.0,15
1,2,banana,2.7,8
2,3,coconut,9.1,12
3,4,dragonfruit,5.0,2
4,5,eggplant,2.4,13


In [250]:
s.raw().info(verbose=True, memory_usage=True)

<class 'dask.dataframe.core.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      15 non-null      int64
 1   name    15 non-null      object
 2   price   15 non-null      float64
 3   count   15 non-null      int64
dtypes: object(1), float64(1), int64(2)
memory usage: 608.0 bytes


In [143]:
s.columns()

Index(['id', 'name', 'price', 'count'], dtype='object')