# 1. pickle 모듈 정리

* 파이썬 객체를 저장하고 불러올 수 있게 만들어 주는 모듈
    * 리스트, 딕셔너리, 클래스, 함수 등등

* 객체를 binary 형태(바이트 스트림)로 직렬화해서 저장함 (pickle 자체가 압축을 해주는 것은 아님)
    * binary 형태로 저장하는 다른 방법도 있지만 pickle 모듈이 가장 유용함 (용량을 줄이려면 아래의 gzip 같은 압축 모듈과 함께 사용)

data를 pickle을 이용해 저장

In [None]:
import pickle

data = {
    'a': [1, 2.0, 3, 4+6j],
    'b': ("character string", b"byte string"),
    'c': {None, True, False}
}

# Save: serialize `data` into a file opened in binary-write mode.
with open('data.pickle', 'wb') as f:
    # HIGHEST_PROTOCOL selects the newest (most efficient) pickle format this
    # Python version supports. It is a format-version choice, not a memory
    # workaround; files written with it may not load on older Python versions.
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

# Load: read the object back from the same file.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('data.pickle', 'rb') as f:
    data_res = pickle.load(f)

In [None]:
data_res

{'a': [1, 2.0, 3, (4+6j)],
 'b': ('character string', b'byte string'),
 'c': {False, None, True}}

gzip 이용

* 파일을 압축할때 사용하는 모듈

In [None]:
import gzip
import pickle

data = dict(
    a=[1, 4+6j],
    b=("character string", b"byte string"),
    c={None, True, False},
)

# Write the pickle through a gzip stream so it is compressed on disk.
with gzip.open('testPickleFile.pickle', 'wb') as gz_out:
    pickle.dump(data, gz_out)

# Read it back through gzip, which decompresses transparently.
with gzip.open('testPickleFile.pickle', 'rb') as gz_in:
    data_res = pickle.load(gz_in)

In [None]:
data_res

{'a': [1, (4+6j)],
 'b': ('character string', b'byte string'),
 'c': {False, None, True}}

# 2. 시간 측정 함수

1. time모듈

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time. time.time() is wall-clock time and can jump when the system
# clock is adjusted, which would corrupt the measurement.
start = time.perf_counter()  # start of the measured section

time.sleep(1)  # stand-in for the work being timed: wait one second

end = time.perf_counter()  # end of the measured section

print(f"{end-start:.4f} sec")  # elapsed seconds, four decimal places

1.0014 sec


2. datetime모듈을 활용하여 좀더 자세한 시간 측정

In [None]:
import datetime
import time

# perf_counter() is monotonic and high-resolution, so it is the right clock
# for elapsed time (time.time() can jump if the system clock changes).
start = time.perf_counter()
time.sleep(1)
sec = time.perf_counter() - start  # elapsed seconds as a float

# str(timedelta) renders the duration as 'H:MM:SS.ffffff'.
times = str(datetime.timedelta(seconds=sec))
short = times.split(".")[0]  # drop the fractional part, keep whole seconds
print(f"{times} sec")
print(f"{short} sec")

0:00:01.001486 sec
0:00:01 sec


# 예외 처리

try-except문

```
try:  
    ...   
except [발생오류 [as 오류변수]]:  
    ...  
```

In [None]:
try:
    4 / 0
except ZeroDivisionError as err:  # a misspelled exception name here would itself raise an error
    print(err)

division by zero


In [None]:
# A single try can be followed by several except clauses.

try:
    a = [1, 2]
    print(a[3])  # IndexError is raised here, so the division below never runs
    4 / 0
except IndexError as err:
    print(err)
except ZeroDivisionError as err:
    print(err)

list index out of range


try-finally

* finally절은 try문 수행 도중 예외 발생 여부에 상관없이 항상 실행됨

In [None]:
# Open the file BEFORE entering `try`: if open() itself fails there is no
# handle to close, and touching `f` inside `finally` would raise NameError
# and hide the real error.
f = open('foo.txt', 'w')
try:
    # Do something with the file.

    # (... omitted ...)
    pass
finally:
    f.close()  # always runs, even if the try body raised


try-else문

```
try:
    ...
except [발생오류 [as 오류변수]]:
    ...
else:  # 오류가 없을 경우에만 수행
    ...
```

In [None]:
try:
    age = int(input('나이를 입력하세요: '))
except ValueError:
    # int() raises ValueError for text that is not a whole number. Catching
    # only that (instead of a bare `except:`) keeps KeyboardInterrupt and
    # unrelated bugs from being silently reported as "bad input".
    print('입력이 정확하지 않습니다.')
else:
    # Runs only when the conversion succeeded.
    if age <= 18:
        print('미성년자는 출입금지입니다.')
    else:
        print('환영합니다.')


나이를 입력하세요: 10
미성년자는 출입금지입니다.


오류 회피하기 : pass이용

In [None]:
try:
    f = open("나없는파일", 'r')
except FileNotFoundError:
    pass  # deliberately ignore a missing file (the point of this example)
else:
    f.close()  # if the file does exist, do not leave the handle open

traceback모듈을 이용해 어떤 오류인지 확인가능

In [None]:
import traceback

try:
    4/0
except Exception:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed. format_exc() returns the full traceback
    # of the exception being handled as a string.
    tb_text = traceback.format_exc()
    print(tb_text)

Traceback (most recent call last):
  File "<ipython-input-21-bb9c303fbce5>", line 4, in <cell line: 3>
    4/0
ZeroDivisionError: division by zero



# numpy 모듈 정리

In [2]:
import numpy as np

## 1. np.unique

* 배열내의 원소들중 고유한 값을 뽑아준다

|parameter|설명|값|
|:-:|-|-|
|return_index|각 고유 원소가 처음 등장한 index 반환|default = False / True, False|
|return_inverse|고유값 배열로 원래 배열을 복원할 수 있는 index 반환 (하단 예시 참조)|default = False / True, False|
|return_counts|각 고유 원소들이 등장하는 횟수 반환|default = False / True, False|

In [None]:
np.unique([1,2,2,3,4,5,5,3,3,2,2,1])

array([1, 2, 3, 4, 5])

In [None]:
np.unique([[1,2,4],
           [1,2,5]]) # without `axis`, the input is flattened to 1-D before the unique values are returned

array([1, 2, 4, 5])

In [None]:
np.unique([[1,2,4,1],
           [1,2,5,1],
           [1,2,5,1]], axis = 0)

# With axis=0 each row is compared as a whole, so the unique rows are returned.
# The row [1,2,5,1] appears twice, so only one copy of it is kept.

array([[1, 2, 4, 1],
       [1, 2, 5, 1]])

In [None]:
np.unique([[1,2,4,1],
           [1,2,5,1],
           [1,2,5,1]], axis = 1)
# Same idea along axis=1: each column is compared as a whole.
# The column [1,1,1] appears twice (first and last), so only one copy of it is kept.

array([[1, 2, 4],
       [1, 2, 5],
       [1, 2, 5]])

return_inverse

In [None]:
# return_inverse=True also returns, for every input value, its position in the unique array:
# the unique values 3, 4, 5, 8 are numbered 0, 1, 2, 3, and the second array lists those numbers in input order.

np.unique([3, 5, 8, 8, 5, 5, 4], return_inverse = True)

(array([3, 4, 5, 8]), array([0, 2, 3, 3, 2, 2, 1]))

## np.nditer

* numpy객체 반복에 사용한다

In [None]:
arr = np.array([[1, 2, 3],
                [3, 4, 5]])

n_rows, n_cols = arr.shape
for row in range(n_rows):
    for col in range(n_cols):
        print(arr[row, col])

# One loop per axis is fine in 2-D, but it gets unwieldy for higher-dimensional arrays.

1
2
3
3
4
5


In [None]:
arr = np.array([[1,2,3],
                [4,5,6]])

# 'multi_index' makes the iterator track the N-dimensional index of the
# current element. op_flags is left at its default (read-only) because this
# loop only reads; op_flags=['readwrite'] is needed only when elements are
# assigned through the iterator.
it = np.nditer(arr, flags=['multi_index'])

while not it.finished:
    idx = it.multi_index  # tuple index of the current element, e.g. (0, 2)
    print(arr[idx])
    it.iternext()         # advance to the next element

1
2
3
4
5
6


In [None]:
# 1-D case: 'c_index' tracks a flat (C-order) index instead of a tuple.

arr = np.arange(10)
it = np.nditer(arr, flags=['c_index'])

has_more = not it.finished
while has_more:
    idx = it.index
    print(arr[idx])
    has_more = it.iternext()  # iternext() reports whether elements remain

0
1
2
3
4
5
6
7
8
9


In [6]:
np.argmax(np.arange(10) >= 7)

7

## random모듈

### np.random.rand

* np.random.rand(d0, d1, d2, ...)
* d0, d1, ... : 각 차원의 크기 (결과 배열의 shape)
* [0, 1) 구간의 균일분포에서 추출

In [None]:
import numpy as np

np.random.rand(2, 1,2)

array([[[0.62476487, 0.36116332]],

       [[0.54006562, 0.34187394]]])

### np.random.choice

* np.random.choice(a, size, replace, p)

* a : 뽑을 대상, 정수로 주어질 경우 0 ~ a-1 중에서 추출
* size : 뽑을 개수 (또는 결과 배열의 shape)
* replace : 복원추출 여부 (기본값 True)
* p : 각 원소가 뽑힐 확률을 다르게 줄 수 있음, 기본은 균일확률

In [None]:
np.random.choice(100, 10)

array([59, 19, 50, 39, 81, 42, 82,  7, 63, 41])

### np.random.permutation

* np.random.permutation(x)  
* x : int or array_like,  
If x is an integer, randomly permute np.arange(x). If x is an array, make a copy and shuffle the elements randomly.

In [3]:
# integer case

x = np.random.permutation(10)
print(x)

[4 8 5 6 2 9 7 1 0 3]


In [4]:
# array case

x = np.random.permutation(np.array([1,2,3,4,5]))
print(x)

[2 4 1 5 3]


# 데이터 처리 함수 정리

In [None]:
import pandas as pd

In [None]:
import seaborn as sns

In [None]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [None]:
df = sns.load_dataset('penguins')

## 컬럼 삭제

* df.drop(['컬럼명1', '컬럼명2'], axis = 1, inplace = False)
* df.drop(['컬럼명1', '컬럼명2'], axis = 'columns', inplace = False)


In [None]:
df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


In [None]:
df.drop(['species'], axis = 1)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Torgersen,,,,,
4,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...
339,Biscoe,,,,,
340,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Biscoe,45.2,14.8,212.0,5200.0,Female


## 컬럼추가

* df['추가할 컬럼'] = data
* df.assign(컬럼명1 = data1, 컬럼명2 = data2)
* df.insert(loc = 위치, column = 컬럼명, value = data, allow_duplicates=False)
    * allow_duplicates : 추가하는 컬럼명과 기존의 컬럼명이 동일할때 True를 사용하면 오류없이 추가 가능

In [None]:
# 1

df['a'] = 1
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1
3,Adelie,Torgersen,,,,,,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1


In [None]:
# 2

df.assign(b = 1, c = 1)
df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1


In [None]:
df.assign(b=1, c=2, inplace = True) # assign() has no `inplace` option: every keyword becomes a new column, so this adds a column literally named 'inplace'; keep the result by assigning the returned DataFrame to a variable

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a,b,c,inplace
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1,2,True
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,1,2,True
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1,1,2,True
3,Adelie,Torgersen,,,,,,1,1,2,True
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,1,2,True
...,...,...,...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,,1,1,2,True
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,1,1,2,True
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,1,1,2,True
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,1,1,2,True


In [None]:
df1 = df.assign(b=1, c=2)

In [None]:
df1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a,b,c
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1,2
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,1,2
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1,1,2
3,Adelie,Torgersen,,,,,,1,1,2
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,1,2


In [None]:
# lambda 사용

df1.assign(d = lambda x : x['a'] + x['b']).head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a,b,c,d
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1,2,2
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,1,2,2
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1,1,2,2


In [None]:
# 3


df1.insert(loc = 0, column = 'e', value = 1)

In [None]:
df1 #inplace나, 새로운 변수로 할당할필요없이 바로 바꿔줌

Unnamed: 0,e,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,a,b,c
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1,2
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,1,2
2,1,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1,1,2
3,1,Adelie,Torgersen,,,,,,1,1,2
4,1,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
339,1,Gentoo,Biscoe,,,,,,1,1,2
340,1,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,1,1,2
341,1,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,1,1,2
342,1,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,1,1,2


## 컬럼명 변경

* df.rename(columns = {'기존컬럼명1' : '바꿀컬럼명1',   '기존컬럼명2':'바꿀컬럼명2'})

* 바뀐 결과를 적용하려면 새로운 변수에 할당하거나 inplace = True 옵션 사용

In [None]:
df2 = df1.rename(columns = {'a' : 'aa',
                            'b' : 'bb'})
df2

Unnamed: 0,e,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,aa,bb,c
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1,2
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1,1,2
2,1,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1,1,2
3,1,Adelie,Torgersen,,,,,,1,1,2
4,1,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
339,1,Gentoo,Biscoe,,,,,,1,1,2
340,1,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,1,1,2
341,1,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,1,1,2
342,1,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,1,1,2


## 중복행 제거

* drop_duplicates

|parameter|설명|값|
|:-:|-|-|
|subset|중복값을 검사할 열, 기본적으로 모든 열을 검사|default = None|
|keep|중복 제거를 할때 남길 행, first면 첫값을 남기고, last면 마지막 값을 남김, False면 중복된 행을 모두 제거|default = 'first' / 'last', False|
|inplace|원본 변경 여부|default = False|
|ignore_index|원래 index를 무시할지 여부, True일 경우 새로운 인덱스가 0~n-1로 부여됨|default = False|

In [None]:
import pandas as pd

# Small frame with repeated rows, used to demonstrate drop_duplicates.
col = ['col1', 'col2', 'col3']
data = [
    ['A', 'x', '-'],
    ['A', 'x', '-'],
    ['B', 'x', '앞'],
    ['B', 'y', '-'],
    ['B', 'y', '뒤'],
]
df = pd.DataFrame(data, columns=col)
df

Unnamed: 0,col1,col2,col3
0,A,x,-
1,A,x,-
2,B,x,앞
3,B,y,-
4,B,y,뒤


In [None]:
df.drop_duplicates() # no arguments: rows are compared across all columns and repeated rows are dropped

Unnamed: 0,col1,col2,col3
0,A,x,-
2,B,x,앞
3,B,y,-
4,B,y,뒤


In [None]:
df.drop_duplicates(subset = 'col1') # duplicates are judged by col1 only

Unnamed: 0,col1,col2,col3
0,A,x,-
2,B,x,앞


In [None]:
df.drop_duplicates(subset = ['col1', 'col2']) # duplicates are judged by the (col1, col2) pair

Unnamed: 0,col1,col2,col3
0,A,x,-
2,B,x,앞
3,B,y,-


# 정렬

1. 데이터프레임 정렬  
    * df.sort_values()
2. 리스트 정렬
    * list.sort()
    * sorted(list)

## 데이터 프레임 정렬

|parameter|설명|값|
|:-:|-|-|
|by|정렬할 기준 변수(필수)|column_name 또는 column_name 리스트 (기본값 없음)|
|axis|index기준 : 0, column 기준 : 1|default = 0|
|ascending|오름차순, 내림차순 설정|default = True|
|inplace|내부 적용 여부|default = False|
|na_position|결측값 위치|default = 'last' / 'first'|

In [None]:
import numpy as np
import pandas as pd

# Sample frame for the sort_values examples; 'home' has missing values on purpose.
people = {
    'sequence': [1, 3, 2],
    'name': ['park', 'lee', 'choi'],
    'age': [30, 20, 40],
    'home': ['seoul', np.nan, np.nan],
}
df = pd.DataFrame(people)
df

Unnamed: 0,sequence,name,age,home
0,1,park,30,seoul
1,3,lee,20,
2,2,choi,40,


In [None]:
df.sort_values(by = 'sequence')

Unnamed: 0,sequence,name,age,home
0,1,park,30,seoul
2,2,choi,40,
1,3,lee,20,


In [None]:
df.sort_values(by = 'home', na_position = 'first')

Unnamed: 0,sequence,name,age,home
1,3,lee,20,
2,2,choi,40,
0,1,park,30,seoul


## 리스트 정렬

In [None]:
a1 = [1,2,3,4,5,1,3,3,2,2,4,4]

In [None]:
a1.sort()

In [None]:
a1

[1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5]

In [None]:
b1 = [1,2,2,2,5,9,1,2,3]

In [None]:
sorted(b1)

[1, 1, 2, 2, 2, 2, 3, 5, 9]

In [None]:
b1

[1, 2, 2, 2, 5, 9, 1, 2, 3]

# 함수 내부 구조 보기

* inspect모듈

In [None]:
import inspect

In [None]:
import numpy as np

In [None]:
code = inspect.getsource(np.mean)
print(code)

@array_function_dispatch(_mean_dispatcher)
def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, *,
         where=np._NoValue):
    """
    Compute the arithmetic mean along the specified axis.

    Returns the average of the array elements.  The average is taken over
    the flattened array by default, otherwise over the specified axis.
    `float64` intermediate and return values are used for integer inputs.

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an
        array, a conversion is attempted.
    axis : None or int or tuple of ints, optional
        Axis or axes along which the means are computed. The default is to
        compute the mean of the flattened array.

        .. versionadded:: 1.7.0

        If this is a tuple of ints, a mean is performed over multiple axes,
        instead of a single axis or all the axes as before.
    dtype : data-type, optional
        Type to use in computin

In [1]:
print('what is problem')
print('what is problem1')

what is problem
