### 판다스, 넘파이 응용

판다스의 기본 자료구조인 DataFrame과 Series는 각 데이터의 타입으로 NumPy 타입을 사용한다.

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Create a Series holding 100 samples from the standard normal distribution
s1 = pd.Series(np.random.standard_normal(100))
s1

0    -1.216798
1     0.911087
2     0.433894
3     1.035603
4     0.141235
        ...   
95   -1.675667
96    0.945987
97   -0.017682
98   -2.930232
99    0.354854
Length: 100, dtype: float64

In [6]:
# Drop the sign, multiply by 10 to get values on a tens scale, then round
# away the decimals (the result is still float64).
# Series arithmetic is vectorized, so no per-element lambda is needed.
s2 = (s1.abs() * 10).round()
s2

0     12.0
1      9.0
2      4.0
3     10.0
4      1.0
      ... 
95    17.0
96     9.0
97     0.0
98    29.0
99     4.0
Length: 100, dtype: float64

In [7]:
# Mean of s2, computed by passing the Series straight to NumPy's mean
np.mean(s2)

9.06

In [8]:
# Basic information about a Series/DataFrame: index range, non-null count,
# dtype and memory usage
s2.info()

<class 'pandas.core.series.Series'>
RangeIndex: 100 entries, 0 to 99
Series name: None
Non-Null Count  Dtype  
--------------  -----  
100 non-null    float64
dtypes: float64(1)
memory usage: 932.0 bytes


In [9]:
# Basic descriptive statistics for pandas data: count, mean, std, min,
# quartiles and max
s2.describe()

count    100.000000
mean       9.060000
std        6.661847
min        0.000000
25%        4.000000
50%        8.000000
75%       12.500000
max       29.000000
dtype: float64

In [10]:
# Repeat the pattern 1, 3, 5, NaN five times: a 20-element Series with missing values
s3 = pd.Series(np.tile([1, 3, 5, np.nan], 5))
s3

0     1.0
1     3.0
2     5.0
3     NaN
4     1.0
5     3.0
6     5.0
7     NaN
8     1.0
9     3.0
10    5.0
11    NaN
12    1.0
13    3.0
14    5.0
15    NaN
16    1.0
17    3.0
18    5.0
19    NaN
dtype: float64

In [11]:
# Summary of s3: the non-null count shows how many entries are not missing
s3.info()

<class 'pandas.core.series.Series'>
RangeIndex: 20 entries, 0 to 19
Series name: None
Non-Null Count  Dtype  
--------------  -----  
15 non-null     float64
dtypes: float64(1)
memory usage: 292.0 bytes


In [13]:
# Because of the missing values, the statistics cover only 15 of the 20 entries
s3.describe()

count    15.000000
mean      3.000000
std       1.690309
min       1.000000
25%       1.000000
50%       3.000000
75%       5.000000
max       5.000000
dtype: float64

In [16]:
# Frequency of each distinct value; options include dropna=False and normalize=True/False.
# Here dropna=True leaves NaN out of the result and normalize=True reports
# proportions instead of raw counts.
s3.value_counts(dropna=True, normalize=True)

1.0    0.333333
3.0    0.333333
5.0    0.333333
Name: proportion, dtype: float64

In [19]:
# Check for missing values: isnull() flags each missing entry and sum() counts the flags
s3.isnull().sum()

5

### 데이터프레임과 넘파이

In [27]:
# Build a DataFrame of random sample data -> you will rarely construct one by hand like this
# Same kind of task as generating sample data with C# Bogus
size = 10
df1 = pd.DataFrame(data={
    # Passing size= draws a whole column in one vectorized call instead of
    # one Python-level call per row. randint's upper bound is exclusive.
    'class': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=size),
    'year': np.random.randint(2010, 2024, size=size),
    'month': np.random.randint(1, 13, size=size),
    'val1': np.random.randint(1, 11, size=size),
    'val2': np.random.randint(100, 1000, size=size),
    'val3': np.random.randint(10000, 20000, size=size),
})
df1

Unnamed: 0,class,year,month,val1,val2,val3
0,F,2015,10,9,388,10234
1,B,2023,5,2,684,15913
2,F,2023,12,4,772,11589
3,D,2019,5,9,581,18254
4,D,2017,3,3,770,19073
5,C,2023,1,9,666,14480
6,C,2010,8,4,950,18608
7,D,2010,8,7,411,16192
8,D,2018,11,1,148,16067
9,D,2012,8,5,123,12222


In [28]:
# The shape tuple means different things for 2-D and 1-D data;
# for this DataFrame it is (rows, columns)
df1.shape

(10, 6)

In [29]:
# Per-column summary of df1: non-null counts, dtypes and memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   10 non-null     object
 1   year    10 non-null     int64 
 2   month   10 non-null     int64 
 3   val1    10 non-null     int64 
 4   val2    10 non-null     int64 
 5   val3    10 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 612.0+ bytes


In [30]:
# Descriptive statistics for df1; the result lists only the numeric columns
# (the 'class' column does not appear)
df1.describe()

Unnamed: 0,year,month,val1,val2,val3
count,10.0,10.0,10.0,10.0,10.0
mean,2017.0,7.1,5.3,549.3,15263.2
std,5.163978,3.541814,3.020302,274.782763,3077.391565
min,2010.0,1.0,1.0,123.0,10234.0
25%,2012.75,5.0,3.25,393.75,12786.5
50%,2017.5,8.0,4.5,623.5,15990.0
75%,2022.0,9.5,8.5,748.5,17738.5
max,2023.0,12.0,9.0,950.0,19073.0
