* https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Load seaborn's 'iris' example dataset into a DataFrame.
df = sns.load_dataset('iris')
# (number of rows, number of columns)
df.shape

(150, 5)

In [3]:
# Show the first two rows.
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [4]:
# Number of rows for each distinct value in a column
df['species'].value_counts()

virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64

In [9]:
# Wrap the value counts in a DataFrame so they render as a table.
pd.DataFrame(df['species'].value_counts())

Unnamed: 0,species
virginica,50
setosa,50
versicolor,50


In [5]:
len(df) # number of rows in the frame

150

In [10]:
# Row count via the first element of the shape tuple.
df.shape[0]

150

In [6]:
len(df) == df.shape[0] # two equivalent ways to get the row count

True

In [7]:
df['species'].nunique() # number of distinct values

3

In [11]:
df.describe() # summary statistics of the DataFrame (numeric columns only by default)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
df.describe(include='all') # summary for every column, numeric or not

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,virginica
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,
max,7.9,4.4,6.9,2.5,


In [12]:
# Summarize only the object-dtype (string) columns.
# Use the builtin `object`: the `np.object` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
df.describe(include=[object])

Unnamed: 0,species
count,150
unique,3
top,virginica
freq,50


In [13]:
# Summarize every column except those of a given dtype (here: object/string).
# Use the builtin `object`: the `np.object` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
df.describe(exclude=[object])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [14]:
# Summarize only the numeric columns.
df.describe(include=[np.number])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [15]:
# Sum of all values in a column
df['petal_width'].sum()

179.90000000000003

In [16]:
# Number of non-null values in a column
df['petal_width'].count()

150

In [17]:
# Median of each numeric column.
# numeric_only=True skips the string 'species' column explicitly: since
# pandas 2.0 non-numeric columns are no longer dropped silently and the call
# raises TypeError without it.
df.median(numeric_only=True)

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

In [18]:
# Mean of each numeric column.
# numeric_only=True skips the string 'species' column explicitly: since
# pandas 2.0 non-numeric columns are no longer dropped silently and the call
# raises TypeError without it.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [19]:
# Values at the 25th and 75th percentiles
df['petal_width'].quantile([0.25,0.75])

0.25    0.3
0.75    1.8
Name: petal_width, dtype: float64

In [20]:
# Minimum of each column (the string column compares lexicographically).
df.min()

sepal_length       4.3
sepal_width          2
petal_length         1
petal_width        0.1
species         setosa
dtype: object

In [21]:
# Maximum of each column (the string column compares lexicographically).
df.max()

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

In [22]:
# Variance of each numeric column.
# numeric_only=True skips the string 'species' column explicitly: since
# pandas 2.0 non-numeric columns are no longer dropped silently and the call
# raises TypeError without it.
df.var(numeric_only=True)

sepal_length    0.685694
sepal_width     0.189979
petal_length    3.116278
petal_width     0.581006
dtype: float64

In [23]:
# Standard deviation of each numeric column.
# numeric_only=True skips the string 'species' column explicitly: since
# pandas 2.0 non-numeric columns are no longer dropped silently and the call
# raises TypeError without it.
df.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

### apply(function)
---
* 데이터 전처리에서 자주 사용되는 함수로, 컬럼의 각 값에 함수를 적용함
* 문자열의 일부만 잘라오는 형태로 많이 사용 (예: `x[:3]`, `x[-3:]`)
* 파라미터로 lambda 함수나 미리 정의한 함수를 전달

In [24]:
def smp(x):
    """Return the last three characters of ``x`` (all of it if shorter)."""
    return x[-3:]

In [25]:
# Derive a column holding the first three characters of each species name,
# using an inline lambda.
df['species_3'] = df['species'].apply(lambda x : x[:3])

In [26]:
# Same idea with a named function: smp keeps the last three characters.
# NOTE(review): the hyphen in 'species-3' rules out attribute access
# (df.species-3 parses as a subtraction); bracket access is required.
df['species-3'] = df['species'].apply(smp)

In [27]:
# Display the frame, now including both derived columns.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_3,species-3
0,5.1,3.5,1.4,0.2,setosa,set,osa
1,4.9,3.0,1.4,0.2,setosa,set,osa
2,4.7,3.2,1.3,0.2,setosa,set,osa
3,4.6,3.1,1.5,0.2,setosa,set,osa
4,5.0,3.6,1.4,0.2,setosa,set,osa
5,5.4,3.9,1.7,0.4,setosa,set,osa
6,4.6,3.4,1.4,0.3,setosa,set,osa
7,5.0,3.4,1.5,0.2,setosa,set,osa
8,4.4,2.9,1.4,0.2,setosa,set,osa
9,4.9,3.1,1.5,0.1,setosa,set,osa
