# Pandas


Pandas는 python 머신러닝에서 가장 많이 쓰이는 데이터 처리 라이브러리입니다. 

앞으로 머신러닝에서 다루게 될 데이터는 대부분 행과 열로 이루어진 2차원 데이터입니다.

판다스는 이러한 2차원 데이터를 가공(processing)하는데 효율적인 기능들을 제공합니다.

이 수업노트에 나오는 모든 자료는 아래 공식 문서를 참고하여 만들었습니다.

https://pandas.pydata.org/

## Pandas Series

1. Python list, ndarray, dictionary로부터 Pandas Series를 만드는 방법에 대해 알아보자.

2. Pandas Series의 Indexing 방법에 대해 알아보자.

3. Series에서 사용가능한 operation에 대해 알아보자.

In [2]:
import numpy as np
import pandas as pd

# Sample values reused by the following cells when building Series objects.
year = [2014, 2018, 2020, 2021]

# No index is given, so pandas labels the entries with the default
# integer index 0..n-1.
list2_series = pd.Series(data=year)
print(list2_series)


0    2014
1    2018
2    2020
3    2021
dtype: int64


In [3]:
# Index 설정하기

name = ["철수","영희","재석","명수"]

list2_series = pd.Series(data=year,index=name)
print( list2_series)

철수    2014
영희    2018
재석    2020
명수    2021
dtype: int64


In [4]:
year = np.array([2014,2018,2020,2021])
numpy2_series = pd.Series(data=year,index=name)
print( numpy2_series)

철수    2014
영희    2018
재석    2020
명수    2021
dtype: int32


In [5]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
graduate = {
    '철수': 2014,
    '영희': 2018,
    '재석': 2020,
    '명수': 2021,
}

dict2_series = pd.Series(graduate)
print(dict2_series)

철수    2014
영희    2018
재석    2020
명수    2021
dtype: int64


In [6]:
# A Series can be accessed both by index label and by integer position.
print(dict2_series["철수"])   # label-based access
# Position-based access goes through .iloc: plain `series[0]` on a Series
# with non-integer labels relies on a positional fallback that is deprecated
# in pandas 2.x.
print(dict2_series.iloc[0])
# Inspecting the index
print(dict2_series.keys())  # keys() returns the index labels
print(dict2_series.keys()[1])
print(dict2_series.values)  # values holds the underlying data

2014
2014
Index(['철수', '영희', '재석', '명수'], dtype='object')
영희
[2014 2018 2020 2021]


In [7]:
# pandas series는 numpy와 같이 다양한 operation이 가능하다.

#각 data 값에 +,*,/ 적용됨
# 1. 덧셈/뺄셈
print(dict2_series+1000,"\n") 

# 2.곱셈
print(dict2_series*100,"\n")

# 3. 나눗셈
print(dict2_series/10,"\n")


철수    3014
영희    3018
재석    3020
명수    3021
dtype: int64 

철수    201400
영희    201800
재석    202000
명수    202100
dtype: int64 

철수    201.4
영희    201.8
재석    202.0
명수    202.1
dtype: float64 



In [8]:
# Series 끼리의 연산

graduate2 = {'철수':1,'영희':2,'미선':3,'명수':4}

dict2_series2 = pd.Series(data=graduate2)

In [9]:
print(dict2_series +dict2_series2)
# series끼리 연산할 때는 index(key)를 기준으로 값이 정렬(align)되어 같은 key끼리 계산된다.
# dict2_series 에는 미선이 없고, dict2_series2에는 재석이 없어서
# 한쪽에만 있는 미선, 재석은 결과값이 NaN이 된다.


명수    2025.0
미선       NaN
영희    2020.0
재석       NaN
철수    2015.0
dtype: float64


In [10]:
dict2_series.add(dict2_series2,fill_value=100)

# 위와 같이 key값이 일치하지 않는 경우, fill_value를 설정하면 한쪽 series에 없는 값을 fill_value로 대체한 뒤 더해준다. (예: 미선 = 100 + 3, 재석 = 2020 + 100)

명수    2025.0
미선     103.0
영희    2020.0
재석    2120.0
철수    2015.0
dtype: float64

## Pandas DataFrame

pandas DataFrame도 series와 같이 list, ndarray, dictionary로 부터 만들 수 있다. 여기서는 ndarray로만 실습해보도록 하자.

In [11]:
data = np.zeros([4,4])
df = pd.DataFrame(data)
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


In [16]:
name = ["철수","영희","재석","명수"]
score = ["math","physics","biology","chemistry"]

df = pd.DataFrame(data,index=name,columns=score)
df 

Unnamed: 0,math,physics,biology,chemistry
철수,0.0,0.0,0.0,0.0
영희,0.0,0.0,0.0,0.0
재석,0.0,0.0,0.0,0.0
명수,0.0,0.0,0.0,0.0


데이터 프레임 합치기 연습

In [20]:
data = np.zeros([4,4])
score = ["math","physics","biology","chemistry"]
columns = score

df = pd.DataFrame(data,columns=columns)
df 


Unnamed: 0,math,physics,biology,chemistry
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0


In [21]:
data = np.ones([4,4])
score = ["math","physics","literature","korean"]
columns = score

df2 = pd.DataFrame(data,columns=columns)
df2

Unnamed: 0,math,physics,literature,korean
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0


In [28]:
concat_axis0 = pd.concat([df,df2],axis=0)
concat_axis0
# axis 0는 행 방향으로 합치는 것을 알 수 있다.
# NaN 데이터가 나오는 이유는 열이 매치가 되지 않아서이다.
# df에는 literature,korean이 없고 df2에는 biology,chemistry가 없음.

Unnamed: 0,math,physics,biology,chemistry,literature,korean
0,0.0,0.0,0.0,0.0,,
1,0.0,0.0,0.0,0.0,,
2,0.0,0.0,0.0,0.0,,
3,0.0,0.0,0.0,0.0,,
0,1.0,1.0,,,1.0,1.0
1,1.0,1.0,,,1.0,1.0
2,1.0,1.0,,,1.0,1.0
3,1.0,1.0,,,1.0,1.0


In [23]:
concat_axis1 = pd.concat([df,df2],axis=1)
concat_axis1
# axis 1은 열방향으로 합치는 것을 알 수 있다.

Unnamed: 0,math,physics,biology,chemistry,math.1,physics.1,literature,korean
0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [93]:
data = np.ones([4,4])
score = ["math","physics","biology","chemistry"]
columns = score

df2 = pd.DataFrame(data,columns=columns)
df2

Unnamed: 0,math,physics,biology,chemistry
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0


In [94]:
concat_axis0 = pd.concat([df,df2],axis=0)
concat_axis0

# 열이 모두 일치하였을 경우 다음과 같이 NaN데이터가 없다.

Unnamed: 0,math,physics,biology,chemistry
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0


## DataFrame API

실제로는 판다스 데이터프레임이나 시리즈를 직접 만드는 경우는 많지 않습니다. 주어진 데이터를 가공하거나 형태를 변형하여 머신러닝을 진행하는 경우가 대부분입니다. 

판다스 데이터프레임을 활용하여 데이터를 읽고 가공하는법을 알아봅시다.

In [26]:
df = pd.read_csv('Iris.csv') # csv 파일 읽기
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [97]:
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [126]:
df.describe().transpose() #행과 열 바꾸기 둘 중 자신에게 편한 방법으로 시각화 하여 사용할 수 있다.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,150.0,75.5,43.445368,1.0,38.25,75.5,112.75,150.0
SepalLengthCm,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
SepalWidthCm,150.0,3.054,0.433594,2.0,2.8,3.0,3.3,4.4
PetalLengthCm,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9
PetalWidthCm,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5


In [27]:
df.head() #맨위 행부터 읽기 ()안에 숫자를 넣으면 그 행만큼 읽을 수 있다. default =5

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [102]:
df.tail() #맨 아래 행부터 읽기 ()안에 숫자를 넣으면 그 행만큼 읽을 수 있다. default =5

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


### 데이터 프레임 indexing 및 Filtering

In [103]:
df["SepalLengthCm"] #열 indexing

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: SepalLengthCm, Length: 150, dtype: float64

In [130]:
df[["SepalLengthCm","Species"]] #여러 열 indexing

Unnamed: 0,SepalLengthCm,Species
0,5.1,Iris-setosa
1,4.9,Iris-setosa
2,4.7,Iris-setosa
3,4.6,Iris-setosa
4,5.0,Iris-setosa
...,...,...
145,6.7,Iris-virginica
146,6.3,Iris-virginica
147,6.5,Iris-virginica
148,6.2,Iris-virginica


In [104]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [143]:
# .loc is label-based indexing; a single label selects the row carrying that
# index label.
# NOTE: per the pandas docs .loc is not limited to rows -- a column label may
# be given as a second key, e.g. df.loc[row_label, column_label].
df.loc[1] # row whose index label is 1, returned as a Series

Id                         2
SepalLengthCm            4.9
SepalWidthCm               3
PetalLengthCm            1.4
PetalWidthCm             0.2
Species          Iris-setosa
Name: 1, dtype: object

In [105]:
df.loc[0] # 1번째 행

Id                         1
SepalLengthCm            5.1
SepalWidthCm             3.5
PetalLengthCm            1.4
PetalWidthCm             0.2
Species          Iris-setosa
Name: 0, dtype: object

In [106]:
df.iloc[:,1] #2번째 열

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: SepalLengthCm, Length: 150, dtype: float64

In [107]:
df.iloc[0,2] #1번째 행 3번째 열 indexing

3.5

In [152]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [32]:
# 데이터프레임을 조건부로 indexing도 가능하다.
# ex) SepalWidthCm가 3보다 큰 경우만 indexing 해보도록 하자.
higher3 = df["SepalWidthCm"]>3
df[higher3]
# len

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...,...
140,141,6.7,3.1,5.6,2.4,Iris-virginica
141,142,6.9,3.1,5.1,2.3,Iris-virginica
143,144,6.8,3.2,5.9,2.3,Iris-virginica
144,145,6.7,3.3,5.7,2.5,Iris-virginica


In [158]:
# ex) Species가 Iris-versicolor인 경우만 indexing 해보도록 하자.
versicolor = df["Species"] == "Iris-versicolor"
df[versicolor].head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
53,54,5.5,2.3,4.0,1.3,Iris-versicolor
54,55,6.5,2.8,4.6,1.5,Iris-versicolor


In [168]:
# ex) Species가 Iris-versicolor이고 SepalWidthCm가 3보다 큰 경우만 indexing 해보도록 하자. 

versicolor3 = (df["Species"] == "Iris-versicolor") & (df["SepalWidthCm"]>3)
df[versicolor3].head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
56,57,6.3,3.3,4.7,1.6,Iris-versicolor
65,66,6.7,3.1,4.4,1.4,Iris-versicolor


In [39]:
# isin 함수를 이용해서도 필터링이 가능하다.
df[df['Species'].isin(['Iris-versicolor'])].head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
53,54,5.5,2.3,4.0,1.3,Iris-versicolor
54,55,6.5,2.8,4.6,1.5,Iris-versicolor


In [44]:
df['Species'].isin(['Iris-versicolor'])

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Name: Species, Length: 150, dtype: bool

In [113]:
df[df['Species'].isin(['Iris-versicolor','Iris-setosa'])].head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


### 데이터프레임 Operation

In [79]:
# 새로운 column을 만들 수 있다.
df['SepalArea'] = df['SepalLengthCm']* df['SepalWidthCm']
df



Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,SepalArea
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Iris-setosa,1,5.1,3.5,1.4,0.2,17.85
Iris-setosa,2,4.9,3.0,1.4,0.2,14.70
Iris-setosa,3,4.7,3.2,1.3,0.2,15.04
Iris-setosa,4,4.6,3.1,1.5,0.2,14.26
Iris-setosa,5,5.0,3.6,1.4,0.2,18.00
...,...,...,...,...,...,...
Iris-virginica,146,6.7,3.0,5.2,2.3,20.10
Iris-virginica,147,6.3,2.5,5.0,1.9,15.75
Iris-virginica,148,6.5,3.0,5.2,2.0,19.50
Iris-virginica,149,6.2,3.4,5.4,2.3,21.08


In [80]:
df = df.drop("SepalArea",axis=1) # remove column
df.head()

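# df1 = df.drop(1,axis=0) # row
# df1.head()
# 참고: 아래 출력의 KeyError는 index가 'Species'로 설정된 상태에서 위의 df.drop(1,axis=0)을 실행하여,
# 행 라벨 1이 존재하지 않아 발생한 것으로 보인다. 기본 정수 index 상태에서는 두 번째 행이 삭제된다.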

KeyError: '[1] not found in axis'

In [66]:
# 한 column을 index(행 라벨)로 설정하는 것이 set_index
df = df.set_index('Species')
df.head()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,1,5.1,3.5,1.4,0.2
Iris-setosa,2,4.9,3.0,1.4,0.2
Iris-setosa,3,4.7,3.2,1.3,0.2
Iris-setosa,4,4.6,3.1,1.5,0.2
Iris-setosa,5,5.0,3.6,1.4,0.2


In [67]:
df = df.reset_index() #행을 리셋

In [68]:
df.head()

Unnamed: 0,Species,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,Iris-setosa,1,5.1,3.5,1.4,0.2
1,Iris-setosa,2,4.9,3.0,1.4,0.2
2,Iris-setosa,3,4.7,3.2,1.3,0.2
3,Iris-setosa,4,4.6,3.1,1.5,0.2
4,Iris-setosa,5,5.0,3.6,1.4,0.2


In [70]:
df = df.set_index('Species')
df.head()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,1,5.1,3.5,1.4,0.2
Iris-setosa,2,4.9,3.0,1.4,0.2
Iris-setosa,3,4.7,3.2,1.3,0.2
Iris-setosa,4,4.6,3.1,1.5,0.2
Iris-setosa,5,5.0,3.6,1.4,0.2


In [71]:
df.loc['Iris-setosa'].head()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,1,5.1,3.5,1.4,0.2
Iris-setosa,2,4.9,3.0,1.4,0.2
Iris-setosa,3,4.7,3.2,1.3,0.2
Iris-setosa,4,4.6,3.1,1.5,0.2
Iris-setosa,5,5.0,3.6,1.4,0.2


In [72]:
df.drop('Iris-setosa',axis=0).head()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-versicolor,51,7.0,3.2,4.7,1.4
Iris-versicolor,52,6.4,3.2,4.5,1.5
Iris-versicolor,53,6.9,3.1,4.9,1.5
Iris-versicolor,54,5.5,2.3,4.0,1.3
Iris-versicolor,55,6.5,2.8,4.6,1.5


### apply, nunique, unique, value_counts 함수 사용

In [123]:
df = pd.read_csv('Iris.csv') # csv 파일 읽기
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [81]:
def float2int(num):
    """Return ``num`` converted with the built-in ``int`` constructor.

    For a float argument this drops the fractional part (truncation toward
    zero), e.g. ``3.9 -> 3`` and ``-3.9 -> -3``. Any exception raised by
    ``int`` (for example on NaN) propagates to the caller unchanged.
    """
    converted = int(num)
    return converted

In [83]:
# apply를 사용하면 직접 정의한 함수를 column의 각 값에 적용할 수 있다.
df['SepalWidthCm'] = df['SepalWidthCm'].apply(float2int)

In [85]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 150 entries, Iris-setosa to Iris-virginica
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    int64  
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
dtypes: float64(3), int64(2)
memory usage: 11.1+ KB


Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,1,5.1,3,1.4,0.2
Iris-setosa,2,4.9,3,1.4,0.2
Iris-setosa,3,4.7,3,1.3,0.2
Iris-setosa,4,4.6,3,1.5,0.2
Iris-setosa,5,5.0,3,1.4,0.2


In [94]:
# unique는 column의 값들 중 중복을 제거한 고유값들을 array로 반환
df['SepalLengthCm'].unique()

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5,
       4.5, 5.3, 7. , 6.4, 6.9, 6.5, 6.3, 6.6, 5.9, 6. , 6.1, 5.6, 6.7,
       6.2, 6.8, 7.1, 7.6, 7.3, 7.2, 7.7, 7.4, 7.9])

In [90]:
#nunique는 unique의 길이 리턴
df['SepalLengthCm'].nunique()


35

In [131]:
# value_counts는 column의 각 고유값과 그 값이 몇 개인지를 반환한다.
df['SepalWidthCm'].value_counts()

3    89
2    57
4     4
Name: SepalWidthCm, dtype: int64

### 날짜데이터 처리



In [99]:
date = pd.Series(['Jul 3, 2021', '2021-07-03','Jul 4, 2021', '2021-07-04'])


0    Jul 3, 2021
1     2021-07-03
2    Jul 4, 2021
3     2021-07-04
dtype: object


Unnamed: 0,0
0,"Jul 3, 2021"
1,2021-07-03
2,"Jul 4, 2021"
3,2021-07-04


In [102]:
# Each raw entry is a plain Python string.
print(type(date[0]))
# `date` mixes two formats ('Jul 3, 2021' and '2021-07-03'). On pandas >= 2.0
# pd.to_datetime(date) infers one format from the first element and raises
# ValueError for entries that do not match it, so parse element by element
# (the same approach used later when converting the DataFrame column).
type(date.apply(pd.to_datetime)[0])

<class 'str'>


pandas._libs.tslibs.timestamps.Timestamp

In [103]:
# Start from a 4x4 frame of zeros (float64 columns labelled 0..3).
data = np.zeros([4, 4])
df = pd.DataFrame(data)
# Replace column 0 with the date strings. Assigning the column (instead of
# writing through df.iloc[:, 0]) avoids setting strings into a float64 column
# in place, which pandas 2.1+ flags as deprecated upcasting.
df[0] = date
df

Unnamed: 0,0,1,2,3
0,"Jul 3, 2021",0.0,0.0,0.0
1,2021-07-03,0.0,0.0,0.0
2,"Jul 4, 2021",0.0,0.0,0.0
3,2021-07-04,0.0,0.0,0.0


In [104]:
# Parse every entry of the first column into a Timestamp. The column is
# re-assigned by label rather than written through .iloc: on pandas 2.x an
# .iloc write happens in place and keeps the old object dtype, whereas a
# column assignment lets the column take the datetime64 dtype.
first_col = df.columns[0]
df[first_col] = df.iloc[:, 0].apply(pd.to_datetime)
df.iloc[:]

Unnamed: 0,0,1,2,3
0,2021-07-03,0.0,0.0,0.0
1,2021-07-03,0.0,0.0,0.0
2,2021-07-04,0.0,0.0,0.0
3,2021-07-04,0.0,0.0,0.0


In [108]:
# Derive year / month / day columns from the first (date) column.
# pd.to_datetime guarantees a datetime64 Series even if the column is stored
# as object-dtype Timestamps, so the vectorized .dt accessor can replace
# three per-row apply(lambda ...) calls.
dates = pd.to_datetime(df.iloc[:, 0])
df['year'] = dates.dt.year
df['month'] = dates.dt.month
df['day'] = dates.dt.day
df.head()

Unnamed: 0,0,1,2,3,year,month,day
0,2021-07-03,0.0,0.0,0.0,2021,7,3
1,2021-07-03,0.0,0.0,0.0,2021,7,3
2,2021-07-04,0.0,0.0,0.0,2021,7,4
3,2021-07-04,0.0,0.0,0.0,2021,7,4


### Group by


In [142]:
df = pd.read_csv('Iris.csv') # csv 파일 읽기
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [143]:
#groupby column의 같은 값들끼리 group화 시켜줌
df_group=df.groupby('Species')
df_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff09010caf0>

In [144]:
# 그룹화 기준 column의 고유값들이 index(행)가 되고, 그룹별로 각 column의 값을 합산(sum)해서 반환한다.
df_group.sum()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,1275,250.3,170.9,73.2,12.2
Iris-versicolor,3775,296.8,138.5,213.0,66.3
Iris-virginica,6275,329.4,148.7,277.6,101.3


In [145]:
df_group.mean()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,25.5,5.006,3.418,1.464,0.244
Iris-versicolor,75.5,5.936,2.77,4.26,1.326
Iris-virginica,125.5,6.588,2.974,5.552,2.026


In [53]:
df_group.mean().index

Index(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='object', name='Species')

In [54]:
df_group.mean().columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [146]:
df_group.mean().iloc[:,0]

Species
Iris-setosa         25.5
Iris-versicolor     75.5
Iris-virginica     125.5
Name: Id, dtype: float64