# 판다스
---

In [40]:
# !pip install pandas --upgrade

In [2]:
import pandas as pd
import numpy as np
pd.__version__

'1.4.4'

![](https://www.runoob.com/wp-content/uploads/2021/04/df-dp.png)

## Series

In [42]:
sr = pd.Series([1,2,3,4,5],name='Apple')
print(sr)

0    1
1    2
2    3
3    4
4    5
Name: Apple, dtype: int64


In [43]:
print(sr.index)
print(sr.shape)

RangeIndex(start=0, stop=5, step=1)
(5,)


In [44]:
sr[1:3]

1    2
2    3
Name: Apple, dtype: int64

In [45]:
sr = pd.Series([1,2,3,4,5],name='Apple',index=['a','b','c','d','e'])
sr

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64

In [46]:
sr.index,sr.values,sr.to_numpy()

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 array([1, 2, 3, 4, 5], dtype=int64),
 array([1, 2, 3, 4, 5], dtype=int64))

In [47]:
sr.reset_index()

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [48]:
sr = pd.Series([1,np.nan,2,3,np.nan,4,5]) #np.nan 값이 없음을 뜻함
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [49]:
#fancy indexing
sr[[1,2,4]]

1    NaN
2    2.0
4    NaN
dtype: float64

In [50]:
# boolean indexing
idx = [False,True,True,False,True,False,False]
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [51]:
# 조건
sr[sr>3]

5    4.0
6    5.0
dtype: float64

In [55]:
# 결측치 (Missing Value)

print(sr.isna()) #결측치파트만 True로 반환
sr[sr.isna()] #isna() == isnull()
sr[sr.isnull()]

0    False
1     True
2    False
3    False
4     True
5    False
6    False
dtype: bool


1   NaN
4   NaN
dtype: float64

In [58]:
sr.isna().sum() #내가 가진 데이터에서 결측치가 몇개인지 알 수 있다

2

In [63]:
x = sr.copy()
x[x.isna()] = x.mean() #결측치에 x의 평균값을 넣는 문장
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [69]:
y = sr.copy()
y = y.dropna()  #결측치를 날린 상태로 y에 복사하는 방법
y

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [68]:
z = sr.copy()
z.dropna(inplace=True) #inplace 는 기본적으로 False로 돼 있다. True로 바꾸면 바로 바뀐다.
z

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [73]:
# x = sr.copy()
# x[x.isna()] = x.mean()

w = sr.copy()
w.fillna(w.mean()) #fillna 역시 위 식과 같은 결과가 나온다. 내부를 뭘로 채울지 정할 수 있음

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

### 슬라이싱

In [92]:
sr = pd.Series([10, 15, 12, 17, 13], index = ['john', 'eva','james','lian','zoe'])
sr

john     10
eva      15
james    12
lian     17
zoe      13
dtype: int64

In [93]:
sr['eva':'lian'] #index로 slicing할 경우 마지막 범위까지 포함된다

eva      15
james    12
lian     17
dtype: int64

In [94]:
sr[:-1]

john     10
eva      15
james    12
lian     17
dtype: int64

In [95]:
sr[:]

john     10
eva      15
james    12
lian     17
zoe      13
dtype: int64

### 정렬

In [96]:
sr.sort_values() #오름차순
sr.sort_values(ascending = False) #내림차순 (ascending은 기본적으로 True로 돼있다)

lian     17
eva      15
zoe      13
james    12
john     10
dtype: int64

In [97]:
## op3 멤버를 뽑는다면
sr.sort_values(ascending = False)[:3]

lian    17
eva     15
zoe     13
dtype: int64

In [98]:
# index 기준 정렬
sr.sort_index()

eva      15
james    12
john     10
lian     17
zoe      13
dtype: int64

## DataFrame

In [99]:
doc = [['Joe', 20, 85.10, 'A', 'Swimming'],
        ['Nat', 21, 77.80, 'B', 'Reading'],
        ['Harry', 19, 91.54, 'A', 'Music'],
        ['Sam', 20, 88.78, 'A', 'Painting'],
        ['Monica', 22, 60.55, 'B', 'Dancing']]

c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
idx = ['s1', 's2', 's3', 's4', 's5']

In [109]:
df = pd.DataFrame(doc, columns = c_name, index = idx)
print(df)
df.shape

      Name  Age  Marks Grade     Hobby
s1     Joe   20  85.10     A  Swimming
s2     Nat   21  77.80     B   Reading
s3   Harry   19  91.54     A     Music
s4     Sam   20  88.78     A  Painting
s5  Monica   22  60.55     B   Dancing


(5, 5)

In [None]:
df.head() #데이터중에서 제일 위에 있는 5개의 데이터만 표시하는 함수

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swimming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [127]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [155]:
x = df.dtypes
type(x)

pandas.core.series.Series

In [129]:
print(df.columns)
df.columns[[0,2,3]]

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')


Index(['Name', 'Marks', 'Grade'], dtype='object')

In [130]:
#원하는 컬럼만 가져오기
df[df.columns[[0,2,3]]]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,,77.8,B
2,Harry,,A
3,Sam,,A
4,Monica,60.55,B


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [157]:
doc = {'Name' :['Joe',np.nan,'Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan,np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


### 컬럼 다루기

In [159]:
doc = {'Name' :['Joe',np.nan,'Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan,np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [160]:
df[['Name','Age']] #df['Name','Age'] 이렇게 하면 에러가 발생한다. 꼭 범위를 리스트로 만들도록 하자.

Unnamed: 0,Name,Age
0,Joe,20
1,,21
2,Harry,19
3,Sam,20
4,Monica,22


In [164]:
df[['Age']] #dataframe 형태로 출력이 된다

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [165]:
df['Age'] #Series 형태로 출력이 된다

0    20
1    21
2    19
3    20
4    22
Name: Age, dtype: int64

In [152]:
df.columns = ['Name','Age','Score','Grade','Hobby'] #Marks를 Score를 바꾸기중 한가지 방법

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


#### Rename

In [153]:
df.rename(columns={'Score':'Marks','Hobby':'etc'}) #하지만 이 함수가 훨씬 효율적

Unnamed: 0,Name,Age,Marks,Grade,etc
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


### 파일 입출력

In [170]:
# !pip install gdown

Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.5/78.5 kB ? eta 0:00:00
Collecting filelock
  Downloading filelock-3.8.2-py3-none-any.whl (10 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
     -------------------------------------- 128.2/128.2 kB 7.4 MB/s eta 0:00:00
Collecting requests[socks]
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
     ---------------------------------------- 62.8/62.8 kB 3.3 MB/s eta 0:00:00
Collecting soupsieve>1.2
  Downloading soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
     ---------------------------------------- 140.6/140.6 kB ? eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.4-py3-none-any.whl (61 kB)
     ---------------------------------------- 61.5/61.5 kB

#### 평범한 csv파일

In [5]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

In [6]:
df = pd.read_csv('doc.csv')
df.shape

(5, 5)

In [7]:
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


#### index가 포함돼있는 csv파일

In [None]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [19]:
df = pd.read_csv('doc_idx.csv', index_col = 0) #첫번째를 index로 만드려면

In [18]:
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swmming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


#### index와 결측치가 포함돼있는 파일

In [21]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [29]:
df = pd.read_csv('doc_na.csv', index_col = 0, na_values = ['?'])
#df = pd.read_csv('doc_na.csv', index_col = 0)
#df = df.replace('?',np.NaN)
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music
s4,Sam,20.0,88.78,A,
s5,Monica,22.0,60.55,B,Dancing


### Titanic.csv를 활용한 예제

In [32]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [37]:
titanic = pd.read_csv('titanic.csv')
titanic.shape

(891, 12)

In [38]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### columns의 내용을 모두 소문자로 바꾸는 방법

In [55]:
df = titanic.copy() # 작업을 하기위해 df에 원본을 복사
print(df.columns)
#df.columns = [c.lower() for c in df.columns]
df.columns = df.columns.str.lower()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [None]:
#숫자로만 돼있는 column에 대한 통계값 출력

In [56]:
df.describe()

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292
