# 시리즈와 데이터 프레임 직접 만들기 

## 데이터 프레임 시리즈는 리스트나 딕셔너리와 비슷하지만 데이터를 다루는데 특화되어있다.

### 1. 시리즈 만들기 

In [1]:
import pandas as pd
s = pd.Series(['apple',33])
print(s)
print(type(s))


0    apple
1       33
dtype: object
<class 'pandas.core.series.Series'>


In [2]:
list_data = ['2021-04-29', 3.14, 'orajava', 100, True]
s = pd.Series(list_data)
print(s)
print(type(s))

0    2021-04-29
1          3.14
2       orajava
3           100
4          True
dtype: object
<class 'pandas.core.series.Series'>


In [3]:
dict_data = {'a':1, 'b':2, 'c':3}
s = pd.Series(dict_data)
print(s)
print(type(s))

a    1
b    2
c    3
dtype: int64
<class 'pandas.core.series.Series'>


### 2. 시리즈 생성- 문자열을 인덱스로 지정할 수 있다

* 문자열을 인덱스로 지정하려면 Series 메서드의 index 인자를 통해 사용하고자하는 문자열을 리스트에 담아 전달 

In [5]:
s =  pd.Series(['Jane', 'student'], index=['Person','Job'])
print(s)

Person       Jane
Job       student
dtype: object


### 3. 딕셔너리로 데이터프레임 생성하기
* 파이썬의 기본 자료구조로 데이터 프레임 생성 가능 
* 아래 예제는 **딕셔너리로 데이터 프레임을 생성하는 예제
* 딕셔너리를 DataFrame클래스에 전달
* 데이터프레임의 컬럼은 모두 시리즈
* 아래 예제는 5개의 시리즈로 수성된 데이터 프레임 

* 딕셔너리 = 키와 벨류값
* 리스트 = 값

In [6]:
scientists = pd.DataFrame({
    'Name' : ['Rosaline Franklin', 'William Gosset'], #나의 칼럼
    'Occupation' : ['Chemist', 'Statistician'],
    'Born' : ['1920-07-25', '1876-06-13'],
    'Died' : ['1958-04-16', '1937-10-16'],
    'Age' : [37,61]})

scientists

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


* columns 인자를 사용하면 데이터 프레임의 열 순서를 지정항 수 있다.
* index, columns 지정 

In [11]:
scientists = pd.DataFrame(
     data = {'Occupation' : ['Chemist', 'Statistician'],
            'Born' : ['1920-07-25', '1876-06-13'],
            'Died' : ['1958-04-16', '1937-10-16'],
            'Age' : [37,61]},
    index = ['Rosaline Franklin', 'William Gosset'], #앞머리(인덱스) 지정 가능 
    columns = ['Occupation','Born','Age','Died']) #칼럼의 순서 변경 가능 

scientists

Unnamed: 0,Occupation,Born,Age,Died
Rosaline Franklin,Chemist,1920-07-25,37,1958-04-16
William Gosset,Statistician,1876-06-13,61,1937-10-16


### 4. 딕셔너리는 데이터의 순서를 보장하지 않음
* 순서가 보장된 딕셔너리를 전달하려면 'OrderedDict' 클래스 사용

In [14]:
from collections import OrderedDict
scientists = pd.DataFrame(OrderedDict([
    ('Name', ['Rosaline Franklin', 'William Gosset']),
    ('Occupation', ['Chemist', 'Statistician']),
    ('Born', ['1920-07-25', '1876-06-13']),
    ('Died', ['1958-04-16', '1937-10-16']),
    ('Age', [37,61])
    
])
)
scientists

# 여기에 인덱스 = 0,1 로 자동 갈김 당함

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


### 5. 데이터 프레임에서 시리즈 선택하기

In [17]:
scientists = pd.DataFrame(
     data = {'Occupation' : ['Chemist', 'Statistician'],
            'Born' : ['1920-07-25', '1876-06-13'],
            'Died' : ['1958-04-16', '1937-10-16'],
            'Age' : [37,61]},
    index = ['Rosaline Franklin', 'William Gosset'], #앞머리(인덱스) 지정 가능 
    columns = ['Occupation','Born','Age','Died'])    #칼럼의 순서 변경 가능 

scientists

Unnamed: 0,Occupation,Born,Age,Died
Rosaline Franklin,Chemist,1920-07-25,37,1958-04-16
William Gosset,Statistician,1876-06-13,61,1937-10-16


In [18]:
first_row = scientists.loc['William Gosset']
print(type(first_row))

<class 'pandas.core.series.Series'>


In [19]:
print(first_row)

Occupation    Statistician
Born            1876-06-13
Age                     61
Died            1937-10-16
Name: William Gosset, dtype: object


### 6. 시리즈 속성과 메서드
* index, values 속성과 keys 메서드 사용하기

In [20]:
#1) index 속성
print(first_row.index)

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')


In [22]:
#2) values 속성
print(first_row.values)

['Statistician' '1876-06-13' 61 '1937-10-16']


In [24]:
#3) keys 속성
print(first_row.keys()) #메서드= () <-- 얘 들어감

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')


In [25]:
#4) index 속성 응용
print(first_row.index[3]) # 3번째 인덱스를 가져와라

Died


In [26]:
#5) keys 속성
print(first_row.keys()[0]) # 1번째 키를 가져와라 (인덱스와 같은 역할 하고 있음)

Occupation


### 7. 시리즈의 mean, min, max, std 메서드 사용하기 

In [27]:
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [28]:
print(ages.mean())

49.0


In [29]:
print(ages.max())

61


In [30]:
print(ages.std()) #표준 편차

16.97056274847714


In [31]:
print(ages.describe())

count     2.000000
mean     49.000000
std      16.970563
min      37.000000
25%      43.000000
50%      49.000000
75%      55.000000
max      61.000000
Name: Age, dtype: float64


### 8.시리즈 다루기 응용
* 시리즈와 불린 추출

In [40]:
scientists = pd.read_csv('../data/scientists.csv')
print(type(scientists))
scientists 

# 문제_아래에서 name, born, age 칼럼만 추출해서 df에 할당
#df = scientists[['Name', 'Born', 'Age']] #데이터 프레임이라 프레임 작업 없이 바로 가져옴
#print(df)




<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [41]:
ages = scientists['Age']
print(ages.max())

90


In [42]:
print(ages.mean())

59.125


In [45]:
print(ages[ages > ages.mean()])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [47]:
print(ages>ages.mean()) #불린 추출

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


In [50]:
bool_values = [True,True, False,False, True, True, False,True]
print(ages[bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


### 9. 벡터와 스칼라로 브로드캐스팅 수행하기
* 시리즈 처럼 여러 개의 값을 가진 데이터 : 벡터
* 단순 크기를 나타내는 데이터 : 스칼라

In [51]:
print(ages + ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [52]:
print(ages * ages) # 벡터곱하기

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


In [53]:
print(ages + 100) # 벡터에 스칼라 연산 : 브로드캐스팅 한 결과 

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


In [54]:
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [56]:
print(pd.Series([1,100])) #시리즈 생성

0      1
1    100
dtype: int64


In [57]:
print(ages + pd.Series([1,100])) 
# 길이가 다른 벡터를 연산, 일치한 인덱스만 계산(0,1)
# 시리즈에 두개만 넣었으니께.

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [61]:
# 인덱스 정렬 - 내림차순 
rev_ages = ages.sort_index(ascending = False)
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [64]:
# 벡터와 벡터의 연산은 일치하는 인덱스 값끼리 수행함 ***
print(ages + rev_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [65]:
rev_values = ages.sort_values(ascending = False)
print(rev_values) # 값에 대한 정렬

2    90
7    77
3    66
1    61
4    56
5    45
6    41
0    37
Name: Age, dtype: int64


### 10. 데이터 프레임 다루기 
* 불린 추출과 브로드캐스팅

In [66]:
print(scientists[scientists['Age'] > scientists['Age'].mean()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [67]:
print(scientists * 2) # 데이터 프레임 * 2

                                       Name                  Born  \
0        Rosaline FranklinRosaline Franklin  1920-07-251920-07-25   
1              William GossetWilliam Gosset  1876-06-131876-06-13   
2  Florence NightingaleFlorence Nightingale  1820-05-121820-05-12   
3                    Marie CurieMarie Curie  1867-11-071867-11-07   
4                Rachel CarsonRachel Carson  1907-05-271907-05-27   
5                        John SnowJohn Snow  1813-03-151813-03-15   
6                    Alan TuringAlan Turing  1912-06-231912-06-23   
7                  Johann GaussJohann Gauss  1777-04-301777-04-30   

                   Died  Age                            Occupation  
0  1958-04-161958-04-16   74                        ChemistChemist  
1  1937-10-161937-10-16  122              StatisticianStatistician  
2  1910-08-131910-08-13  180                            NurseNurse  
3  1934-07-041934-07-04  132                        ChemistChemist  
4  1964-04-141964-04-14  112     

### 10. 시리즈와 데이터 프레임의 데이터 처리하기
* 1. 열의 자료형 바꾸기와 새로운 열 추가하기

In [70]:
# pd의 object는 python의 string과 같다
print(scientists['Born'].dtype)
print(scientists['Died'].dtype)

object
object


In [74]:
born_datetime = pd.to_datetime(scientists['Born'], format = '%Y-%m-%d')
print(born_datetime) #object(string) = cientists['Born'] 타입을 날짜 타입으로 바꾸는 함수 = pd.to_datetime

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [75]:
died_datetime = pd.to_datetime(scientists['Died'], format = '%Y-%m-%d')
print(died_datetime) #object(string) = cientists['Died] 타입을 날짜 타입으로 바꾸는 함수 = pd.to_datetime

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


In [77]:
#새로운 칼럼 추가
#'born_dt', died_dt' 라는 칼럼을 추가하고 born_datetime, died_datetime 를 널어라

scientists['born_dt'], scientists['died_dt'] = (born_datetime, died_datetime)
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [78]:
scientists.shape

(8, 7)

In [79]:
# 연산하여 칼럼 추가
scientists['age_days_dt'] = (scientists['died_dt'] - scientists['born_dt'])
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,28422 days


### 11. 시리즈, 데이터프레임의 데이터 섞기

In [80]:
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [85]:
import random
random.seed(42) #컴퓨터가 생성하는 난수의 기준값을 설정 
random.shuffle(scientists['Age'])
print(scientists['Age'])

0    90
1    66
2    61
3    41
4    77
5    45
6    56
7    37
Name: Age, dtype: int64


### 데이터 프레임의 열 삭제하기

In [86]:
print(scientists.columns) #칼럼 조회

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days_dt'],
      dtype='object')


In [87]:
# 삭제
scientists_drop = scientists.drop(['Age'], axis=1) # axis = 열 
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,90,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,66,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,61,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,41,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,77,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,56,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,37,Mathematician,1777-04-30,1855-02-23,28422 days


In [89]:
scientists_drop

Unnamed: 0,Name,Born,Died,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23,28422 days


### 데이터를 피클, CSV, TSV 파일로 저장하고 불러오기

#### 1. 피클로 저장하기

In [91]:
names = scientists['Name']
print(names)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


In [93]:
# 새폴더 만들기로 output 폴더 만들고 진행 피클로 저장
names.to_pickle('../output/sci_name_serise.pickle')

In [94]:
scientists.to_pickle('../output/scientists_df.pickle')

In [96]:
scientists_name_f_pickle = pd.read_pickle('../output/sci_name_serise.pickle')
scientists_name_f_pickle

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [97]:
scientists_f_pickle = pd.read_pickle('../output/scientists_df.pickle')
scientists_f_pickle

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,90,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,66,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,61,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,41,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,77,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,56,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,37,Mathematician,1777-04-30,1855-02-23,28422 days


### CSV 파일과 STV파일로 저장하기

In [99]:
names.to_csv('../output/sci_name_serise.csv')

  """Entry point for launching an IPython kernel.


In [101]:
scientists.to_csv('../output/sci_df.tsv', sep = '\t')

In [102]:
scientists.to_csv('../output/sci_df_no_index.csv', index = 'False')

In [103]:
#엑셀로 저장, to_frame() 메서드를 사용 df로 변환 후 엑셀로 저장 

names_df = names.to_frame()

import xlwt
names_df.to_excel('../output/sci_name_serise.xls')

# 4월 29일 문제 풀기

1. 아래와 같이 데이터 프레임을 만들어 보세요.
Names Births
---------------------------------
0 Bob 968
1 Jessica 155
2 Mary 77
3 John 578
4 Mel 973

In [149]:
data1 = pd.DataFrame(
    data = {
        'Names' : ['Bob','Jessica','Mary','John','Mel'],
        'Births' : [968,155,77,578,973]
            },
    columns = ['Names','Births'])

data1

Unnamed: 0,Names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


2. 하나의 열을 선택하세요.
0 Bob
1 Jessica
2 Mary
3 John
4 Mel

In [150]:
data1[['Names']] #대소문자 구분 잘하자!!

Unnamed: 0,Names
0,Bob
1,Jessica
2,Mary
3,John
4,Mel


3. 0~3 번째 인덱스를 선택하세요.
Names Births
--------------------------------
0 Bob 968
1 Jessica 155
2 Mary 77

In [151]:
data1.iloc[:3]

Unnamed: 0,Names,Births
0,Bob,968
1,Jessica,155
2,Mary,77


4. Births 열이 100보다 큰 데이터를 선택해 보세요.
Names Births
-------------------------------
0 Bob 968
1 Jessica 155
3 John 578
4 Mel 973

In [152]:
data1[data1['Births']> 100]

Unnamed: 0,Names,Births
0,Bob,968
1,Jessica,155
3,John,578
4,Mel,973


5. 다음과 같이 데이터 프레임(df)을 만들어 보세요.
인덱스는 이름으로 하세요. (서준, 우현, 인아)

수학 영어 음악 체육
서준 90 98 85 100
예현 80 89 95 90
인아 70 95 100 90
민아 60 95 100 90

In [153]:
df = pd.DataFrame(
    data ={'수학':[90,80,70,60],
           '영어':[98,89,95,95],
           '음악':[85,95,100,100],
           '체육':[100,90,90,90]},
    index =['서준','예현','인아','민아'],
    columns =['수학','영어','음악','체육'])
df

Unnamed: 0,수학,영어,음악,체육
서준,90,98,85,100
예현,80,89,95,90
인아,70,95,100,90
민아,60,95,100,90


6. 데이터프레임(df)를 복제하여 변수 df1에 저장

In [154]:
df1 = df
df1


Unnamed: 0,수학,영어,음악,체육
서준,90,98,85,100
예현,80,89,95,90
인아,70,95,100,90
민아,60,95,100,90


7. df1의 1개 행(row)을 삭제 ==> '예현'

In [155]:
df1 = df1.drop(['예현'] , inplace= True)
df1



8. 데이터프레임 df를 복제하여 변수 df2에 저장

In [156]:
df2=df
df2

Unnamed: 0,수학,영어,음악,체육
서준,90,98,85,100
인아,70,95,100,90
민아,60,95,100,90


9. df2의 1개 열(column)을 삭제 ==> '수학'

In [157]:
df2=df2.drop(['수학'], axis=1)
df2

Unnamed: 0,영어,음악,체육
서준,98,85,100
인아,95,100,90
민아,95,100,90


10. 데이터프레임 df를 복제하여 변수 df3에 저장

In [158]:
df3 =df

11. df3의 2개 열(column)을 삭제 ==> '영어' '음악'

In [159]:
df3=df3.drop(['영어','음악'],axis=1)
df3

Unnamed: 0,수학,체육
서준,90,100
인아,70,90
민아,60,90
