##### Pandas
- 데이터 분석을 위한 사용이 쉽고 성능이 좋은 오픈소스 python 라이브러리
- `$ pip3 install pandas`
- 크게 두가지의 데이터 타입이 있습니다.
    - Series
        - Index와 Value로 이루어진 데이터 타입입니다.
    - DataFrame
        - Index와 Value와 Column으로 이루어진 데이터 타입입니다.
        - Column은 Series로 이루어져 있습니다.
        - 엑셀의 테이블 형태로 구성이 되고, Column별로 같은 데이터 타입을 갖습니다.

In [33]:
import numpy as np
import pandas as pd

##### Series

In [34]:
# 0~9까지 랜덤한 5개의 데이터를 Series 생성
data = pd.Series(np.random.randint(10, size=(5)))
data

0    4
1    3
2    7
3    8
4    5
dtype: int64

In [35]:
# index 설정
data = pd.Series(np.random.randint(10, size=5), index=["A","B","C","D","E"])
data

A    5
B    3
C    8
D    3
E    3
dtype: int64

In [36]:
data.index, data.values

(Index(['A', 'B', 'C', 'D', 'E'], dtype='object'), array([5, 3, 8, 3, 3]))

In [37]:
# value 값 확인
data.A, data.D

(5, 3)

In [38]:
data = pd.Series(np.random.randint(10, size=5), index=["1","2","3","4","5"])
data

1    6
2    3
3    5
4    9
5    7
dtype: int64

In [39]:
data.1

SyntaxError: invalid syntax (<ipython-input-39-d42d59a15bba>, line 1)

In [40]:
data

1    6
2    3
3    5
4    9
5    7
dtype: int64

In [41]:
# Series와 인덱스에 각각 이름을 설정할 수 있습니다.
data.name = "random_number"
data.index.name = "index_number"
data

index_number
1    6
2    3
3    5
4    9
5    7
Name: random_number, dtype: int64

In [42]:
data = pd.Series(np.random.randint(10, size=5), index=["A","B","C","D","E"])
data

A    8
B    7
C    2
D    9
E    0
dtype: int64

In [43]:
data * 10

A    80
B    70
C    20
D    90
E     0
dtype: int64

In [44]:
data[["B","C","E"]]

B    7
C    2
E    0
dtype: int64

In [45]:
data[1::2]

B    7
D    9
dtype: int64

In [46]:
data[::-1]

E    0
D    9
C    2
B    7
A    8
dtype: int64

In [47]:
data > 5

A     True
B     True
C    False
D     True
E    False
dtype: bool

In [48]:
data[data > 5]

A    8
B    7
D    9
dtype: int64

In [49]:
# for문 사용 - list comprehension 으로도 사용이 가능
# [(idx, val) for idx, val in data.items()]
for idx, val in data.items():
    print(idx, val)

A 8
B 7
C 2
D 9
E 0


In [50]:
# dictionary 데이터 타입의 데이터로 series 생성 가능
dic = {"D":3, "F":7, "E":5}
data2 = pd.Series(dic)
data2

D    3
E    5
F    7
dtype: int64

In [51]:
data

A    8
B    7
C    2
D    9
E    0
dtype: int64

In [52]:
data2

D    3
E    5
F    7
dtype: int64

In [53]:
result = data + data2
result

A     NaN
B     NaN
C     NaN
D    12.0
E     5.0
F     NaN
dtype: float64

In [54]:
# NaN 데이터 제거
print(result.notnull())
result[result.notnull()]

A    False
B    False
C    False
D     True
E     True
F    False
dtype: bool


D    12.0
E     5.0
dtype: float64

##### Dataframe
- row(index), value, column으로 이루어져 있습니다.
- make
- insert
    - row
    - column
- append
- concat
- groupby, aggregate
- select
- merge

##### make

In [55]:
# 컬럼을 만들고 컬럼에 리스트 데이터를 추가해서 만드는 방법
df = pd.DataFrame(columns=["Email", "Name"])
df

Unnamed: 0,Email,Name


In [56]:
df["Name"] = ["fcamp", "dss"]
df["Email"] = ["fcamp@gmail.com", "dss@gmail.com"]
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [57]:
df["Name"]

0    fcamp
1      dss
Name: Name, dtype: object

In [58]:
df["Email"]

0    fcamp@gmail.com
1      dss@gmail.com
Name: Email, dtype: object

In [59]:
# 딕셔너리 데이터 타입을 Dataframe으로 만들기
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [60]:
dic

{'Name': ['fcamp', 'dss'], 'Email': ['fcamp@gmail.com', 'dss@gmail.com']}

In [61]:
# 인덱스를 추가해서 만들기
index_list = ["one", "two"]
df = pd.DataFrame(dic, index=index_list)
df

Unnamed: 0,Email,Name
one,fcamp@gmail.com,fcamp
two,dss@gmail.com,dss


In [62]:
df.index, df.columns, df.values

(Index(['one', 'two'], dtype='object'),
 Index(['Email', 'Name'], dtype='object'),
 array([['fcamp@gmail.com', 'fcamp'],
        ['dss@gmail.com', 'dss']], dtype=object))

##### Insert
- row
- column

In [63]:
# row
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [64]:
df.loc[0]

Email    fcamp@gmail.com
Name               fcamp
Name: 0, dtype: object

In [65]:
df.loc[1]

Email    dss@gmail.com
Name               dss
Name: 1, dtype: object

In [66]:
# loc 지정해서 데이터를 넣는 방법
df.loc[2] = {"Email":"data@gmail.com", "Name":"data"}
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data


In [67]:
# loc 이용해서 항상 가장 마지막에 넣는 방법
print(len(df))
df.loc[len(df)] = {"Email":"data2@gmail.com", "Name":"data2"}
df

3


Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data
3,data2@gmail.com,data2


In [68]:
# column
df["Address"] = ""
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,
1,dss@gmail.com,dss,
2,data@gmail.com,data,
3,data2@gmail.com,data2,


In [69]:
df["Address"] = ["Seoul", "Busan", "Jeju", "Deagu", "Seoul"]
df

ValueError: Length of values does not match length of index

In [70]:
# apply
# 함수를 사용해서 함수의 리턴값이 데이터로 들어갑니다.
def count_char(name):
    """Return ``name`` followed by its character count in parentheses.

    Example: ``"dss"`` becomes ``"dss(3)"``.
    """
    length = len(name)
    return "{}({})".format(name, length)

df["Name_Count"] = df["Name"].apply(count_char)
df

Unnamed: 0,Email,Name,Address,Name_Count
0,fcamp@gmail.com,fcamp,,fcamp(5)
1,dss@gmail.com,dss,,dss(3)
2,data@gmail.com,data,,data(4)
3,data2@gmail.com,data2,,data2(5)


In [71]:
df["Address_Count"] = df["Address"].apply(lambda addr:"{}({})".format(addr, len(addr)))
df

Unnamed: 0,Email,Name,Address,Name_Count,Address_Count
0,fcamp@gmail.com,fcamp,,fcamp(5),(0)
1,dss@gmail.com,dss,,dss(3),(0)
2,data@gmail.com,data,,data(4),(0)
3,data2@gmail.com,data2,,data2(5),(0)


In [72]:
# append
# 사람의 이름과 나이가 들어간 데이터를 만듭니다.
import random, string

def get_name():
    """Pick one first name at random from a fixed pool of ten names."""
    candidates = [
        "Adam", "Alan", "Alex", "Alvin", "Andrew",
        "Anthony", "Arnold", "Jin", "Billy", "Anchal",
    ]
    picked = random.choice(candidates)
    return picked
    
get_name()

'Andrew'

In [73]:
def get_age(start=20, end=40):
    """Return a random integer age between ``start`` and ``end``, both inclusive."""
    # randrange excludes its upper bound, so extend it by one to keep ``end`` reachable.
    return random.randrange(start, end + 1)

get_age()

28

In [74]:
# list
def make_data(rows=10):
    """Build ``rows`` random person records.

    Each record is a dict with an ``"Age"`` key (from ``get_age()``) and a
    ``"Name"`` key (from ``get_name()``); the records are returned as a list.
    """
    return [{"Age": get_age(), "Name": get_name()} for _ in range(rows)]

make_data()

[{'Age': 35, 'Name': 'Anchal'},
 {'Age': 34, 'Name': 'Anthony'},
 {'Age': 32, 'Name': 'Billy'},
 {'Age': 29, 'Name': 'Jin'},
 {'Age': 20, 'Name': 'Alan'},
 {'Age': 22, 'Name': 'Andrew'},
 {'Age': 29, 'Name': 'Billy'},
 {'Age': 35, 'Name': 'Billy'},
 {'Age': 28, 'Name': 'Jin'},
 {'Age': 34, 'Name': 'Arnold'}]

In [75]:
data1 = make_data()
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


In [76]:
data2 = make_data()
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Age,Name
0,27,Alvin
1,21,Jin
2,20,Billy
3,24,Adam
4,29,Adam
5,38,Alan
6,27,Alvin
7,34,Arnold
8,40,Alvin
9,24,Billy


In [77]:
# df1과 df2를 합치고 싶을 때 append를 이용할 수 있습니다. (DataFrame.append는 pandas 2.0에서 제거되었으므로, 최신 버전에서는 pd.concat([df1, df2])를 사용해야 합니다.)
df3 = df1.append(df2)
df3

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


In [78]:
# index 리셋하기
# drop(True) - 새롭게 생성되는 인덱스 컬럼을 삭제합니다.
# inplace(True) - 함수를 사용하는 객체 자체 인덱스를 리셋합니다.
df3.reset_index(drop=True, inplace=True)
df3

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


In [79]:
# append를 할때 인덱스를 리셋
df3 = df1.append(df2, ignore_index=True)
df3

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


##### concat
- rows
- columns

In [80]:
# concat rows
df1

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


In [81]:
df2

Unnamed: 0,Age,Name
0,27,Alvin
1,21,Jin
2,20,Billy
3,24,Adam
4,29,Adam
5,38,Alan
6,27,Alvin
7,34,Arnold
8,40,Alvin
9,24,Billy


In [82]:
df3 = pd.concat([df1, df2]).reset_index(drop=True)
df3

Unnamed: 0,Age,Name
0,32,Alex
1,30,Arnold
2,31,Andrew
3,29,Adam
4,33,Andrew
5,20,Arnold
6,38,Anchal
7,36,Adam
8,25,Andrew
9,33,Alan


In [83]:
# concat columns
# axis = 1 설정하면 가로로 합쳐집니다.
pd.concat([df3, df1], axis=1)

Unnamed: 0,Age,Name,Age.1,Name.1
0,32,Alex,32.0,Alex
1,30,Arnold,30.0,Arnold
2,31,Andrew,31.0,Andrew
3,29,Adam,29.0,Adam
4,33,Andrew,33.0,Andrew
5,20,Arnold,20.0,Arnold
6,38,Anchal,38.0,Anchal
7,36,Adam,36.0,Adam
8,25,Andrew,25.0,Andrew
9,33,Alan,33.0,Alan


In [84]:
df4 = pd.concat([df1, df3], axis=1, join='inner')
df4

Unnamed: 0,Age,Name,Age.1,Name.1
0,32,Alex,32,Alex
1,30,Arnold,30,Arnold
2,31,Andrew,31,Andrew
3,29,Adam,29,Adam
4,33,Andrew,33,Andrew
5,20,Arnold,20,Arnold
6,38,Anchal,38,Anchal
7,36,Adam,36,Adam
8,25,Andrew,25,Andrew
9,33,Alan,33,Alan


##### Group by
- 이름별 평균 나이를 나타내는 데이터 프레임을 만들겁니다.

In [85]:
# 20명에 대한 이름과 나이를 나타내는 데이터 프레임을 만듭니다.
g_df = pd.DataFrame(make_data(20))
g_df.tail()

Unnamed: 0,Age,Name
15,32,Anthony
16,37,Arnold
17,26,Alan
18,35,Alan
19,39,Adam


In [86]:
# 이름을 unique로 출력
result1 = np.array(list(set(g_df["Name"].values)))
len(result1), result1

(9, array(['Anthony', 'Alvin', 'Alex', 'Arnold', 'Billy', 'Anchal', 'Andrew',
        'Alan', 'Adam'], dtype='<U7'))

In [87]:
# pandas의 unique를 이용하여 유니크한 이름을 출력
result2 = g_df["Name"].unique()
len(result2), result2

(9, array(['Alan', 'Alvin', 'Andrew', 'Billy', 'Anchal', 'Anthony', 'Alex',
        'Arnold', 'Adam'], dtype=object))

In [88]:
# groupby - size
result_df = g_df.groupby("Name").size().reset_index(name="counts")
result_df

Unnamed: 0,Name,counts
0,Adam,2
1,Alan,4
2,Alex,3
3,Alvin,2
4,Anchal,1
5,Andrew,3
6,Anthony,2
7,Arnold,2
8,Billy,1


In [89]:
# sort values
result_df = result_df.sort_values(by=["counts"], ascending=False)
result_df.reset_index(drop=True, inplace=True)
result_df

Unnamed: 0,Name,counts
0,Alan,4
1,Alex,3
2,Andrew,3
3,Adam,2
4,Alvin,2
5,Anthony,2
6,Arnold,2
7,Anchal,1
8,Billy,1


In [90]:
# agg : min
# 나이가 제일 어린 나이로 name 그룹핑 합니다.
g_df.groupby("Name").agg("min").reset_index()

Unnamed: 0,Name,Age
0,Adam,22
1,Alan,23
2,Alex,32
3,Alvin,25
4,Anchal,29
5,Andrew,27
6,Anthony,20
7,Arnold,28
8,Billy,37


In [91]:
# 가장 나이가 많은 이름으로 그룹핑
g_df.groupby("Name").agg("max").reset_index()

Unnamed: 0,Name,Age
0,Adam,39
1,Alan,38
2,Alex,39
3,Alvin,36
4,Anchal,29
5,Andrew,39
6,Anthony,32
7,Arnold,37
8,Billy,37


In [92]:
# agg : mean
g_df.groupby("Name").agg("mean").reset_index()

Unnamed: 0,Name,Age
0,Adam,30.5
1,Alan,30.5
2,Alex,35.666667
3,Alvin,30.5
4,Anchal,29.0
5,Andrew,35.0
6,Anthony,26.0
7,Arnold,32.5
8,Billy,37.0


In [93]:
# agg : sum
g_df.groupby("Name").agg("sum").reset_index()

Unnamed: 0,Name,Age
0,Adam,61
1,Alan,122
2,Alex,107
3,Alvin,61
4,Anchal,29
5,Andrew,105
6,Anthony,52
7,Arnold,65
8,Billy,37


In [94]:
# agg : median
g_df.groupby("Name").agg("median").reset_index()

Unnamed: 0,Name,Age
0,Adam,30.5
1,Alan,30.5
2,Alex,36.0
3,Alvin,30.5
4,Anchal,29.0
5,Andrew,39.0
6,Anthony,26.0
7,Arnold,32.5
8,Billy,37.0


In [95]:
# agg으로 여러개 컬럼 생성
df = g_df.groupby("Name").agg(["min","max","mean"]).reset_index()
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,22,39,30.5
1,Alan,23,38,30.5
2,Alex,32,39,35.666667
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0
5,Andrew,27,39,35.0
6,Anthony,20,32,26.0
7,Arnold,28,37,32.5
8,Billy,37,37,37.0


In [96]:
# select

In [97]:
df.head()

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,22,39,30.5
1,Alan,23,38,30.5
2,Alex,32,39,35.666667
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0


In [98]:
df.tail(3)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
6,Anthony,20,32,26.0
7,Arnold,28,37,32.5
8,Billy,37,37,37.0


In [99]:
df.tail(n=7)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
2,Alex,32,39,35.666667
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0
5,Andrew,27,39,35.0
6,Anthony,20,32,26.0
7,Arnold,28,37,32.5
8,Billy,37,37,37.0


In [100]:
df[3:6]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0
5,Andrew,27,39,35.0


In [101]:
df[3:]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0
5,Andrew,27,39,35.0
6,Anthony,20,32,26.0
7,Arnold,28,37,32.5
8,Billy,37,37,37.0


In [102]:
df.loc[3]

Name          Alvin
Age   min        25
      max        36
      mean     30.5
Name: 3, dtype: object

In [103]:
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,22,39,30.5
1,Alan,23,38,30.5
2,Alex,32,39,35.666667
3,Alvin,25,36,30.5
4,Anchal,29,29,29.0
5,Andrew,27,39,35.0
6,Anthony,20,32,26.0
7,Arnold,28,37,32.5
8,Billy,37,37,37.0


In [104]:
df.loc[2]["Age"]["min"]

32

In [105]:
df.loc[3]["Name"][""]

'Alvin'

In [106]:
data = {
    "Name":df["Name"],
    "Min":df["Age"]["min"],
    "Max":df["Age"]["max"],
    "Mean":df["Age"]["mean"],
}
n_df = pd.DataFrame(data)
n_df

Unnamed: 0,Max,Mean,Min,Name
0,39,30.5,22,Adam
1,38,30.5,23,Alan
2,39,35.666667,32,Alex
3,36,30.5,25,Alvin
4,29,29.0,29,Anchal
5,39,35.0,27,Andrew
6,32,26.0,20,Anthony
7,37,32.5,28,Arnold
8,37,37.0,37,Billy


In [107]:
# 평균 나이가 30세를 초과하는 데이터를 내림차순으로 정렬하고 인덱스를 재설정
n_df[n_df["Mean"]>30].sort_values(by=["Mean"], ascending=False).reset_index(drop=True)

Unnamed: 0,Max,Mean,Min,Name
0,37,37.0,37,Billy
1,39,35.666667,32,Alex
2,39,35.0,27,Andrew
3,37,32.5,28,Arnold
4,39,30.5,22,Adam
5,38,30.5,23,Alan
6,36,30.5,25,Alvin


In [108]:
g_df

Unnamed: 0,Age,Name
0,38,Alan
1,25,Alvin
2,27,Andrew
3,36,Alvin
4,37,Billy
5,29,Anchal
6,20,Anthony
7,36,Alex
8,28,Arnold
9,39,Andrew


In [109]:
n_df

Unnamed: 0,Max,Mean,Min,Name
0,39,30.5,22,Adam
1,38,30.5,23,Alan
2,39,35.666667,32,Alex
3,36,30.5,25,Alvin
4,29,29.0,29,Anchal
5,39,35.0,27,Andrew
6,32,26.0,20,Anthony
7,37,32.5,28,Arnold
8,37,37.0,37,Billy


In [110]:
n_df["Count"] = list(g_df.groupby("Name").size())
n_df

Unnamed: 0,Max,Mean,Min,Name,Count
0,39,30.5,22,Adam,2
1,38,30.5,23,Alan,4
2,39,35.666667,32,Alex,3
3,36,30.5,25,Alvin,2
4,29,29.0,29,Anchal,1
5,39,35.0,27,Andrew,3
6,32,26.0,20,Anthony,2
7,37,32.5,28,Arnold,2
8,37,37.0,37,Billy,1


In [111]:
# drop - mean 데이터를 가장 뒤로 이동시키겠습니다.
mean = n_df["Mean"]
n_df.drop("Mean", axis=1, inplace=True)
n_df

Unnamed: 0,Max,Min,Name,Count
0,39,22,Adam,2
1,38,23,Alan,4
2,39,32,Alex,3
3,36,25,Alvin,2
4,29,29,Anchal,1
5,39,27,Andrew,3
6,32,20,Anthony,2
7,37,28,Arnold,2
8,37,37,Billy,1


In [112]:
n_df["Mean"] = mean

In [None]:
n_df

In [None]:
# rename column
n_df.rename(columns={"Max":"Maximum","Name":"Unique_Name"})

##### Merge = sql(join)
- user_df : 아이디, 이름, 나이 데이터 프레임 생성
- money_df : 아이디, 돈 데이터 프레임을 생성

In [114]:
user_df = pd.DataFrame(columns=["UserID", "Name", "Age"])
for idx in range(1,9):
    name = get_name()
    
    # 중복 이름 제거
    while name in list(user_df["Name"]):
        name = get_name()
        
    # user_df에 데이터 insert
    data = {"Name":name, "UserID":idx, "Age":get_age()}
    user_df.loc[len(user_df)] = data
    
user_df

Unnamed: 0,UserID,Name,Age
0,1,Adam,40
1,2,Anthony,30
2,3,Alvin,29
3,4,Alex,31
4,5,Billy,36
5,6,Alan,38
6,7,Andrew,38
7,8,Arnold,21


In [115]:
# 중복되는 이름 없이 아이디, 이름, 나이 데이터가 포함된 DF 생성

user_df = pd.DataFrame(columns = ["UserID", "Name", "Age"])

for idx in range(1, 11):
    name = get_name()
    
    # 중복 이름 제거
    while name in list(user_df["Name"]):
        name = get_name()
        
    # user_df에 데이터 insert
    
    data = {"Name":name, "UserID":idx, "Age":get_age()}
    user_df.loc[len(user_df)] = data

user_df

Unnamed: 0,UserID,Name,Age
0,1,Jin,36
1,2,Andrew,22
2,3,Alan,39
3,4,Anthony,22
4,5,Arnold,33
5,6,Anchal,25
6,7,Billy,24
7,8,Alvin,22
8,9,Adam,40
9,10,Alex,35


In [None]:
money_df = pd.DataFrame(columns = ["ID", "Money"])

for idx in range(15):
    money = random.randint(1, 20) * 1000
    data = {"Money":money, "ID":random.randint(1,8)}
    money_df.loc[len(money_df)] = data
    
money_df

In [None]:
# Merge가 동작하는 방식

# step1: 두 데이터 프레임의 모든 행 조합(n * m)을 만든다
# step2: ID가 같은 데이터만 출력한다


In [None]:
# merge - user_df, money_df - keys: ID (money_df) and UserID (user_df)
# Merge with the money data on the left side.
money_df.merge(user_df, left_on="ID", right_on="UserID")

In [None]:
# user 데이터 기준으로 merge
user_df.merge(money_df, left_on="UserID", right_on="ID")

In [None]:
# left_on / right_on 사용하지 않기

user_df.rename(columns = {"UserID":"ID"}, inplace = True)

In [None]:
result_df = pd.merge(money_df, user_df)
result_df

In [None]:
# money data 활용

money_list = result_df.groupby("Name").sum()["Money"].reset_index()
money_list

In [None]:
# merge - outer
# fillna - NaN을 특정 데이터로 채워주는 함수
result = pd.merge(user_df, money_list, how = 'outer').fillna(value = 0)
result

In [None]:
# change data type 

# Series 별로 dtype을 바꿀 수 있기 때문이다

result["Money"] = result["Money"].astype("int")
result

#### Dataframe Input / Output 

- csv : DB 데이터를 뽑아낼 때 sql으로도 뽑아내지만, csv 파일로 얻는 경우가 많다
- excel
- excel 파일을 읽고 쓰기 위해서는 xlrd, openpyxl을 설치해야 한다

In [None]:
# save csv

"""
result.to_csv('{filename}.csv', index = False) # index = True 인 경우 index 값이 추가되어 저장된다
"""

In [None]:
# load csv

df = pd.read_csv('surfing_data.csv')
df

In [None]:
# excel은 utf8을 사용하지 않아, 저장하고 불러올 때 인코딩 타입을 확인해야 한다

df.to_excel('surfing_data.csv', sheet_name = 'Sheet1')

In [None]:
df.to_excel?

In [None]:
df.pd.read_excel('surfing_data.csv', sheet_name = 'Sheet1')

#### 두진 강사님 

In [117]:
money_df = pd.DataFrame(columns=["ID", "Money"])

for idx in range(15):
    money = random.randint(1, 20) * 1000
    data = {"Money":money, "ID":random.randint(1, 8)}
    money_df.loc[len(money_df)] = data
    
money_df    

Unnamed: 0,ID,Money
0,1,8000
1,8,8000
2,8,1000
3,8,3000
4,5,2000
5,2,1000
6,6,7000
7,7,20000
8,2,18000
9,2,12000


In [118]:
# merge - user_df, money_df - key:ID, UserID
# money 데이터 기준으로 merge
money_df.merge(user_df, left_on="ID", right_on="UserID")

Unnamed: 0,ID,Money,UserID,Name,Age
0,1,8000,1,Jin,36
1,8,8000,8,Alvin,22
2,8,1000,8,Alvin,22
3,8,3000,8,Alvin,22
4,5,2000,5,Arnold,33
5,5,8000,5,Arnold,33
6,2,1000,2,Andrew,22
7,2,18000,2,Andrew,22
8,2,12000,2,Andrew,22
9,2,6000,2,Andrew,22


In [119]:
# user 데이터 기준으로 merge
user_df.merge(money_df, left_on="UserID", right_on="ID")

Unnamed: 0,UserID,Name,Age,ID,Money
0,1,Jin,36,1,8000
1,2,Andrew,22,2,1000
2,2,Andrew,22,2,18000
3,2,Andrew,22,2,12000
4,2,Andrew,22,2,6000
5,2,Andrew,22,2,5000
6,4,Anthony,22,4,15000
7,4,Anthony,22,4,7000
8,5,Arnold,33,5,2000
9,5,Arnold,33,5,8000


In [120]:
user_df.rename(columns={"UserID":"ID"}, inplace=True)
user_df

Unnamed: 0,ID,Name,Age
0,1,Jin,36
1,2,Andrew,22
2,3,Alan,39
3,4,Anthony,22
4,5,Arnold,33
5,6,Anchal,25
6,7,Billy,24
7,8,Alvin,22
8,9,Adam,40
9,10,Alex,35


In [None]:
result_df = pd.merge(money_df, user_df)
result_df

In [None]:
money_list = result_df.groupby("Name").sum()["Money"].reset_index()
money_list

In [None]:
df

In [None]:
df1

In [None]:
# 38세 데이터만 필터링
df1[df1["Age"] == 38]

In [None]:
money_list

In [None]:
# merge - outer
# fillna - NaN을 특정 데이터로 채워줍니다.
result = pd.merge(user_df, money_list, how='outer').fillna(value=0)
result

In [None]:
# change data type
result["Money"] = result["Money"].astype("int")
result

##### Dataframe Input / Output
- csv, excel
- `$ pip3 install xlrd`
- `$ pip3 install openpyxl`

In [None]:
result

In [None]:
# save csv
result.to_csv('foo.csv', index=False)

In [None]:
# load csv
df = pd.read_csv('foo.csv')
df

In [None]:
!pwd

In [None]:
# excel 은 저장 되는 인코딩 타입을 확인해야 합니다.(utf-8을 사용하지 않습니다.)
df.to_excel('../ttt/foo.xlsx', sheet_name='Sheet1')

In [None]:
path = "/Users/rada/Documents/fastcampus/dss8/02_numpy_pandas/B/test/"
df.to_excel(path+'foo.xlsx', sheet_name='Sheet1')

In [121]:
# load excel
df = pd.read_excel('foo.xlsx', 'Sheet1')
df

Unnamed: 0,ID,Name,Age,Money
0,1,Anchal,25,3000
1,2,Alan,21,20000
2,3,Jin,39,0
3,4,Billy,40,58000
4,5,Alex,27,24000
5,6,Andrew,26,5000
6,7,Adam,27,6000
7,8,Arnold,23,41000
