# 240930

# 판다스 데이터프레임
- 데이터프레임은 리스트 또는 딕셔너리로 생성할 수 있다.
- 딕셔너리 형태는 키가 곧 컬럼명(colnames)이 되어 별도로 지정할 필요가 없으므로 더 편리하다.

In [95]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-specific module).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import pandas as pd
import numpy as np

print(np.__version__)
print(pd.__version__)
# Always check the library versions before starting.

1.26.4
2.1.4


In [6]:
# Build a DataFrame from a list of rows.
# Column names must be supplied separately; without them pandas falls back to the default integer labels 0..n-1.

data = [["3598","alpha", 123456, 10.05],
        ["3565","beta", 112343, 1.05],
        ["8402","delta", 876723, 1.25],
        ["7272","gamma", 815474, 12.25]]

# Column names, one per field of each row
cols = ["stock_code", "stock_name", "current_price", "up & downs"]
df = pd.DataFrame(data, columns=cols)
df

Unnamed: 0,stock_code,stock_name,current_price,up & downs
0,3598,alpha,123456,10.05
1,3565,beta,112343,1.05
2,8402,delta,876723,1.25
3,7272,gamma,815474,12.25


In [11]:
# Dictionary approach: each key becomes a column name, so no separate column list is needed.

data2 = {'stock_code': [3598, 3565, 8402, 7272],
         'stock_name': ['alpha', 'beta', 'delta', 'gamma'],
         'current_price': [123456, 112343, 876723, 815474],
         "up & downs": [10.05,1.05,1.25,12.25]}

df2 = pd.DataFrame(data2); df2

Unnamed: 0,stock_code,stock_name,current_price,up & downs
0,3598,alpha,123456,10.05
1,3565,beta,112343,1.05
2,8402,delta,876723,1.25
3,7272,gamma,815474,12.25


## Sample Code & 메서드


In [13]:
import pandas as pd
import random
import string


# Column-oriented container: every column (종목코드, 종목명, 현재가, 등락률)
# starts out with its own independent, empty list of values.
data_dict = {column: [] for column in ("종목코드", "종목명", "현재가", "등락률")}

# Function to generate simpler 종목코드 and 종목명 ensuring the 종목코드 starts with '0'
def generate_code_name_for_dict(existing_codes):
    """Draw random (code, name) pairs until the code is not already taken.

    Args:
        existing_codes: Container of codes already issued; only membership
            (``in``) is checked, the container itself is not modified.

    Returns:
        A ``(code, name)`` tuple. ``code`` is ``'0'`` followed by five random
        digits and is not present in ``existing_codes``; ``name`` is two
        random uppercase ASCII letters.
    """
    candidate = None
    while candidate is None or candidate in existing_codes:
        # Digits are drawn before letters on every attempt, accepted or not.
        digits = random.choices(string.digits, k=5)
        letters = random.choices(string.ascii_uppercase, k=2)
        candidate = '0' + ''.join(digits)
        label = ''.join(letters)
    return candidate, label

# Generate 10,000 rows; the set remembers issued codes so every 종목코드 is unique.
existing_codes_for_dict = set()

for _ in range(10000):
    code, name = generate_code_name_for_dict(existing_codes_for_dict)
    existing_codes_for_dict.add(code)
    # Price is drawn before the change rate on every iteration.
    current_price = random.randint(1000, 1000000)
    change_rate = round(random.uniform(-5, 5), 2)

    # Append this row's values to their columns, one column at a time.
    row = (code, name, current_price, change_rate)
    for column, value in zip(("종목코드", "종목명", "현재가", "등락률"), row):
        data_dict[column].append(value)

# Keep only the first five entries of each column for a quick preview.
data_dict_preview = {column: values[:5] for column, values in data_dict.items()}

pd.DataFrame(data_dict_preview)

Unnamed: 0,종목코드,종목명,현재가,등락률
0,91774,NY,168281,-3.17
1,96693,AI,218182,-3.43
2,20177,WL,82960,0.58
3,60139,ZY,334177,2.55
4,11894,CI,697324,3.96


In [14]:
df2.head() # show the first n rows (n defaults to 5 when omitted)

Unnamed: 0,stock_code,stock_name,current_price,up & downs
0,3598,alpha,123456,10.05
1,3565,beta,112343,1.05
2,8402,delta,876723,1.25
3,7272,gamma,815474,12.25


In [15]:
df2.tail() # show the last n rows (n defaults to 5 when omitted)

Unnamed: 0,stock_code,stock_name,current_price,up & downs
0,3598,alpha,123456,10.05
1,3565,beta,112343,1.05
2,8402,delta,876723,1.25
3,7272,gamma,815474,12.25


In [16]:
df2.info() # concise summary: index range, column dtypes, non-null counts, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   stock_code     4 non-null      int64  
 1   stock_name     4 non-null      object 
 2   current_price  4 non-null      int64  
 3   up & downs     4 non-null      float64
dtypes: float64(1), int64(2), object(1)
memory usage: 256.0+ bytes


In [18]:
df2.describe() # descriptive statistics (count, mean, std, quartiles) of the numeric columns

Unnamed: 0,stock_code,current_price,up & downs
count,4.0,4.0,4.0
mean,5709.25,481999.0,6.15
std,2499.884981,421193.246616,5.843515
min,3565.0,112343.0,1.05
25%,3589.75,120677.75,1.2
50%,5435.0,469465.0,5.65
75%,7554.5,830786.25,10.6
max,8402.0,876723.0,12.25


In [24]:
# Rebind data2/df2 to a new example with Korean column names.
# The 종목코드 values are strings, which keeps the leading zero of each code.
data2 = {
    "종목코드" : ['039900', '039910', '039920'],
    "종목명" : ["알파코", "A", "B"],
    "현재가" : [10000000, 500000, 1000],
    "등락률" : [10.05, 1.05, 1.28]
}

df2 = pd.DataFrame(data = data2)
df2

Unnamed: 0,종목코드,종목명,현재가,등락률
0,39900,알파코,10000000,10.05
1,39910,A,500000,1.05
2,39920,B,1000,1.28


In [25]:
# Promote the 종목코드 column to the row index; set_index returns a new frame, hence the reassignment.
df2 = df2.set_index('종목코드')
df2

Unnamed: 0_level_0,종목명,현재가,등락률
종목코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39900,알파코,10000000,10.05
39910,A,500000,1.05
39920,B,1000,1.28


In [37]:
from pandas import DataFrame

data = [
    ["알파코", 10000000, 10.05],
    ["A", 500000, 1.05],
    ["B", 1000, 1.28]
]

# Index labels and column names to use
# NOTE(review): "039900" appears twice, so the row index is not unique —
# possibly a typo for "039920"; confirm before relying on label lookups.
index = ["039900", "039910", "039900"]
columns = ["종목명", "현재가", "등락률"]

# Pass the data, the index, and the column names as three separate arguments.
df = DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,종목명,현재가,등락률
39900,알파코,10000000,10.05
39910,A,500000,1.05
39900,B,1000,1.28


In [38]:
df.index.shape # shape of the row index: a 1-tuple holding the number of rows

(3,)

In [39]:
df.values # raw view: only the values, as a NumPy array without index or column labels

array([['알파코', 10000000, 10.05],
       ['A', 500000, 1.05],
       ['B', 1000, 1.28]], dtype=object)

In [40]:
df.현재가 # attribute-style column access; returns the column as a Series

Unnamed: 0,현재가
39900,10000000
39910,500000
39900,1000


In [41]:
df['현재가'] # bracket access with a single label; also returns a Series

Unnamed: 0,현재가
39900,10000000
39910,500000
39900,1000


In [42]:
df[['현재가']] # a list of labels returns a DataFrame, even when it names only one column

Unnamed: 0,현재가
39900,10000000
39910,500000
39900,1000


## 로우 인덱싱
- loc, iloc
- loc : 레이블(label)을 기준으로 인덱싱
- iloc : 정수 위치(integer position)를 기준으로 인덱싱

In [44]:
import seaborn as sns
print(sns.__version__)

0.13.1


In [45]:
# Load the iris example dataset through seaborn and preview the first rows.
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [49]:
# iris.loc[rows, columns] — both selected by label; [0, 9] is a list of two labels, not a range.
iris.loc[[0,9], ["sepal_width", "petal_width"]]

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
9,3.1,0.1


In [57]:
iris.iloc[[0,9], [1,3]] # Which columns are 1 and 3? Positional indexing is hard to follow unless you already know the column order.

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
9,3.1,0.1


In [55]:
iris.loc[iris['sepal_width'] >= 4.0, :] # boolean-mask row filter; the result keeps the original row labels, which can be inconvenient

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
15,5.7,4.4,1.5,0.4,setosa
32,5.2,4.1,1.5,0.1,setosa
33,5.5,4.2,1.4,0.2,setosa


In [58]:
iris.loc[iris['sepal_width'] >= 4.0, :].reset_index(drop=True) # renumber the filtered rows from 0 and discard the original labels

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.8,4.0,1.2,0.2,setosa
1,5.7,4.4,1.5,0.4,setosa
2,5.2,4.1,1.5,0.1,setosa
3,5.5,4.2,1.4,0.2,setosa


## 다중조건
- & : and 연산자
- | : or 연산자

*괄호로 각 조건문들을 구분하는 걸 잊지 말자!*


In [70]:
# Keep rows that are setosa AND have petal_width < 0.4 AND petal_length == 1.4.
# Each comparison is parenthesised because & binds tighter than == and <.
result = iris.loc[(iris['species'] == "setosa") &
                  (iris['petal_width'] < 0.4) &
                  (iris['petal_length'] == 1.4), :].reset_index(drop = True)
result.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,5.0,3.6,1.4,0.2,setosa
3,4.6,3.4,1.4,0.3,setosa
4,4.4,2.9,1.4,0.2,setosa


## 컬럼 추가 및 삭제


In [72]:
iris2 =iris.copy() # work on a copy so the original iris frame stays unchanged

In [73]:
# Assigning a scalar creates the column with that value in every row.
iris2['newCol'] = 0
# Element-wise product of two existing columns becomes a new column.
iris2['sepals'] = iris2['sepal_length'] * iris2['sepal_width']
iris2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,newCol,sepals
0,5.1,3.5,1.4,0.2,setosa,0,17.85
1,4.9,3.0,1.4,0.2,setosa,0,14.7
2,4.7,3.2,1.3,0.2,setosa,0,15.04
3,4.6,3.1,1.5,0.2,setosa,0,14.26
4,5.0,3.6,1.4,0.2,setosa,0,18.0


In [84]:
# drop(..., axis=1) returns a new frame without the "sepals" column; reassign to keep the result.
iris2 = iris2.drop("sepals", axis = 1); iris2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,newCol
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,0
146,6.3,2.5,5.0,1.9,virginica,0
147,6.5,3.0,5.2,2.0,virginica,0
148,6.2,3.4,5.4,2.3,virginica,0


In [79]:
from pandas import DataFrame

data = [
    ["알파코", 10000000, 10.05],
    ["A", 500000, 1.05],
    ["B", 1000, 1.28]
]

# Index labels and column names to use
# NOTE(review): "039900" appears twice, so the row index is not unique —
# possibly a typo for "039920"; confirm before relying on label lookups.
index = ["039900", "039910", "039900"]
columns = ["종목명", "현재가", "등락률"]

# Pass the data, the index, and the column names as three separate arguments.
df = DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,종목명,현재가,등락률
39900,알파코,10000000,10.05
39910,A,500000,1.05
39900,B,1000,1.28


In [80]:
# inplace=True removes the column from df itself, so no reassignment is needed.
df.drop('종목명', axis = 1, inplace = True)
df

Unnamed: 0,현재가,등락률
39900,10000000,10.05
39910,500000,1.05
39900,1000,1.28


In [85]:
# Prices are given as strings with thousands separators (e.g. "1,000").
data = [
    ["1,000", "1,100", '1,510'],
    ["1,410", "1,420", '1,790'],
    ["850", "900", '1,185'],
]
# Column labels 03/02..03/04 (presumably dates — confirm).
columns = ["03/02", "03/03", "03/04"]
df = DataFrame(data=data, columns=columns)
# info() reports every column as object dtype because the values are strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   03/02   3 non-null      object
 1   03/03   3 non-null      object
 2   03/04   3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


## 데이터 불러오기

In [None]:
# pip install openpyxl
# import openpyxl
# openpyxl.__version__

In [91]:
# NOTE(review): to_csv writes CSV text, so this file is CSV despite its .xlsx name;
# a real Excel workbook would need to_excel (with openpyxl). Consider renaming to .csv.
iris.to_csv('iris_240930.xlsx', index = False) # write to file (CSV here; Excel output is also possible); index=False omits the row labels

In [92]:
# read_csv parses the file as CSV regardless of its .xlsx extension.
df = pd.read_csv("iris_240930.xlsx") # read the file back (pandas can read CSV, Excel, and other formats)
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [93]:
# 파이썬을 통해 현재 작업 경로 확인하기


In [96]:
pwd

'/content'