# 넘파이(Numpy)
### 넘파이 라이브러리 
### 넘파이 라이브러리의 주요기능과 문법
### 인공지능 모델 개발에 필요한 기능

### 넘파이
- 넘파이는 수치 계산에 특화된 라이브러리
- 다차원 배열 객체인 ndarray를 중심으로 작동
  - 고성능 다차원 배열 객체를 제공하며, 요소의 데이터 타입을 통일
- 다양한 함수 지원
  - 배열 연산, 선형 대수, 푸리에 변환, 난수 생성 등 다양한 수학적 기능을 제공

In [2]:
import numpy as np

In [3]:
arr = np.array([1, 2, 3, 4, 5])
arr + 10

array([11, 12, 13, 14, 15])

In [5]:
arr2 = np.array([ [1,2], [3,4] ])
print(arr2)

[[1 2]
 [3 4]]


In [6]:
zeros = np.zeros((2, 3))
print(zeros)

[[0. 0. 0.]
 [0. 0. 0.]]


In [7]:
# Sequence generation: consecutive numbers from 1 up to (not including) 10,
# advancing by a step of 2.
seq1 = np.arange(start=1, stop=10, step=2)
print(seq1)

[1 3 5 7 9]


In [8]:
# Evenly spaced values inside a range: 5 points from 0 to 1, both ends included.
linspace = np.linspace(start=0, stop=1, num=5)
print(linspace)

[0.   0.25 0.5  0.75 1.  ]


In [9]:
#배열의 속성 확인
arr = np.array([[1,2,3], [4,5,6]])

In [10]:
# Show the array's basic attributes, one per line: its shape tuple,
# number of dimensions, total element count, and element dtype.
print(arr.shape, arr.ndim, arr.size, arr.dtype, sep='\n')

(2, 3)
2
6
int64


In [11]:
#인덱싱
arr = np.array([10, 20, 30, 40])

In [12]:
arr[0]

np.int64(10)

In [14]:
print(arr[0])

10


In [15]:
arr = np.array([[1,2,3], [4,5,6]])

In [16]:
print(arr)

[[1 2 3]
 [4 5 6]]


In [17]:
#arr[][] - 앞[] : 행, 뒤[] : 열
print(arr[0][0])

1


In [18]:
print(arr[1][2])

6


In [19]:
print(arr[-1][-1])

6


In [20]:
# 슬라이싱
arr = np.array([10,20,30,40,50])


In [21]:
print(arr[1:4])
print(arr[:3])
print(arr[::2])

[20 30 40]
[10 20 30]
[10 30 50]


In [22]:
arr = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(arr)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [24]:
print(arr[:2, 1:])

[[2 3]
 [5 6]]


In [25]:
print(arr[1:, :2])

[[4 5]
 [7 8]]


In [26]:
# 배열의 연산
arr1 = np.array([1,2,3])
arr2 = np.array([4,5,6])

In [27]:
print(arr1+arr2)

[5 7 9]


In [28]:
arr = np.array([[1,2,3],[4,5,6]])

In [29]:
print(arr+10)

[[11 12 13]
 [14 15 16]]


In [30]:
# 구조 변경 : 2 -> 1 , 1 -> 2
arr = np.array([1,2,3,4,5,6])

In [31]:
reshape = arr.reshape(2,3)
print(reshape)

[[1 2 3]
 [4 5 6]]


In [32]:
# Flatten the 2-D array produced by arr.reshape(2, 3) back into one dimension.
# Flattening `arr` itself would demonstrate nothing: `arr` is already 1-D.
flat_arr = reshape.flatten()
print(flat_arr)

[1 2 3 4 5 6]


In [33]:
arr = np.array([1,2,3,4,5,6])

In [34]:
filter_arr = arr[arr > 3]
print(filter_arr)

[4 5 6]


In [35]:
# 랜덤 생성
ran_val = np.random.rand(5)
print(ran_val)

[0.4929933  0.68321652 0.74530296 0.38923982 0.24806386]


In [36]:
# Draw a single random integer from the half-open range [1, 10).
ran_int = np.random.randint(low=1, high=10)
print(ran_int)

9


In [37]:
# Draw a 1 x 10 array of random integers from the half-open range [1000, 10000).
ran_int = np.random.randint(low=1000, high=10000, size=(1, 10))
print(ran_int)

[[7234 4549 9205 7128 1009 3492 9810 4558 8533 5542]]


# 판다스 (Pandas) 라이브러리

### Pandas
- 판다스는 데이터 분석과 조작에 특화된 라이브러리
- 구조화된 데이터를 처리하는 데 유용
  - Series와 DataFrame이라는 두 가지 데이터 구조를 중심으로 작동
- 구조화된 데이터 처리
  - 행과 열 기반의 데이터 프레임 구조를 사용
- 다양한 데이터 소스 지원
  - CSV, Excel, SQL, JSON 등 다양한 형식의 데이터를 읽고 쓰기
- 편리한 데이터 조작
  - 필터링, 그룹화, 집계, 결측 값 처리 등을 간단히 수행

In [39]:
import pandas as pd

In [57]:
# DataFrame: a two-dimensional, table-like structure (rows and named columns).
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data=data)

In [41]:
print(df.head()) # 처음 5개 행 출력

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [42]:
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [43]:
df.tail(n=2)

Unnamed: 0,Name,Age
1,Bob,30
2,Charlie,35


In [45]:
filter_df = df[df['Age'] > 25]

In [46]:
print(filter_df)

      Name  Age
1      Bob   30
2  Charlie   35


In [47]:
print(df['Age'].sum())

90


In [48]:
print(df['Age'].mean())

30.0


In [50]:
# Series
# A one-dimensional structure that pairs each value with an index label.
s1 = pd.Series(data=[100, 200, 300], index=['a', 'b', 'c'])
print(s1)

a    100
b    200
c    300
dtype: int64


In [51]:
print(s1['b'])
print(s1.index)
print(s1.values)

200
Index(['a', 'b', 'c'], dtype='object')
[100 200 300]


In [52]:
# 데이터의 사이즈
print(df.shape)

(3, 2)


In [53]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes
None


In [54]:
print(df.describe())

        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0


In [55]:
# Select a single column by its label.
# The DataFrame defines this column as 'Name'; the misspelled key 'Nmae'
# does not exist in it and raises KeyError.
print(df['Name'])

0      Alice
1        Bob
2    Charlie
Name: Nmae, dtype: object


In [58]:
print(df[['Name', 'Age']])

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [60]:
df[df['Age'] > 25]

Unnamed: 0,Name,Age
1,Bob,30
2,Charlie,35


In [122]:
data = {
    'name' : ['son', 'king', 'scott'],
    'age' : [30, 40, 25],
    'score' : [90, 85, 95]
    }
df = pd.DataFrame(data)

In [104]:
chk_score = df[df['score'] >= 90]
print(chk_score)

    name  age  score
0    son   30     90
2  scott   25     95


In [105]:
# 열(컬럼) 추가
df['passed'] = df['score'] >= 90

In [106]:
df

Unnamed: 0,name,age,score,passed
0,son,30,90,True
1,king,40,85,False
2,scott,25,95,True


In [107]:
# Adding a row: build a one-row DataFrame that holds the new record.
# (It is joined onto df with pd.concat; pandas has no pd.append function.)
new_row = {
    'name': ['smith'],
    'age': [23],
    'score': [88],
    'passed': [False],
}
df2 = pd.DataFrame(data=new_row)

In [108]:
df = pd.concat([df, df2], ignore_index=True)

In [109]:
df

Unnamed: 0,name,age,score,passed
0,son,30,90,True
1,king,40,85,False
2,scott,25,95,True
3,smith,23,88,False


In [110]:
# 데이터 수정
df.loc[0, 'score'] = 95

In [111]:
df

Unnamed: 0,name,age,score,passed
0,son,30,95,True
1,king,40,85,False
2,scott,25,95,True
3,smith,23,88,False


In [112]:
df['passed'] = df['score'] >= 80

In [113]:
df

Unnamed: 0,name,age,score,passed
0,son,30,95,True
1,king,40,85,True
2,scott,25,95,True
3,smith,23,88,True


In [114]:
df.loc[1, 'score'] = None

In [115]:
df

Unnamed: 0,name,age,score,passed
0,son,30,95.0,True
1,king,40,,True
2,scott,25,95.0,True
3,smith,23,88.0,True


In [116]:
print(df.isnull())

    name    age  score  passed
0  False  False  False   False
1  False  False   True   False
2  False  False  False   False
3  False  False  False   False


In [117]:
df = df.fillna(0)

In [118]:
df

Unnamed: 0,name,age,score,passed
0,son,30,95.0,True
1,king,40,0.0,True
2,scott,25,95.0,True
3,smith,23,88.0,True


In [119]:
df.loc[1, 'score'] = None

In [120]:
df = df.dropna()

In [121]:
df

Unnamed: 0,name,age,score,passed
0,son,30,95.0,True
2,scott,25,95.0,True
3,smith,23,88.0,True


In [123]:
sorted_df = df.sort_values(by='score')
print(sorted_df)

    name  age  score
1   king   40     85
0    son   30     90
2  scott   25     95


In [124]:
sorted_df = df.sort_values(by='score', ascending=False)
print(sorted_df)

    name  age  score
2  scott   25     95
0    son   30     90
1   king   40     85


In [125]:
# Grouping ("per <key>"): sample scores to be aggregated per team.
data = {
    'team': ['A', 'A', 'B', 'B', 'C'],
    'score': [85, 90, 78, 88, 70],
}

In [95]:
df = pd.DataFrame(data)

In [96]:
df

Unnamed: 0,team,score
0,A,85
1,A,90
2,B,78
3,B,88
4,C,70


In [97]:
group_data = df.groupby('team')['score'].mean()
print(group_data)

team
A    87.5
B    83.0
C    70.0
Name: score, dtype: float64


In [98]:
# CSV파일로 저장
df.to_csv('output.csv', index=False)

In [99]:
# group_data is a Series whose index holds the team labels. Keep the index
# so result.csv records which team each mean score belongs to; with
# index=False only the bare scores would be written.
group_data.to_csv('result.csv')

# 데이터 수집 개요 및 HTTP 요청
### 데이터 수집 방법 개요
### HTTP 요청과 응답 이해 (GET, POST 등)
### requests 모듈을 활용한 데이터 수집

In [126]:
pip install requests

Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl.metadata (34 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2024.12.14-py3-none-any.whl.metadata (2.3 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading certifi-2024.12.14-py3-none-any.whl (164 kB)
Downloading charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl (102 kB)
Downloading idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.2.3-py3-none-any.whl (126 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2024.12.14 charset-normalizer-3.4.0 idna-3.10 requests-2.32.3 ur


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
url = 'https://jsonplaceholder.typicode.com/posts'

In [3]:
import requests
# Always pass a timeout: without one, requests may wait indefinitely on a
# server that never answers.
response = requests.get(url, timeout=10)

In [4]:
response

<Response [200]>

In [5]:
type(response.text)

str

In [6]:
# Report the outcome of the request: for any status other than 200 show the
# code; on success show the first 50 characters of the response body.
if response.status_code != 200:
    print(f'오류 발생 : {response.status_code}')
else:
    print('응답 성공')
    print(response.text[:50])

응답 성공
[
  {
    "userId": 1,
    "id": 1,
    "title": "


In [7]:
data_obj = response.json()

In [8]:
type(data_obj)

list

In [9]:
# Print a 10-character preview of each post's title and body, labelled
# with the field name.
for post in data_obj:
    for field in ('title', 'body'):
        print(field, post[field][:10])

title sunt aut f
body quia et su
title qui est es
body est rerum 
title ea molesti
body et iusto s
title eum et est
body ullam et s
title nesciunt q
body repudianda
title dolorem eu
body ut asperna
title magnam fac
body dolore pla
title dolorem do
body dignissimo
title nesciunt i
body consectetu
title optio mole
body quo et exp
title et ea vero
body delectus r
title in quibusd
body itaque id 
title dolorum ut
body aut dicta 
title voluptatem
body fuga et ac
title eveniet qu
body reprehende
title sint susci
body suscipit n
title fugit volu
body eos volupt
title voluptate 
body eveniet qu
title adipisci p
body illum quis
title doloribus 
body qui conseq
title asperiores
body repellat a
title dolor sint
body eos qui et
title maxime id 
body veritatis 
title autem hic 
body enim et ex
title rem alias 
body ullam cons
title est et qua
body similique 
title quasi id e
body eum sed do
title delectus u
body non et qua
title iusto eius
body odit magna
title a quo magn
body alias dolo
title ulla

In [10]:
pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from bs4 import BeautifulSoup

In [11]:
html_doc = """
<html>
    <head>
        <title>Example Page</title>
    </head>
    <body>
        <h1>Welcome to Web Scraping</h1>
        <p class="description">This is an example paragraph.</p>
        <a href="https://example.com1">Visit Example1</a>
        <a href="https://example.com2">Visit Example2</a>
        <a href="https://example.com3">Visit Example3</a>
        <a href="https://example.com4">Visit Example4</a>
    </body>
</html>
"""

In [12]:
# BeautifulSoup 객체 생성
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# 태그 탐색
print(soup.title.string)

Example Page


In [13]:
# Print the text of the <h1> tag, then the href attribute of an <a> tag.
print(soup.h1.string)
print(soup.a['href']) # returns the first <a> tag

Welcome to Web Scraping
https://example.com1


In [9]:
print(soup.p)
print(soup.p['class'])

<p class="description">This is an example paragraph.</p>
['description']


In [10]:
type(soup.a['href'])

str

In [15]:
# find, find_all
# Look up a tag by name; the bare expression on the last line displays the
# result as the notebook cell's output.
a_tag = soup.find('a') # returns only the first match
a_tag

<a href="https://example.com1">Visit Example1</a>

In [21]:
# Collect every <a> tag, then print and gather each tag's href attribute.
a_tags = soup.find_all('a')
print(a_tags) # displayed as a list of tags
links = []
for a_tag in a_tags:
    href = a_tag['href']
    print(href)
    links.append(href)

print(links)

[<a href="https://example.com1">Visit Example1</a>, <a href="https://example.com2">Visit Example2</a>, <a href="https://example.com3">Visit Example3</a>, <a href="https://example.com4">Visit Example4</a>]
https://example.com1
https://example.com2
https://example.com3
https://example.com4
['https://example.com1', 'https://example.com2', 'https://example.com3', 'https://example.com4']


In [22]:
import csv

In [23]:
# Prepare the data: a header row followed by one record per person.
# NOTE(review): 'Los Angles' and 'Charile' look like typos of 'Los Angeles'
# and 'Charlie'; they are left unchanged because the recorded output of the
# read-back cell reflects these exact values.
data = [
    ['Name', 'Age', 'City'],
    ['Alice', 30, 'New York'],
    ['Bob', 25, 'Los Angles'],
    ['Charile', 35, 'Chicago'],
]

# Save to a CSV file. newline='' leaves line-ending handling to the csv
# module, which prevents blank lines between rows on Windows.
with open('data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)

print("CSV 파일이 저장되었습니다!")

CSV 파일이 저장되었습니다!


In [2]:
import pandas as pd

data = pd.read_csv('data.csv')
print(data)

      Name  Age        City
0    Alice   30    New York
1      Bob   25  Los Angles
2  Charile   35     Chicago


In [3]:
# Column-oriented records: each key is a column name, each list its values.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [30, 25, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}

df = pd.DataFrame(data=data)

# Export to an Excel workbook without writing the row index as a column.
df.to_excel('data.xlsx', index=False)

print("Excel 파일이 저장되었습니다.")

Excel 파일이 저장되었습니다.


In [4]:
data2 = pd.read_excel('data.xlsx')
print(data2)

      Name  Age         City
0    Alice   30     New York
1      Bob   25  Los Angeles
2  Charlie   35      Chicago


In [5]:
# Save data to a JSON file

import json

# Prepare the data: a single 'people' key holding one record per person.
data = {
    'people': [
        {'name': 'Alice', 'age': 30, 'city': 'New York'},
        {'name': 'Bob', 'age': 25, 'city': 'Los Angeles'},
        {'name': 'Charlie', 'age': 35, 'city': 'Chicago'},
    ],
}

# Write it to data.json, pretty-printed with a 4-space indent.
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(obj=data, fp=file, indent=4)
print("JSON 파일이 저장되었습니다!")

JSON 파일이 저장되었습니다!


In [6]:
# Read the JSON file back. Pass the same UTF-8 encoding the file was
# written with rather than relying on the platform default, so the round
# trip behaves the same on every operating system.
with open('data.json', mode='r', encoding='utf-8') as file:
    data = json.load(file)
print(data)

{'people': [{'name': 'Alice', 'age': 30, 'city': 'New York'}, {'name': 'Bob', 'age': 25, 'city': 'Los Angeles'}, {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}]}
