## 산점도

## 기본 사용

![image.png](attachment:image.png)

산점도 (Scatter plot)는 두 변수의 상관 관계를 직교 좌표계의 평면에 점으로 표현하는 그래프입니다.

matplotlib.pyplot 모듈의 scatter() 함수를 이용하면 산점도를 그릴 수 있습니다.

### 단일 매칭

In [1]:
import seaborn as sns
iris = sns.load_dataset('iris')
iris.shape

(150, 5)

In [2]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
iris.describe()

In [5]:
plt.scatter(x = iris['sepal_length'], y=iris['sepal_width'])
plt.xlabel('꽃받침 길이')
plt.ylabel('꽃받침 너비')
plt.title('붗꽃 꽃받침 길이/너비 별 분포도')
plt.show()

NameError: name 'plt' is not defined

In [None]:
plt.scatter(x = iris['petal_length'], y=iris['petal_width'])
plt.xlabel('꽃잎 길이')
plt.ylabel('꽃잎 너비')
plt.title('붗꽃 꽃잎 길이/너비 별 분포도')
plt.show()

---

### 누적값 관점에서의 분석

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager,rc

import random

font_location = "C:\Windows\Fonts\malgun.ttf"
font_name = font_manager.FontProperties(fname=font_location).get_name()
rc('font',family=font_name)
plt.rcParams['axes.unicode_minus'] = False

In [None]:
np.random.seed(0)

n = 50
x = np.random.rand(n)
y = np.random.rand(n)
x

In [None]:
y

In [None]:
plt.scatter(x, y)
plt.show()

## 색상과 크기 지정하기

![image.png](attachment:image.png)

In [None]:
np.random.seed(0)

n = 100
x = np.random.rand(n)
y = np.random.rand(n)
area = (50 * np.random.rand(n))**2
area

In [None]:
plt.scatter(x, y, s=area)
plt.show()

In [None]:
plt.scatter(x, y, s=area, c='yellow')
plt.show()

In [None]:
# 여러가지 색을 지정하려면 원소의 갯수와 일치해야 하기 떄문에 에러가 발생한다.
#plt.scatter(x, y, s=area, c=['yellow','green'])
#plt.show()

In [None]:
colors = np.random.rand(n)
colors

In [None]:
# 임의의 수를 할 당 하여 색상코드처럼 활용할 수 있다.
plt.scatter(x, y, s=area, c=colors)
plt.show()

### plot함수의 marker size 지정과 비교

In [None]:
## 
plt.plot([1], [1], 'o', markersize=20, c='#FF5733')
plt.scatter([2], [1], s=20**2, c='#33FFCE')

plt.text(0.5, 1.05, 'plot(markersize=20)', fontdict={'size': 14})
plt.text(1.6, 1.05, 'scatter(s=20**2)', fontdict={'size': 14})
plt.axis([0.4, 2.6, 0.8, 1.2])
plt.show()

In [None]:
np.random.seed(0)

n = 100
x = np.random.rand(n)
y = np.random.rand(n)
area = (50 * np.random.rand(n))**2

FIRST = 0
LAST = n
MEDIAN = n/2

plt.scatter(x, y, s=area, c='yellow')
plt.scatter(x[FIRST], y[FIRST], s=area[FIRST], c='red')
plt.text(x[FIRST], y[FIRST], '최초 관측된 데이터', fontdict={'size': 14})
# plt.show()

In [None]:
np.random.seed(0)

n = 100
x = np.random.rand(n)
y = np.random.rand(n)
area = (50 * np.random.rand(n))**2

FIRST = 0
LAST = n
MEDIAN = n/2
X_PADDING = 0.17
Y_PADDING = 0.07

plt.scatter(x, y, s=area, c='yellow')
plt.scatter(x[FIRST], y[FIRST], s=area[FIRST], c='red')
plt.text(x[FIRST]-X_PADDING, y[FIRST]+Y_PADDING, '최초 관측된 데이터', fontdict={'size': 14})
plt.show()

In [None]:
from matplotlib import font_manager,rc
font_location = "C:\Windows\Fonts\malgun.ttf"
font_name = font_manager.FontProperties(fname=font_location).get_name()
rc('font',family=font_name)
plt.rcParams['axes.unicode_minus'] = False

In [None]:
np.random.seed(0)

n = 100
x = np.random.rand(n)
y = np.random.rand(n)
area = (50 * np.random.rand(n))**2

FIRST = 0
LAST = n-1
MEDIAN = int(n/2)
X_PADDING = 0.17
Y_PADDING = 0.07

plt.scatter(x, y, s=area, c='yellow')
plt.scatter(x[FIRST], y[FIRST], s=area[FIRST], c='red')
plt.text(x[FIRST]-X_PADDING, y[FIRST]+Y_PADDING, '최초 관측된 데이터', fontdict={'size': 14})

plt.scatter(x[MEDIAN], y[MEDIAN], s=area[MEDIAN], c='green')
plt.text(x[MEDIAN]-X_PADDING-0.05, y[MEDIAN]+Y_PADDING, '1/2 지점 발견한 데이터', fontdict={'size': 14})

plt.scatter(x[LAST], y[LAST], s=area[LAST], c='blue')
plt.text(x[LAST]-X_PADDING+0.15, y[LAST]+Y_PADDING+0.05, '마지막 관측된 데이터', fontdict={'size': 14})
plt.show()

### 시나리오 설정

In [None]:
np.random.seed(0)

n = 100
x = np.random.rand(n)
y = np.random.rand(n)
area = (50 * np.random.rand(n))**2

FIRST = 0
LAST = n-1
MEDIAN = int(n/2)
X_PADDING = 0.09
Y_PADDING = 0.07

plt.scatter(x, y, s=area, c='yellow')
plt.scatter(x[FIRST], y[FIRST], s=area[FIRST], c='red')
plt.text(x[FIRST]-X_PADDING, y[FIRST]+Y_PADDING, '1호 점포', fontdict={'size': 14})

plt.scatter(x[MEDIAN], y[MEDIAN], s=area[MEDIAN], c='green')
plt.text(x[MEDIAN]-X_PADDING, y[MEDIAN]+Y_PADDING, '중간 점포 ', fontdict={'size': 14})

plt.scatter(x[LAST], y[LAST], s=area[LAST], c='blue')
plt.text(x[LAST]-X_PADDING+0.06, y[LAST]+Y_PADDING+0.06, '가장 최신 점포', fontdict={'size': 14})
plt.title('더조은동 지역별, 상점별 매출 현황')
plt.xlabel('x 좌표 (동경 126 기준)')
plt.ylabel('y 좌표 (북위 37 기준)')
plt.show()

## 3차원 산점도

In [None]:
n = 100
xmin, xmax, ymin, ymax, zmin, zmax = 0, 20, 0, 20, 0, 50
cmin, cmax = 0, 2

xs = np.array([(xmax - xmin) * np.random.random_sample() + xmin for i in range(n)])
ys = np.array([(ymax - ymin) * np.random.random_sample() + ymin for i in range(n)])
zs = np.array([(zmax - zmin) * np.random.random_sample() + zmin for i in range(n)])
color = np.array([(cmax - cmin) * np.random.random_sample() + cmin for i in range(n)])

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs, ys, zs, c=color, marker='o', s=15, cmap='Greens')

plt.show()

----