In [None]:
import pandas as pd

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import platform

# Disable the Unicode minus glyph on axis ticks (matplotlib then falls back
# to a plain hyphen), which avoids missing-glyph boxes with fonts that lack it.
mpl.rc('axes', unicode_minus=False)

# Choose the default font per operating system; the plots below use Korean
# titles and labels.
os_name = platform.system()
if os_name == 'Windows':
    # Resolve the family name from the font file, then register it as default.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = mpl.font_manager.FontProperties(fname=path).get_name()
    mpl.rc('font', family=font_name)
elif os_name == 'Darwin':
    mpl.rc('font', family='AppleGothic')
else:
    # No font is configured for any other OS; only a notice is printed.
    print("Unknown System OS")

In [None]:
# 8. pandas visualization
# (3) Pie chart
# Load the resident-registration population / household CSV. The file is
# decoded as cp949 and parsed with the Python engine.
df_tpop = pd.read_csv("data/20200521_202004_주민등록인구및세대현황.csv",
                      encoding="cp949", engine="python")
# Second name bound to the same DataFrame object (no copy is made).
df_pro = df_tpop

In [None]:
# Dimensions of the frame as (rows, columns).
print(df_tpop.shape)

In [None]:
# Column labels of the frame.
print(df_tpop.columns)

In [None]:
# info() writes its summary to stdout and returns None, so print() also
# emits a trailing "None".
print(df_tpop.info())

In [None]:
# First rows of the frame.
print(df_tpop.head())

In [None]:
# Last rows of the frame.
print(df_tpop.tail())

In [None]:
# Pie chart of total population ('총인구수') labelled by administrative
# district ('행정구역'); each wedge shows its share with one decimal place.
pie_ax = df_tpop.plot(
    kind='pie',
    y='총인구수',
    labels=df_tpop['행정구역'],
    autopct='%1.1f%%',
    legend=False,
    figsize=(10, 10),
)
pie_ax.set_title('2020년 04월 행정구역별 주민등록인구 현황')
# Save before show(): the figure is closed afterwards.
pie_ax.figure.savefig('plots/2020_04_행정구역별_주민등록인구_원형.png')
plt.show()
plt.close()

In [None]:
# Pie chart of '총인구수' through the .plot.pie accessor (the accessor form of
# kind='pie'). This cell neither sets a title nor saves the figure.
df_tpop.plot.pie(y='총인구수', autopct='%1.1f%%',
                 labels=df_tpop['행정구역'], legend=False, figsize=(10, 10))

In [None]:
# (4) Bar chart

# Total population ('총인구수') per administrative district ('행정구역'),
# with tick labels rotated 45 degrees.
bar_ax = df_tpop.plot(
    kind='bar',
    x='행정구역',
    y='총인구수',
    rot=45,
    legend=False,
    figsize=(10, 10),
)
bar_ax.set_title('2020년 04월 행정구역별 주민등록인구 현황')
# Save before show(): the figure is closed afterwards.
bar_ax.figure.savefig('plots/2020_04_행정구역별_주민등록인구_막대.png')
plt.show()
plt.close()

In [None]:
# Bar chart of '총인구수' per '행정구역' through the .plot.bar accessor;
# not titled or saved.
df_tpop.plot.bar(x='행정구역', y='총인구수', rot=45, legend=False,
                 figsize=(10, 10))

In [None]:
# No y is given and subplots=True, so each plotted column gets its own bar
# subplot, laid out on a 3x2 grid.
df_tpop.plot(kind='bar', x='행정구역', subplots=True, layout=(3, 2),
             rot=45, figsize=(10, 8))

In [None]:
# Same subplot grid through the .plot.bar accessor.
df_tpop.plot.bar(subplots=True, x='행정구역', layout=(3, 2),
                 rot=45, figsize=(10, 8))

In [None]:
# Frequency bar chart
# Load the Seoul subway boarding/alighting counts (cp949-encoded CSV).
df_subway2 = pd.read_csv("data/20200423_202002_서울지하철승하차인원수.csv",
                         encoding='cp949', engine="python")
# Sum per subway line ('노선명'). numeric_only=True keeps the aggregation to
# numeric columns: since pandas 2.0, sum() no longer drops non-numeric
# columns silently, so any text column would be string-concatenated or raise.
# NOTE(review): assumes the CSV has text columns besides '노선명' — confirm.
df_subway_groupby = df_subway2.groupby('노선명').sum(numeric_only=True)
# Bar chart of total alighting passengers ('하차총승객수') per line.
df_subway_groupby['하차총승객수'].plot(kind='bar', rot=45, figsize=(12, 7))
plt.title('2020년 02월 지하철노선별 하차승객수')
# Save before show(): the figure is closed afterwards.
plt.savefig('plots/2020_02_지하철노선별_하차승객수_막대.png')
plt.show()
plt.close()

In [None]:
# Bar chart of '하차총승객수' per group through the .plot.bar accessor;
# not titled or saved.
df_subway_groupby['하차총승객수'].plot.bar(rot=45, figsize=(12, 7))

In [None]:
# (5) Histogram
# Load the greenhouse-gas CSV (decoded as cp949) and inspect its columns.
df_air = pd.read_csv("data/20200511_ghg.csv", encoding='cp949')
print(df_air.columns)
# info() prints its own summary and returns None, so "None" is printed too.
print(df_air.info())

In [None]:
# Histogram of the CO2 column using kind='hist'.
df_air['CO2_ppm'].plot(kind='hist')

In [None]:
# Same histogram through the .plot.hist accessor.
df_air['CO2_ppm'].plot.hist()

In [None]:
# Histogram of the CH4 column using kind='hist'.
df_air['CH4_ppm'].plot(kind='hist')

In [None]:
# Same histogram through the .plot.hist accessor.
df_air['CH4_ppm'].plot.hist()

In [None]:
# Whole-frame histogram: columns are overlaid on one axes, drawn
# semi-transparent (alpha=0.5) so overlapping bars stay visible.
df_air.plot(kind='hist', alpha=0.5)

In [None]:
# Same overlay through the .plot.hist accessor.
df_air.plot.hist(alpha=0.5)

In [None]:
# One histogram subplot per column (30 bins each) on a 4x2 grid.
df_air.plot(kind='hist', bins=30, subplots=True,
            layout=(4, 2), figsize=(10, 8))

In [None]:
# Same subplot grid through the .plot.hist accessor.
df_air.plot.hist(bins=30, subplots=True, layout=(4, 2), figsize=(10, 8))

In [None]:
# Histograms with a separate axes per subplot
var1 = ['CO2_ppm', 'CH4_ppm', 'N2O_ppm', 'CFC11_ppm', 'CFC12_ppm',
        'CFC113_ppm', 'SF6_ppm']

fig = plt.figure(figsize=(15, 10))
# Figure-level title. Calling plt.title() on a figure that has no axes yet
# would implicitly create a full-figure Axes that stays behind the grid
# (extra frame and tick labels), so the title is attached to the figure.
fig.suptitle('1999-2008년 온실가스')
# Subplot positions are 1-based; seven columns fill the first seven 3x3 slots.
for i, v in enumerate(var1, start=1):
    ax = fig.add_subplot(3, 3, i)
    ax.set_title(v)
    # Draw on the subplot explicitly rather than on the "current" axes.
    df_air[v].plot.hist(ax=ax, alpha=0.8)
    ax.legend(loc='upper right')
# Save before show(): the figure is closed afterwards.
fig.savefig('plots/1999-2008_온실가스.png')
plt.show()
plt.close()

In [None]:
# Draw seven individual histograms, calling plt.show() after each column.
var1 = ['CO2_ppm', 'CH4_ppm', 'N2O_ppm', 'CFC11_ppm', 'CFC12_ppm',
        'CFC113_ppm', 'SF6_ppm']
for v in var1:
    # The Axes returned by the plot call is the current axes, so the legend
    # and figure title attach to the histogram that was just drawn.
    hist_ax = df_air[v].plot.hist(alpha=0.8)
    hist_ax.legend(loc='upper right')
    hist_ax.figure.suptitle(v)
    plt.show()

plt.close()

In [None]:
# (6) Scatter plot, scatter matrix, heatmap
# 1) Scatter plot
# Load the Excel workbook of aging-index figures and inspect its columns.
df_ol = pd.read_excel('data/20180217_2017년서울시구별노령화지수.xlsx')
print(df_ol.columns)
# info() prints its own summary and returns None, so "None" is printed too.
print(df_ol.info())

In [None]:
# Youth dependency ratio ('유년부양비') vs. old-age dependency ratio ('노년부양비').
df_ol.plot(kind='scatter', x='유년부양비', y='노년부양비')

In [None]:
# Same plot through the .plot.scatter accessor.
df_ol.plot.scatter(x='유년부양비', y='노년부양비')

In [None]:
# Aging index ('노령화지수') vs. old-age dependency ratio.
df_ol.plot(kind='scatter', x='노령화지수', y='노년부양비')

In [None]:
# Same plot through the .plot.scatter accessor.
df_ol.plot.scatter(x='노령화지수', y='노년부양비')

In [None]:
# Aging index vs. youth dependency ratio.
df_ol.plot(kind='scatter', x='노령화지수', y='유년부양비')

In [None]:
# Same plot through the .plot.scatter accessor.
df_ol.plot.scatter(x='노령화지수', y='유년부양비')

In [None]:
# Correlation
# Keep the district label ('자치구') plus the three ratio/index columns.
df_ol_p = df_ol[['자치구', '유년부양비', '노년부양비', '노령화지수']]

# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns and raises on them instead.
# NOTE(review): assumes '자치구' holds text labels — confirm.
print(df_ol_p.select_dtypes(include='number').corr())  # Pearson

In [None]:
# Spearman rank correlation over the numeric columns only. Since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of skipping them.
print(df_ol_p.select_dtypes(include='number').corr(method="spearman"))  # spearman

In [None]:
# Correlation between two individual columns: first with the default method,
# then with Spearman.
print(df_ol_p["노년부양비"].corr(df_ol_p["노령화지수"]))
print(df_ol_p["노년부양비"].corr(df_ol_p["노령화지수"], method="spearman"))

In [None]:
# 2) Scatter matrix
from pandas.plotting import scatter_matrix

scatter_matrix(df_ol_p, figsize=(8, 8))

In [None]:
# diagonal='kde' draws density curves on the diagonal.
scatter_matrix(df_ol_p, diagonal='kde', figsize=(8, 8))

In [None]:
# Re-read the greenhouse-gas CSV into df_air and show its columns.
df_air = pd.read_csv("data/20200511_ghg.csv", encoding='cp949')
print(df_air.columns)

In [None]:
# info() prints its own summary and returns None, so "None" is printed too.
print(df_air.info())

In [None]:
# Scatter matrix over the greenhouse-gas frame.
scatter_matrix(df_air, figsize=(8, 8))

In [None]:
# SF6 vs. CFC113.
df_air.plot(kind='scatter', x='SF6_ppm', y='CFC113_ppm')

In [None]:
# CO2 vs. CH4.
df_air.plot(kind='scatter', x='CO2_ppm', y='CH4_ppm')

In [None]:
# Show correlation coefficients on the scatter matrix
import numpy as np
from pandas.plotting import scatter_matrix

axes = scatter_matrix(df_ol_p, alpha=0.5, diagonal='kde', figsize=(8, 8))
# Correlate numeric columns only, so the matrix lines up with the grid of
# axes (scatter_matrix plots numeric columns) and so pandas >= 2.0 does not
# raise on a non-numeric column.
corr = df_ol_p.select_dtypes(include='number').corr().to_numpy()

# Annotate each panel above the diagonal (k=1 skips the diagonal itself) with
# its coefficient. numpy is used directly instead of the incidental `plt.np`
# alias, which is not part of pyplot's public API.
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" % corr[i, j], (0.8, 0.8),
                        xycoords='axes fraction', ha='center', va='center')

plt.show()
plt.close()

In [None]:
# Scatter plot with a linear regression fit, using seaborn
import seaborn as sns
# CO2 vs. CH4 with the fitted regression line.
sns.regplot(x="CO2_ppm", y="CH4_ppm", data=df_air)

In [None]:
# SF6 vs. CFC113 with the fitted regression line.
sns.regplot(x="SF6_ppm", y="CFC113_ppm", data=df_air)

In [None]:
# Pair plot (scatter matrix) with linear regression, using seaborn
import seaborn as sns

pp = sns.pairplot(df_ol_p,  # df[cols]
                  # Filled KDE curves on the diagonal. `fill` replaces the
                  # `shade` keyword, which seaborn has deprecated.
                  diag_kws=dict(fill=True),
                  diag_kind="kde",  # use "kde" for diagonal plots
                  kind="reg")  # add a linear regression fit to each scatter panel

fig = pp.fig
# Leave room at the top for the figure title and widen the gaps between panels.
fig.subplots_adjust(top=0.93, wspace=0.3)
fig.suptitle('노령화지수 산포행렬', fontsize=14, fontweight='bold')
fig.savefig("plots/pairplot.png")

In [None]:
# 3) Heatmap (hexbin)
# NOTE(review): sharex=False is presumably the usual workaround for x-axis
# labels being hidden when pandas adds the colorbar — confirm.
df_air.plot(kind="hexbin", x="CO2_ppm", y="CH4_ppm", sharex=False)

In [None]:
# Same plot through the .plot.hexbin accessor.
df_air.plot.hexbin(x="CO2_ppm", y="CH4_ppm", sharex=False)

In [None]:
# Hexbin of youth vs. old-age dependency ratio.
df_ol_p.plot(kind="hexbin", x="유년부양비", y="노년부양비", sharex=False)

In [None]:
# Same plot through the .plot.hexbin accessor.
df_ol_p.plot.hexbin(x="유년부양비", y="노년부양비", sharex=False)

In [None]:
# Heatmap using seaborn
# Reference: https://jovianlin.io/data-visualization-seaborn-part-2/
import seaborn as sns

# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns instead of dropping them.
# NOTE(review): assumes '시간' may be non-numeric — confirm.
corr = df_air.select_dtypes(include='number').corr()
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

hm = sns.heatmap(corr,
                 ax=ax,            # draw on this Axes rather than the current one
                 cmap="coolwarm",  # colour map
                 annot=True,       # write the coefficient in each cell
                 fmt='.2f',        # annotation format: two decimals
                 linewidths=.05)

# Leave room at the top for the figure-level title.
fig.subplots_adjust(top=0.93)
fig.suptitle('Green House gas Heatmap',
             fontsize=14,
             fontweight='bold')

In [None]:
# (7) Line plot
print(df_air.columns)
# info() prints its own summary and returns None, so "None" is printed too.
print(df_air.info())

In [None]:
# CO2 over the time column ('시간').
df_air.plot(kind="line", x="시간", y="CO2_ppm")

In [None]:
# Same plot through the .plot.line accessor.
df_air.plot.line(x="시간", y="CO2_ppm")

In [None]:
# No y given: the remaining plottable columns are drawn against '시간'.
df_air.plot(kind="line", x="시간")

In [None]:
# Same plot through the .plot.line accessor.
df_air.plot.line(x="시간")

In [None]:
# Load the ozone CSV (decoded as cp949), inspect it, and plot the
# '평균오존전량(DU)' column against the '일시' column.
df_oz = pd.read_csv('data/ozone_data.csv', encoding="cp949")
print(df_oz.columns)
print(df_oz.info())
df_oz.plot.line(x="일시", y="평균오존전량(DU)")

In [None]:
# (8) Box plot
# Re-read the subway boarding/alighting CSV (decoded as cp949).
df_subway2 = pd.read_csv("data/20200423_202002_서울지하철승하차인원수.csv",
                         encoding='cp949', engine="python")
# Second name bound to the same DataFrame object (no copy is made).
df_sub = df_subway2
print(df_subway2.columns)
# info() prints its own summary and returns None, so "None" is printed too.
print(df_subway2.info())

In [None]:
# pandas box plot of total alighting passengers ('하차총승객수'), one box per
# subway line ('노선명').
df_subway2.boxplot(by="노선명", column=['하차총승객수'], figsize=(15, 8))

In [None]:
# Box plot using seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Explicit figure/axes so the figure size can be set; seaborn draws on `ax`.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(y='하차총승객수', x='노선명', data=df_subway2, ax=ax,
            width=0.5, palette="colorblind")

In [None]:
# Line names used for filtering in the next cell (lines 1-4).
cnd = ['1호선', '2호선', '3호선', '4호선']
# Demonstration of isin() with a literal list: rows of lines 1 and 2 only.
print(df_subway2[df_subway2['노선명'].isin(['1호선', '2호선'])])

In [None]:
# Keep only rows whose line name is listed in `cnd`.
df_subway2_p = df_subway2[df_subway2['노선명'].isin(cnd)]
print(df_subway2_p)

In [None]:
# seaborn box plot with the individual observations overlaid - stripplot
import numpy as np
# Seed numpy's global RNG (presumably so the strip-plot jitter is repeatable
# between runs — confirm against the seaborn version in use).
np.random.seed(202005)

fig, ax = plt.subplots(figsize=(15, 8))
# Both layers share the same data mapping and target axes.
shared_kws = dict(y='하차총승객수', x='노선명', data=df_subway2_p, ax=ax)
sns.boxplot(palette="colorblind", **shared_kws)
sns.stripplot(jitter=True, marker='o', alpha=0.5, color='black', **shared_kws)

In [None]:
# seaborn box plot with the individual observations overlaid - swarmplot
fig, ax = plt.subplots(figsize=(15, 8))
# Both layers share the same data mapping and target axes.
shared_kws = dict(y='하차총승객수', x='노선명', data=df_subway2_p, ax=ax)
sns.boxplot(palette="colorblind", **shared_kws)
sns.swarmplot(alpha=0.5, color='black', **shared_kws)