<a href="https://colab.research.google.com/github/pykido/DataScience_Introduction/blob/main/EDA_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# T1. Pie Chart
Visualizing categorical data with the [pie()](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.pie.html) method

## Installing Korean fonts
구글 코랩을 통해 데이터 시각화를 할 때, 한글 폰트 사용을 위해서는 먼저 폰트 설치가 필요합니다.
아래 코드를 실행한 뒤에는 "런타임 다시 시작"을 해주셔야 합니다.

In [None]:
# Install the Nanum Korean font package on the Colab machine.
!sudo apt-get install -y fonts-nanum
# Rebuild the fontconfig cache (-f: force, -v: verbose).
!sudo fc-cache -fv
# Delete matplotlib's cache directory; presumably so that matplotlib
# re-scans the installed fonts after the runtime is restarted.
!rm ~/.cache/matplotlib -rf

## Basic Pie Chart

labels: company names

x: market share for each company

In [None]:
# Draw a basic pie chart with one labelled wedge per company.
import matplotlib.pyplot as plt  # basic visualization module
import numpy as np

labels = ['A', 'B', 'C', 'D', 'E']  # company names, one per wedge
x = np.array([50, 20, 15, 10, 5])  # market share of each company
plt.pie(x, labels=labels)
plt.show()   

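The fractional area of each wedge is given by x/sum(x). Recent Matplotlib versions always normalize the values this way by default (`normalize=True`), even when sum(x) < 1. To use values with sum(x) < 1 directly as fractional areas (leaving part of the circle empty), pass `normalize=False`.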

In [None]:
# Double every value element-wise; the ratios between the values are unchanged.
x2 = x * 2
print(x2)

In [None]:
# Plot the doubled values: wedge sizes depend on x2/sum(x2), so scaling all
# values by the same factor should give the same chart as before.
plt.pie(x2, labels=labels)
plt.show()                                    

In [None]:
# Values summing to less than 1 (0.9 here) are meant to be used directly as
# wedge fractions, leaving the remaining 10% of the circle empty.
x3 = np.array([0.4, 0.2, 0.1, 0.1, 0.1])
# pie() normalizes its input by default, which would stretch these values to
# a full circle; normalize=False keeps them as absolute fractions.
plt.pie(x3, labels=labels, normalize=False)
plt.show()

## Adding more information


### adding ratio

In [None]:
# autopct prints each wedge's percentage with one decimal place ('%%' is a
# literal percent sign); shadow and startangle only change the appearance.
plt.pie(x, labels=labels, shadow=True, startangle=90, autopct='%0.1f%%')
plt.show()  

### adding korean title

In [None]:
# Switch the default font to a Nanum font so Korean text can be used in the chart.
plt.rc('font', family='NanumBarunGothic')

# Labelled pie with percentage annotations and a Korean title.
plt.pie(
    x,
    labels=labels,
    shadow=True,
    startangle=90,
    autopct='%0.1f %%',
)
plt.title('시장점유율')  # "market share"
plt.show()


### adding legend

In [None]:
# Same chart, but the company names go into a legend instead of wedge labels.
plt.rc('font', family='NanumBarunGothic') # to use Korean in the chart
plt.pie(x, shadow=True, startangle=90, autopct='%0.1f %%')
plt.title('시장점유율')
plt.legend(labels)  # legend entries follow the wedge order
plt.show()  

## Saving figures

In [None]:
# Build the legend version of the chart and write it to an image file.
plt.rc('font', family='NanumBarunGothic') # to use Korean in the chart
plt.pie(x, shadow=True, startangle=90, autopct='%0.1f %%')
plt.title('시장점유율')
plt.legend(labels)
# Save the current figure; the path is relative to the working directory.
plt.savefig("pie_chart.jpeg")

## Read csv with Pandas

In [None]:
# Read a CSV file straight from a URL into a DataFrame (needs network access).
import pandas as pd
url = 'https://raw.githubusercontent.com/mlee-pnu/IDS/main/data1.csv'
myDF = pd.read_csv(url)
myDF.head()  # preview the first five rows

In [None]:
# Frequency table of the "group" column; the single count column is named
# 'participants'.
table = pd.crosstab(
    index=myDF["group"].values,
    columns='participants',
    colnames=["group"],
)
# Relabel the rows -- assumes the data holds exactly three groups (TODO confirm).
table.index = ["A", "B", "C"]
print(table)


In [None]:
# Pie chart of the 'participants' counts via the pandas plotting accessor.
table.plot.pie(y='participants', autopct='%0.1f%%')

In [None]:
# The same counts drawn as a bar chart, one bar per row of the table.
table.plot.bar(y='participants')


# T2. Bar chart

## Basic Bar Chart

In [None]:
# Basic bar chart: bars are placed at numeric positions and then relabelled.
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(3) # bar positions 0, 1, 2 (rebinds the earlier pie-chart `x`)
years = ['2017', '2018', '2019']
values = [100, 400, 900]

plt.bar(x, values)
plt.xticks(x, years) # display the years at the bar positions on the x axis
plt.show()

In [None]:
# Shorter form: pass the string labels directly as the bar positions.
plt.bar(years,values)
plt.show()

In [None]:
# Put the same data into a DataFrame (this rebinds `myDF` from the CSV section).
import pandas as pd
myDF = pd.DataFrame({'Year':years,
                     'Score':values})
myDF.head()

In [None]:
# Bar chart from the DataFrame; rot=0 keeps the x tick labels horizontal.
myDF.plot.bar(x='Year', y='Score', rot=0)

## Adding more information

### Rotating labels

In [None]:
# One bar per entry of xlabels, showing the value plotted as 'days in a year'.
xlabels = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto']
days_in_year = [88, 225, 365, 687, 4333, 10756, 30687, 60190, 90553]
xticks = range(len(days_in_year))

plt.bar(xticks, days_in_year)
# Tilt the tick labels by 30 degrees.
plt.xticks(xticks, xlabels, rotation=30)
plt.ylabel('days in a year')
plt.show()


### Horizontal bar chart

In [None]:
# Horizontal bars: the categories go on the y axis, the values along the x axis.
x = np.array(["A", "B", "C", "D"])  # category labels (rebinds `x`)
y = np.array([3, 8, 1, 10])  # bar lengths

plt.barh(x, y)
plt.show()

### Adjusting bar width

In [None]:
# Same chart as in the rotated-labels example, drawn with narrower bars.
xlabels = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune', 'Pluto']
days_in_year = [88, 225, 365, 687, 4333, 10756, 30687, 60190, 90553]
xticks = range(len(days_in_year))

# width is the bar width in x-axis units.
plt.bar(xticks, days_in_year, width=0.4)
plt.xticks(xticks, xlabels, rotation=30)
plt.ylabel('days in a year')
plt.show()

### Adding error bars

In [None]:
# Bars with vertical error bars; yerr[i] is the error drawn for values[i].
values = [10, 13, 11, 15, 20]  # rebinds the earlier `values`
yerr = [1, 3, 0.5, 2, 4] # error bar size for each bar
# capsize sets the length of the caps at the ends of the error bars.
plt.bar(range(len(values)), values, yerr=yerr, capsize=10)
plt.show()

## Stacked Bar Chart

In [None]:
import pandas as pd

# Quarterly figures for four series (A-D), built in one step from a dict of
# columns; dict order sets the column order.
df = pd.DataFrame({
    'Quarter': ['1Q', '2Q', '3Q', '4Q'],
    'A': [300, 100, 300, 250],
    'B': [400, 200, 400, 100],
    'C': [250, 500, 200, 300],
    'D': [100, 200, 300, 400],
})
df.head()


In [None]:
# Grouped bars: one group per quarter, one bar per remaining column.
ax = df.plot.bar(x = 'Quarter')

In [None]:
# Stack the columns on top of each other within each quarter; rot=0 keeps the
# quarter labels horizontal.
ax = df.plot.bar(x='Quarter', stacked=True, rot=0)

# T3. Histogram

## Basic Histogram

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Draw 250 samples from a normal distribution with mean 170 and standard
# deviation 10. No seed is set, so the values differ on every run.
x = np.random.normal(170, 10, 250)
# Summary statistics of the sample, shown as the cell output.
x.mean(), x.std(), x.min(), x.max()

In [None]:
# Peek at ten of the samples (indices 10 through 19).
x[10:20]

In [None]:
# Histogram of the sample with matplotlib's default binning.
plt.hist(x)
plt.show() 

## Adjusting bin size

In [None]:
# Use 5 equal-width bins. hist() returns the per-bin counts, the bin edges,
# and the container of drawn bar patches.
n, edges, patch  = plt.hist(x, bins = 5)
plt.show()

In [None]:
print(edges)  # bin boundaries (one more than the number of bins)
print(np.diff(edges))  # width of each bin
print(patch[1])  # the patch object drawn for the second bin

In [None]:
# Explicit bin edges every 10 units from 130 to 210 inclusive (eight bins).
bin_edges = list(range(130, 211, 10))
plt.hist(x, bins=bin_edges)
plt.show()

## Adjusting range

In [None]:
# Restrict the histogram to values between 160 and 180, split into 10 bins;
# samples outside that range are ignored.
plt.hist(x, range=(160,180),bins=10)
plt.show()

## Two histograms in one figure

In [None]:
# Two samples of different size: 600 draws vs. 100 draws.
a = np.random.normal(170, 10, 600)
b = np.random.normal(160, 15, 100)
# alpha=0.5 makes the bars semi-transparent so the overlap stays visible.
plt.hist(a, alpha=0.5)
# density=False (the default) plots raw counts, so the larger sample has taller bars.
plt.hist(b, alpha=0.5, density = False)
plt.show()

In [None]:
# density=True scales each histogram to unit area, so the two samples can be
# compared despite their different sizes.
plt.hist(a, alpha=0.5, density = True)
plt.hist(b, alpha=0.5, density = True)
plt.show()

# T4. Box plot and Scatter plot

## Loading dataset from seaborn

[Seaborn](https://seaborn.pydata.org/) 에 있는 데이터셋을 사용해 봅시다

In [None]:
import matplotlib.pyplot as plt  # basic visualization module
import seaborn as sns 
import numpy as np
import pandas as pd

# Load seaborn's example "iris" dataset.
iris = sns.load_dataset('iris')
iris.shape  # (number of rows, number of columns)

In [None]:
# Preview the first five rows of the dataset.
iris.head()

종 별로 평균 및 표준편차 확인하기.

In [None]:
# Per-species mean of every measurement column. This section is about the
# mean and standard deviation by species; the standard deviation is computed
# separately with .std().
iris.groupby('species').mean()


In [None]:
# Per-species standard deviation of every measurement column.
iris.groupby('species').std()

## Box plot by matplotlib

In [None]:
# Split the data into one subset per species, in the order the boxes are drawn.
species_names = ['setosa', 'versicolor', 'virginica']
c1, c2, c3 = (iris[iris['species'] == name] for name in species_names)

# One box per species for petal length; boxes sit at x positions 1, 2, 3.
petal_lengths = (c1['petal_length'], c2['petal_length'], c3['petal_length'])
plt.boxplot(petal_lengths)
plt.xticks([1, 2, 3], species_names)
plt.show()



## Box plot by Pandas

In [None]:
# pandas does the grouping itself: one petal_length box per species value.
boxplot = iris.boxplot(column = 'petal_length', by = 'species')

## Box plot by Seaborn

In [None]:
# Seaborn version: species on the x axis, petal_length distribution on the y axis.
boxplot = sns.boxplot(data=iris, x='species', y='petal_length')

## Scatter plot by matplotlib

In [None]:
# Scatter plot built with plt.plot: with data=iris, the two strings are
# looked up as column names, and the connecting line is turned off.
plt.plot('petal_length',  # x
         'petal_width',  # y
         data=iris, 
         linestyle='none',  # markers only, no line between points
         marker='o', 
         markersize=5,
         color='blue', 
         alpha=0.5)  # semi-transparent so overlapping points stay visible
plt.title('Scatter Plot of iris by matplotlib', fontsize=20)
plt.xlabel('Petal Length', fontsize=14)
plt.ylabel('Petal Width', fontsize=14)
plt.show()

In [None]:
# The same plot using plt.scatter with the columns passed directly.
plt.scatter(iris['petal_length'],iris['petal_width'])
plt.title('Scatter Plot of iris by matplotlib', fontsize=20)
plt.xlabel('Petal Length', fontsize=14)
plt.ylabel('Petal Width', fontsize=14)
plt.show()

## Scatter plot by Seaborn

In [None]:
# Seaborn scatter plot: species controls both the color (hue) and the marker
# shape (style) of each point.
sns.scatterplot(x='petal_length',
                y='petal_width',
                hue='species',
                style='species',
                data=iris)
plt.show()

# T5. Time series

Plotting time series data with the Pandas [plot()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html) method.



In [None]:
# Load the CO2 example dataset from statsmodels as a pandas object and plot
# it. `plt` is not imported here, so an earlier cell must have imported it.
import statsmodels.api as sm
dta = sm.datasets.co2.load_pandas().data
dta.plot()  # pandas plot; the data's index is used for the x axis
plt.title("CO2 Levels")
plt.ylabel("Parts per million")
plt.show()