In [3]:
import pandas as pd
import numpy as np

In [16]:
# Show IPython's help for the pandas module (the trailing `?` is IPython syntax, not plain Python)
pd?

[1;31mType:[0m        module
[1;31mString form:[0m <module 'pandas' from 'C:\\Users\\yumin\\anaconda3\\lib\\site-packages\\pandas\\__init__.py'>
[1;31mFile:[0m        c:\users\yumin\anaconda3\lib\site-packages\pandas\__init__.py
[1;31mDocstring:[0m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point 

In [17]:
# Show the installed pandas version string (echoed as the cell's result)
pd.__version__

'1.2.4'

In [28]:
"""
Pandas Object: Series
"""
# Build a Series from a plain list of floats; no index is passed, so pandas
# supplies the default integer index starting at 0.
quarter_steps = [0.25, 0.5, 0.75, 1.0]
ser = pd.Series(data=quarter_steps)  # Series constructor

print(ser)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [33]:
# A Series exposes two main attributes: 'values' (the data) and 'index' (the labels)
arr, ind = ser.values, ser.index

print(arr)
# Show the index object itself, then the labels it holds as an array
print(ind, ind.values, sep='\n')

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)
[0 1 2 3]


In [37]:
# Label-based indexing: attach explicit string labels to the values
ser = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
print(ser)

# Look up individual entries by their label
for label in ('a', 'b'):
    print(ser[label])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25
0.5


In [39]:
"""
Dictionary and Series
"""

# A plain Python dictionary may mix key types and value types freely.
# It is bound to `mixed_dict` rather than `dict` so that the built-in `dict`
# type is not shadowed for the rest of the kernel session.
mixed_dict = {'a': 1, 2: 'two', 'third': True}
print(mixed_dict)

{'a': 1, 2: 'two', 'third': True}


In [48]:
"""
Create a series from a dictionary
"""

# State name -> population count
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

# The dictionary keys become the index labels of the Series
population = pd.Series(data=population_dict)
print(population)

# Label slices keep BOTH endpoints in the result
for first, last in (('California', 'Texas'), ('New York', 'Illinois')):
    print(population[first:last])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
California    38332521
Texas         26448193
dtype: int64
New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64


In [51]:
"""
Pandas object: DataFrame
"""

# State name -> area value, keyed by the same states as the population data
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

# Dictionary keys become the index labels of the Series
area = pd.Series(data=area_dict)
print(area)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


In [60]:
# Combine the 'population' and 'area' Series into one DataFrame: each
# dictionary key becomes a column name, and the Series share the state index.
states = pd.DataFrame(data={'population': population, 'area': area})
print(states)

# Row labels, column labels, then the 'area' column, each followed by a blank line
for part in (states.index, states.columns, states['area']):
    print(part, '\n')

print(states['population'])

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995
Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object') 

Index(['population', 'area'], dtype='object') 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


In [64]:
# Construct a DataFrame from a 2D NumPy array

# Use a seeded generator so the random values are identical on every run
rng = np.random.default_rng(42)
arr = rng.random((3, 2))  # 3 rows x 2 columns of uniform floats in [0, 1)
print(arr, '\n')

# Name the two columns and label the three rows explicitly
df = pd.DataFrame(arr, columns=['foo', 'bar'], index=['a', 'b', 'c'])
print(df)

[[0.1266924  0.51050913]
 [0.75512208 0.80762233]
 [0.95465841 0.60385114]] 

        foo       bar
a  0.126692  0.510509
b  0.755122  0.807622
c  0.954658  0.603851


In [97]:
"""
Series object manipulation: dictionary-style
"""

ser = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(ser, '\n')

# Positional access goes through .iloc: a bare ser[0] on a string-labelled
# Series relies on an integer fallback that newer pandas versions deprecate.
print(ser.iloc[0])  # integer position (implicit index)
print(ser['a'])  # label-based lookup (explicit index)

# `in` tests membership among the index labels, like dict keys
print('a' in ser)  # check if key exists
print(0.25 in ser)  # False: `in` does not search the values

print(ser.index)

0.25
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64 

0.25
0.25
True
False
Index(['a', 'b', 'c', 'd'], dtype='object')


In [80]:
"""
Series object manipulation: array-style
"""

# keys() returns the index labels
print(ser.keys())
print(ser, '\n')

# Item assignment by label: a label that is not yet present extends the
# Series, while an existing label has its value overwritten.
for label, value in (('e', 1.25), ('a', 0.125)):
    ser[label] = value

print(ser)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64 

a    0.125
b    0.500
c    0.750
d    1.000
e    1.250
dtype: float64


In [81]:
"""
Caution: Slicing Series object using explicit/implicit indexing
"""

'\nCaution: Slicing Series object using explicit/implicit indexing\n'

In [87]:
"""
DataFrame object manipulation
"""

# A column can be read dict-style or as an attribute; both forms are
# printed here for the 'area' column.
area_column = states['area']
print(area_column, '\n')
print(states.area)

# Add a derived column: element-wise population divided by area
states['density'] = states['population'].div(states['area'])
print(states)

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763


In [115]:
# Indexers: loc (label-based) and iloc (position-based)

print(states, '\n')

# loc slices by label and keeps the end label ('New York') in the result
print(states.loc['California':'New York'], '\n')

# iloc slices by position and leaves the stop position out: only row 0 remains
print(states.iloc[:1], '\n')

# A Series whose labels are the strings '1', '3', '5' (not integers)
ser = pd.Series(list('abc'), index=list('135'))
print(ser)

# Only the final bare expression is echoed as the cell's result; the three
# before it are evaluated and their values discarded.
ser.loc['1']       # single label
ser.loc['1':'3']   # label slice
ser.iloc[0]        # single position
ser.iloc[:2]       # positional slice

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763 

            population    area     density
California    38332521  423967   90.413926
Texas         26448193  695662   38.018740
New York      19651127  141297  139.076746 

            population    area    density
California    38332521  423967  90.413926 

1    a
3    b
5    c
dtype: object


1    a
3    b
dtype: object

In [125]:
# Data manipulation: DataFrame

print(states, '\n')

# Overwrite a single cell by position (row 1, column 1). Nothing in this
# cell recomputes any other column afterwards.
states.iloc[1, 1] = 777

print(states, '\n')

# Boolean row mask combined with a column list in a single loc call
is_dense = states['density'] > 100
print(states.loc[is_dense, ['population', 'density']], '\n')

            population    area     density
California    38332521  423967   90.413926
Texas         26448193     777   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763 

            population    area     density
California    38332521  423967   90.413926
Texas         26448193     777   38.018740
New York      19651127  141297  139.076746
Florida       19552860  170312  114.806121
Illinois      12882135  149995   85.883763 

          population     density
New York    19651127  139.076746
Florida     19552860  114.806121 

