# Getting Started With Pandas

# keywords: 

reindexing, overwriting the indices using lists, accessing and modifying series using indices, values and indices of a pd series, missing data, NaN, isnull, notnull, naming of a series object, naming of series indices, in-place reindexing using lists, dataframes and various ways of creating dataframes

In [309]:
%reset -f
import pandas as pd

# Pandas Series

In [310]:
# A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type)
# and an associated array of data labels, called its index.
# The simplest Series is formed from only an array of data:
obj = pd.Series([4, 7, -5, 3])

In [311]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [312]:
# get the indices
obj.index

RangeIndex(start=0, stop=4, step=1)

In [313]:
# get the values
obj.values

array([ 4,  7, -5,  3])

In [314]:
# We can specify the indices using the "index" keyword
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [315]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [316]:
# getting the indices of the series
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [317]:
# We can get the values using the indices (like in NumPy)
obj2['a']

-5

In [318]:
obj2['d']=99

In [319]:
# Indexing can also be done using a list of indices
obj2[['a','b']]

a   -5
b    7
dtype: int64

In [320]:
obj2

d    99
b     7
a    -5
c     3
dtype: int64

In [321]:
# Filtering
obj2>3

d     True
b     True
a    False
c    False
dtype: bool

In [322]:
obj2[obj2>3]

d    99
b     7
dtype: int64

In [323]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.
# It can be substituted into many functions that expect a dict:

'b' in obj2

True

In [324]:
'e' in obj2

False

In [325]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [326]:
obj3 = pd.Series(sdata)

In [327]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [328]:
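# The dictionary keys of the Series object can be overridden by a list of indices:
# the Series then follows the order of that list, labels missing from the dictionary
# (here 'California') get NaN, and dictionary keys not in the list (here 'Utah') are dropped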

states = ['California', 'Ohio', 'Oregon', 'Texas']

In [329]:
obj4 = pd.Series(sdata, index=states)

In [330]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [331]:
# NaN ("Not a Number") is the marker pandas uses for missing values
# isnull() and notnull() can be used to detect null and not null in a series or dataframe

obj4.isnull() # or pd.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [332]:
obj4.notnull() # or pd.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [333]:
# Series automatically aligns the differently indexed data in arithmetic operations

obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [334]:
# naming of a series object
obj4.name = 'population'

# naming the index of the series
obj4.index.name = 'states'

obj4

states
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [335]:
# In-place relabeling: assigning a list to .index replaces the labels and leaves the values untouched

obj.index=[10,20,30,40]

obj


10    4
20    7
30   -5
40    3
dtype: int64

# Pandas DataFrames

In [336]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

# The resulting DataFrame keeps the columns in the dictionary's insertion order
# (pandas versions before 0.23 sorted the column names instead)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [337]:
# The columns can be ordered differently using a list as follows

frame = pd.DataFrame(data, columns = ['year','state','pop'])

frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [338]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], 
                   index=['one', 'two', 'three', 'four', 'five'])

In [339]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [341]:
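# Retrieval can be done using square brackets (dictionary-like) or the dot operator.
# Dot access only works when the column name is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method: frame2.pop, for instance, is the
# DataFrame.pop method, not the 'pop' column, so use frame2['pop'] there. For example: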

frame2.year

# Or
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

# ============================================================