# Introduction to Pandas Data Structures

In [1]:
# Uncomment the next line to install pandas if it is not already available in this environment.
# !pip install pandas

### Series

* One-dimensional array whose values all share the same type; each column of a DataFrame is also a Series

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.Series(data=[2])

0    2
dtype: int64

In [4]:
pd.Series(data=[1, 2 , 3, 0.3, 4], index=['a', 'b', 'c', 'd', 'e'])

a    1.0
b    2.0
c    3.0
d    0.3
e    4.0
dtype: float64

In [5]:
# Series built from a plain list; no index is passed, so the default integer labels are used.
s1 = pd.Series([1, 2, 3, 5, 4])

In [6]:
s1

0    1
1    2
2    3
3    5
4    4
dtype: int64

In [7]:
s1.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
s1[0]

1

In [9]:
# Mixing ints, a float and the string '4' gives a Series with the generic object dtype.
s2 = pd.Series([1, 2, 3, 0.3, '4'], index=list('abcde'))
s2

a      1
b      2
c      3
d    0.3
e      4
dtype: object

In [10]:
s2['c']

3

#### Passing Dictionary

In [11]:
# Label -> value mapping; the keys become the index labels when passed to pd.Series.
d = dict(a=1, b=9, c=6)

In [12]:
s = pd.Series(d)
s

a    1
b    9
c    6
dtype: int64

#### Passing Numpy Array

In [13]:
s = pd.Series(np.array([15, 32, 88, 63, 44]))
s

0    15
1    32
2    88
3    63
4    44
dtype: int64

#### Size, shape, uniqueness, and counts of values

In [14]:
# Sample data with one repeated value (8) and one missing value (NaN).
s = pd.Series([1, 8, 56, 763, 8, 0, 2, float('nan')])
s

0      1.0
1      8.0
2     56.0
3    763.0
4      8.0
5      0.0
6      2.0
7      NaN
dtype: float64

In [15]:
len(s)

8

In [16]:
s.size

8

In [17]:
s.shape

(8,)

In [18]:
s.unique()

array([  1.,   8.,  56., 763.,   0.,   2.,  nan])

In [19]:
s.count() # number of non-NaN values

7

In [20]:
s.value_counts(dropna=False) # how many times each value occurs; dropna=False counts NaN as well

8.0      2
56.0     1
763.0    1
NaN      1
2.0      1
0.0      1
1.0      1
dtype: int64

#### Peeking at data with heads, tails, and take

In [21]:
s

0      1.0
1      8.0
2     56.0
3    763.0
4      8.0
5      0.0
6      2.0
7      NaN
dtype: float64

In [22]:
s.head()

0      1.0
1      8.0
2     56.0
3    763.0
4      8.0
dtype: float64

In [23]:
s.tail()

3    763.0
4      8.0
5      0.0
6      2.0
7      NaN
dtype: float64

#### .take() 

The `.take()` method returns the items of a Series at the given zero-based positions, so you can pick out only specific items.

In [24]:
s.take([1, 3, 5], axis=0)

1      8.0
3    763.0
5      0.0
dtype: float64

### Indexing: loc, iloc, etc.

In [25]:
s = pd.Series(np.arange(7))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
dtype: int64

In [26]:
s[0]

0

In [27]:
s[[2, 5]]

2    2
5    5
dtype: int64

In [28]:
s.loc[0]

0

In [29]:
s.loc[[2, 6]]

2    2
6    6
dtype: int64

In [30]:
# Positional lookup with iloc: on the default 0..6 index, position 0 and label 0 coincide,
# so this returns the same item as the label-based s.loc[0].
s.iloc[0]

0

In [31]:
# Positional selection of several items with iloc; on the default integer index
# positions 2 and 6 are also the labels 2 and 6.
s.iloc[[2, 6]]

2    2
6    6
dtype: int64

In [32]:
# Values 0..6 labelled with the letters 'a'..'g', so labels and positions now differ.
s = pd.Series(np.arange(7), index=list('abcdefg'))
s

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [33]:
s.loc['g']

6

In [34]:
s.loc[['d', 'e']]

d    3
e    4
dtype: int64

In [35]:
s.iloc[2]

2

In [36]:
s.iloc[[2, 5]]

c    2
f    5
dtype: int64

### Alignment via index labels

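Arithmetic between two Series first aligns their index labels; a label missing from either side produces NaN, because `NaN + number = NaN = number + NaN`.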

In [37]:
s6 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s6

a    1
b    2
c    3
d    4
dtype: int64

In [38]:
s7 = pd.Series([4, 3, 2, 1, 9], index=['d', 'c', 'b', 'a', 'e'])
s7

d    4
c    3
b    2
a    1
e    9
dtype: int64

In [39]:
s6 + s7

a    2.0
b    4.0
c    6.0
d    8.0
e    NaN
dtype: float64

In [40]:
s8 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 5})
s8

a    1
b    2
c    3
d    5
dtype: int64

In [41]:
s9 = pd.Series({'b': 6, 'c': 7, 'd': 9, 'e': 10})
s9

b     6
c     7
d     9
e    10
dtype: int64

In [42]:
s8 + s9

a     NaN
b     8.0
c    10.0
d    14.0
e     NaN
dtype: float64

In [43]:
s10 = pd.Series([1.0, 2.0, 3.0], index=['a', 'a', 'b'])
s10

a    1.0
a    2.0
b    3.0
dtype: float64

In [44]:
s11 = pd.Series([4.0, 5.0, 6.0], index=['a', 'a', 'c'])
s11

a    4.0
a    5.0
c    6.0
dtype: float64

In [45]:
s10+s11 # duplicate labels: each 'a' in s10 is paired with each 'a' in s11, giving 2 x 2 = 4 rows

a    5.0
a    6.0
a    6.0
a    7.0
b    NaN
c    NaN
dtype: float64

### The Special case of Not-A-Number (NaN)

In [46]:
# mean of numpy array values
nda = np.array([1, 2, 3, 4, 5])
nda.mean()

3.0

In [47]:
# mean of numpy array values with a NaN: ndarray.mean() does not skip NaN, so the result is NaN
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
nda = np.array([1, 2, 3, 4, np.nan])
nda.mean()

nan

In [48]:
# pandas ignores NaN values by default when computing the mean
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0)
nda = np.array([1, 2, 3, 4, np.nan])
s = pd.Series(nda)
s.mean()

2.5

In [49]:
s.mean(skipna=False)

nan

###  Boolean Selection

In [50]:
s = pd.Series(np.arange(0, 10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [51]:
s[s>5] # which row values are greater than 5

6    6
7    7
8    8
9    9
dtype: int64

In [52]:
s[(s>5) & (s<8)] # which row values are greater than 5 and less than 8

6    6
7    7
dtype: int64

In [53]:
s = pd.Series([True, False, False, True, True])
s

0     True
1    False
2    False
3     True
4     True
dtype: bool

In [54]:
s.all()

False

In [55]:
s = pd.Series([True, False, False, True, True])
s

0     True
1    False
2    False
3     True
4     True
dtype: bool

In [56]:
s.any()

True

In [57]:
arr = np.array([True,False,True,True])
arr

array([ True, False,  True,  True])

In [58]:
arr.sum()

3

In [59]:
s = pd.Series(np.arange(0, 10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [60]:
# are all items >= 0?
(s >= 0).all()

True

In [61]:
# any items < 0?
# Call .any() on the boolean mask itself; calling it on the filtered values would test
# whether a selected value is truthy, not whether any item satisfies the condition.
(s < 0).any()

False

In [62]:
# how many values < 2?
(s < 2).sum()

2

### Reindexing a Series

Reindexing in pandas is a process that makes the data in a Series or DataFrame
match a given set of labels. This is core to the functionality of pandas as it enables
label alignment across multiple objects, which may originally have different
indexing schemes.
This process of performing a reindex includes the following steps:
1. Reordering existing data to match a set of labels.
2. Inserting NaN markers where no data exists for a label.
3. Possibly, filling missing data for a label using some type of logic (defaulting
to adding NaN values).

In [63]:
# sample series of five items drawn from the standard normal distribution
# NOTE: no random seed is set before this cell in the notebook, so the values change on every run
s = pd.Series(np.random.randn(5))
s

0   -0.348301
1   -0.698444
2   -0.309558
3   -0.887962
4   -0.983089
dtype: float64

In [64]:
# change the index
s.index = ['a', 'b', 'c', 'd', 'e']

In [65]:
s

a   -0.348301
b   -0.698444
c   -0.309558
d   -0.887962
e   -0.983089
dtype: float64

In [66]:
# Seed NumPy's global generator so the random draws below are reproducible.
np.random.seed(123456)

# Two independent 3-item Series; each one gets its own default 0..2 index.
s1, s2 = (pd.Series(np.random.randn(3)) for _ in range(2))

# concat keeps the original labels, so 0, 1, 2 each appear twice in the result.
s = pd.concat([s1, s2])
s

0    0.469112
1   -0.282863
2   -1.509059
0   -1.135632
1    1.212112
2   -0.173215
dtype: float64

In [67]:
s.reset_index(drop=True)

0    0.469112
1   -0.282863
2   -1.509059
3   -1.135632
4    1.212112
5   -0.173215
dtype: float64

In [68]:
# Fixed seed for reproducible values; the four draws are labelled 'a'..'d'.
np.random.seed(123456)
s1 = pd.Series(np.random.randn(4), index=list('abcd'))
s1

a    0.469112
b   -0.282863
c   -1.509059
d   -1.135632
dtype: float64

In [69]:
# reindex with a different set of labels: labels that are not requested ('b', 'd') are dropped
# and requested labels with no data ('g') get NaN
s2 = s1.reindex(['a', 'c', 'g']) # reindex returns a new Series (assigned to s2 here); s1 is left as it was
s2

a    0.469112
c   -1.509059
g         NaN
dtype: float64

In [70]:
# The labels look alike but differ in type (ints vs. strings),
# so nothing aligns and every entry of the sum is NaN.
s1 = pd.Series(range(3), index=[0, 1, 2])
s2 = pd.Series(range(3, 6), index=list('012'))
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

#### Solution

In [71]:
s2.index.values.astype(int)

array([0, 1, 2])

In [72]:
# reindex by casting the label types and we will get the desired result

s2.index = s2.index.values.astype(int)
s1 + s2

0    3
1    5
2    7
dtype: int64

#### NaN vs fill_value 

In [73]:
# copy s and relabel the copy 'a'..'f' so it can be reindexed by label
# (the reindex that fills with 0 instead of NaN happens in the next cell)
s2 = s.copy()
s2.index = ['a', 'b', 'c', 'd', 'e', 'f']
s2

a    0.469112
b   -0.282863
c   -1.509059
d   -1.135632
e    1.212112
f   -0.173215
dtype: float64

In [74]:
s2_reindexed = s2.reindex(['a', 'f', 'g'], fill_value=0)
s2_reindexed

a    0.469112
f   -0.173215
g    0.000000
dtype: float64

#### ffill, bfill, & nearest

In [75]:
# Three colours stored at the sparse labels 0, 8 and 10; the gaps between
# them are what the fill methods below have to fill in.
s3 = pd.Series({0: 'red', 8: 'green', 10: 'blue'})
s3

0       red
8     green
10     blue
dtype: object

In [76]:
# forward fill example
s3.reindex(np.arange(0,15), method='ffill')

0       red
1       red
2       red
3       red
4       red
5       red
6       red
7       red
8     green
9     green
10     blue
11     blue
12     blue
13     blue
14     blue
dtype: object

In [77]:
# backwards fill example
s3.reindex(np.arange(0,15), method='bfill')

0       red
1     green
2     green
3     green
4     green
5     green
6     green
7     green
8     green
9      blue
10     blue
11      NaN
12      NaN
13      NaN
14      NaN
dtype: object

In [78]:
# nearest: use nearest valid observations to fill gap
s3.reindex(np.arange(0,11), method='nearest')

0       red
1       red
2       red
3       red
4     green
5     green
6     green
7     green
8     green
9      blue
10     blue
dtype: object

### Slicing a Series

In [79]:
# Values 100..109 stored under the labels 10..19, so a label never equals its position.
s = pd.Series(np.arange(10) + 100, index=np.arange(10) + 10)
s

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int64

In [80]:
# [start:stop:step] -- integer slices with [] are positional, so this selects
# positions 0, 2 and 4 (labels 10, 12 and 14), not the labels 0..5.
# The slice is left as the last expression so the notebook displays it directly.
s[0:6:2]

10    100
12    102
14    104
dtype: int64


In [81]:
# equivalent to
s.iloc[[0, 2, 4]]

10    100
12    102
14    104
dtype: int64

In [82]:
# first five by slicing, same as .head(5)
s[:5]

10    100
11    101
12    102
13    103
14    104
dtype: int64

In [83]:
s.iloc[:5]

10    100
11    101
12    102
13    103
14    104
dtype: int64

### Missing Data in Series

In [84]:
# Figures keyed by state name; the dict keys become the index labels of the Series.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [85]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [86]:
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [87]:
pd.isnull(obj4)  # obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [88]:
pd.notnull(obj4)  #obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [89]:
obj4.loc[obj4.isnull()] = 0
obj4

California        0.0
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [90]:
obj4.name = 'population'
obj4.index.name="state"
obj4

state
California        0.0
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

### The Pandas DataFrame Object

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string,
boolean, etc.). 

In [91]:
# Two rows of five values each; no labels are given, so rows and columns get default integer labels.
df = pd.DataFrame(np.array([np.arange(10, 15), np.arange(20, 25)]))
df

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,20,21,22,23,24


In [92]:
# Build a DataFrame from a list of Series: each Series becomes one row,
# and both rows and columns keep their default integer labels.
df1 = pd.DataFrame([pd.Series(np.arange(start, start + 5)) for start in (10, 15)])
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [93]:
# Two five-item Series (1..5 and 6..10) that will be combined
# into a DataFrame through a dictionary.
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))

In [94]:
df2= pd.DataFrame({'boys': s1, 'girls': s2})
df2

Unnamed: 0,boys,girls
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [95]:
# specify column names
df3 = pd.DataFrame(np.array([[10, 11], [20, 21]]), columns=['apples', 'oranges'])
df3

Unnamed: 0,apples,oranges
0,10,11
1,20,21


In [96]:
# create a DataFrame with named columns and rows

df4 = pd.DataFrame(np.array([[10, 11, 12, 13], [20, 21, 22, 23]]), 
                   index=['apples', 'oranges'], columns=['Mon', 'Tue','Wed', 'Thu'])
df4

Unnamed: 0,Mon,Tue,Wed,Thu
apples,10,11,12,13
oranges,20,21,22,23


In [97]:
# demonstrate alignment during creation
s3 = pd.Series(np.arange(12, 14), index=[1, 2])
df5 = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3})
df5

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,


In [98]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [99]:
pd.DataFrame(frame, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [100]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [101]:
type(frame.year)

pandas.core.series.Series

In [102]:
frame['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [103]:
# If you pass a column that isn’t contained in the dict(debt), it will appear with missing values
# in the result:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                              index=['one', 'two', 'three', 'four','five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [104]:
# Broadcast the scalar 100 to every row of the 'debt' column.
# Assign through [] rather than attribute access: frame2.debt = ... only targets a column
# that already exists; for a new name it would set a plain Python attribute instead.
frame2['debt'] = 100

In [105]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,100
two,2001,Ohio,1.7,100
three,2002,Ohio,3.6,100
four,2001,Nevada,2.4,100
five,2002,Nevada,2.9,100
six,2003,Nevada,3.2,100


In [106]:
# Replace the 'debt' column with the values 0..5 (the array length must match the number of rows).
# Use [] assignment: attribute-style assignment cannot create a column and is easy to misuse.
frame2['debt'] = np.arange(6)

In [107]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [108]:
# Values for only three of the row labels; the dict keys become the index labels.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [109]:
frame2['debt'] = val

In [110]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [111]:
frame2['GDP'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,GDP
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [112]:
frame2['moreThan2'] = frame2['pop'] > 2

In [113]:
frame2

Unnamed: 0,year,state,pop,debt,GDP,moreThan2
one,2000,Ohio,1.5,,True,False
two,2001,Ohio,1.7,-1.2,True,False
three,2002,Ohio,3.6,,True,True
four,2001,Nevada,2.4,-1.5,False,True
five,2002,Nevada,2.9,-1.7,False,True
six,2003,Nevada,3.2,,False,True


In [114]:
del frame2['GDP']
frame2

Unnamed: 0,year,state,pop,debt,moreThan2
one,2000,Ohio,1.5,,False
two,2001,Ohio,1.7,-1.2,False
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,True
five,2002,Nevada,2.9,-1.7,True
six,2003,Nevada,3.2,,True


### Nested dict
If a nested dict is passed to the DataFrame constructor, pandas interprets the outer dict keys as the columns and the inner keys as the row index labels.

In [115]:
# Nested dict of dicts: the outer keys name the columns, the inner keys name the rows.
# A row label missing from one inner dict (2000 for Nevada) shows up as NaN.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [116]:
df3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [117]:
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [118]:
pop1 = pd.DataFrame(pop, index=[2001, 2002, 2003])
pop1

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [119]:
pdata = {'Ohio': df3['Ohio'][:-1],
        'Nevada': df3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [120]:
# Name both axes: the row index is labelled 'year' and the column index 'state'.
df3.index.name = 'year'
df3.columns.name = 'state'

In [121]:
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5
