In [3]:
import pandas
# Check pandas version
pandas.__version__

import pandas as pd

pd? # display the documentation of pandas


We will start our code sessions with the standard NumPy and Pandas imports:


In [1]:
import numpy as np
import pandas as pd

### 01.The Pandas Series Object


In [10]:
x = pd.Series([12.3,4,5.5,67,34,3])
x

x.values # values method extrct the only values of the Series

x.index #The index is an array-like object of type pd.Index, which we’ll discuss in more detail momentarily:

RangeIndex(start=0, stop=6, step=1)

In [12]:
# We can also acess the element by index no.
x[0]

x[1:3]

1    4.0
2    5.5
dtype: float64

### 02. Series as generalized NumPy array


In [20]:
x1 = pd.Series([1.2,223.3,4,5,6],index=["a","b","c","d",'e'])
x1

x1["c"]# items can acess as can expected

#We can even use noncontiguous or nonsequential indices:

x2 = pd.Series([11,22,33,44,55,66],index=[7,2,9,4,5,6])
x2

7    11
2    22
9    33
4    44
5    55
6    66
dtype: int64

### 03. Series as specialized dictionary


In [23]:
population_dict = {"india":140987897,'UK':34234563,'US':98763453,'China':203453405}

population = pd.Series(population_dict)
population

population['india']
population['india':'US']

india    140987897
UK        34234563
US        98763453
dtype: int64

### 04. Constructing Series objects


In [24]:
#data can be a list or NumPy array, in which case index defaults to an integer sequence:
pd.Series([2,3,4,5])

0    2
1    3
2    4
3    5
dtype: int64

In [25]:
#data can be a scalar, which is repeated to fill the specified index:

pd.Series(5,index=[100,200,300,400,500,600])

100    5
200    5
300    5
400    5
500    5
600    5
dtype: int64

In [29]:
# data can be a dictionary, in which index defaults to the sorted dictionary keys:

pd.Series({2:'a',1:"c",3:"d"}).sort_index()

1    c
2    a
3    d
dtype: object

In [31]:
# In each case, the index can be explicitly set if a different result is preferred:
pd.Series({2:"a",1:"c",3:"d"},index=[3,1])


# Notice that in this case, the Series is populated only with the explicitly identified keys.

3    d
1    c
dtype: object

# The Pandas DataFrame Object


### 01.DataFrame as a generalized NumPy array


In [46]:
population_dict = {"Bihar":140987897,'J&K':34234563,'UK':98763453,'UP':203453405}
area_dict = {"Bihar":234234,"J&K":234234,"UK":87764,"UP":3443222}

population = pd.Series(population_dict)
area = pd.Series(area_dict)

state = pd.DataFrame({"Population":population,"Area":area}) 
print(state) # display the table of the DataFrame

state.index # display the index of the dataframe

state.columns # display the name of total columns


       Population     Area
Bihar   140987897   234234
J&K      34234563   234234
UK       98763453    87764
UP      203453405  3443222


Index(['Population', 'Area'], dtype='object')

### 02. DataFrame as specialized dictionary


In [48]:
state["Area"]

Bihar     234234
J&K       234234
UK         87764
UP       3443222
Name: Area, dtype: int64

### 03. Constructing DataFrame objects


From a single Series object:-

 A DataFrame is a collection of Series objects, and a singlecolumn DataFrame can be constructed 
from a single Series:

In [50]:
pd.DataFrame(population,columns=['population'])

Unnamed: 0,population
Bihar,140987897
J&K,34234563
UK,98763453
UP,203453405


From a list of dicts:- 
Any list of dictionaries can be made into a DataFrame. We’ll use a simple list comprehension to create some data:

In [52]:
data = [{"a":i,'b':i*2} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


Even if some keys in the dictionary are missing, Pandas will fill them in with NaN (i.e.,
“not a number”) values:

In [57]:
pd.DataFrame([{"a":2,"b":4,'d':1.3},{"b":3,"c":7,"d":13}])

Unnamed: 0,a,b,d,c
0,2.0,4,1.3,
1,,3,13.0,7.0


### 01. From a dictionary of Series objects

In [58]:
#As we saw before, a DataFrame can be constructed from a dictionary of Series objects as well:

pd.DataFrame({"Population":population,"Area":area})

Unnamed: 0,Population,Area
Bihar,140987897,234234
J&K,34234563,234234
UK,98763453,87764
UP,203453405,3443222


### 02. From a two-dimensional NumPy array

In [85]:
# Given a two-dimensional array of data, we can create a DataFrame with any specified column and index names. 
# If omitted, an integer index will be used for each:

pd.DataFrame(np.random.rand(4,2),columns=["column 01","column 02"],index = ["a","b","c","d"])

Unnamed: 0,column 01,column 02
a,0.956959,0.777306
b,0.072011,0.591102
c,0.729132,0.484323
d,0.732699,0.899321


### 03. From a NumPy structured array

In [92]:
a = np.zeros(3,dtype=[("A","i4"),("B","f8")])
a
pd.DataFrame(a)                 

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
5,0,0.0
6,0,0.0
7,0,0.0
8,0,0.0
9,0,0.0


# The Pandas Index Object


In [93]:
ind = pd.Index([2,4,6,7,1])
ind

Int64Index([2, 4, 6, 7, 1], dtype='int64')

### 01. Index as immutable array


In [100]:
ind[1]

ind[::2]

# Index objects also have many of the attributes familiar from NumPy arrays:

print("ind shape",ind.shape,"\n ind difference ",ind.difference,"\n ind data type",ind.dtype,"\n ind size",ind.size)

ind shape (5,) 
 ind difference  <bound method Index.difference of Int64Index([2, 4, 6, 7, 1], dtype='int64')> 
 ind data type int64 
 ind size 5


In [102]:
ind[0]=12 # Get error "Index does not support mutable operations"

TypeError: Index does not support mutable operations

### 02. Index as ordered set


In [108]:
indA = pd.Index([12,23,3,4,5,55])
indB = pd.Index([1,44,55,2,3,23])

print("Intersection    =",indA & indB) # intersection

print("Union   =",indA | indB) # Union

print("Symetric Diffrence  =",indA ^ indB) # Symetric diffrence

Intersection    = Int64Index([23, 3, 55], dtype='int64')
Union   = Int64Index([1, 2, 3, 4, 5, 12, 23, 44, 55], dtype='int64')
Symetric Diffrence  = Int64Index([1, 2, 4, 5, 12, 44], dtype='int64')


  print("Intersection    =",indA & indB) # intersection
  print("Union   =",indA | indB) # Union
  print("Symetric Diffrence  =",indA ^ indB) # Symetric diffrence


# Data Indexing and Selection


### 01. Series as dictionary


In [2]:
x = pd.Series([0.5,0.3,0.6,0.33,0.15],index=["a","b","c","d","e"])

x

a    0.50
b    0.30
c    0.60
d    0.33
e    0.15
dtype: float64

In [15]:
x['b'] # Acess  value of the key "b"

'a' in x # checking the key element in series x

x.keys() # extracting the keys in series

x.values # extracting the values in series

list(x.items()) # extract the elements as list

x["c"] = .25 # assigning new value for key 'c'

print(x)


a    0.50
b    0.30
c    0.25
d    0.33
e    0.15
dtype: float64


### 02. Series as one-dimensional array


A Series builds on this dictionary-like interface and provides array-style item selec‐tion via the same basic mechanisms 
as NumPy arrays—that is, slices, masking, and fancy indexing. Examples of these are as follows:

In [17]:
# slicing by explicit index

x['a':'c']

a    0.50
b    0.30
c    0.25
dtype: float64

In [18]:
# slicing by implicit integer index
x[0:3]

a    0.50
b    0.30
c    0.25
dtype: float64

In [20]:
# masking

x[(x<0.25) & (x<1.0)]

e    0.15
dtype: float64

In [21]:
# fancy indexing

x[['a','d']]

a    0.50
d    0.33
dtype: float64

### 03. Indexers: loc, iloc, and ix

In [25]:
data = pd.Series(['a','b','c','d','e'],index=[1,2,33,4,6])
data

data[1] # explicit index when indexing

data[2:4] # implicit index when slicing



33    c
4     d
dtype: object

First, the loc attribute allows indexing and slicing that always references the explicit
index:

In [32]:

data.loc[1]

data.loc[1:3]

KeyError: 3

The iloc attribute allows indexing and slicing that always references the implicit
Python-style index:

In [31]:
data.iloc[1:3]

2     b
33    c
dtype: object

# Data Selection in DataFrame

### 01. DataFrame as a dictionary


In [40]:
population_state = pd.Series({'Bihar':323432,'UP':654534,'Jharkhand':234534,'Delhi':8384823})

area_state = pd.Series({"Bihar":12342,'Jharkhand':23432,'Delhi':12345,'UP':2345})

state = pd.DataFrame({'Population':population_state,'Area':area_state})

print(state)

state["Area"] # print the area of the state

state.Area #Equivalently, we can use attribute-style access with column names that are strings:

state["Density"] = state["Population"] / state["Area"]

state

           Population   Area
Bihar          323432  12342
Delhi         8384823  12345
Jharkhand      234534  23432
UP             654534   2345


Unnamed: 0,Population,Area,Density
Bihar,323432,12342,26.205801
Delhi,8384823,12345,679.208019
Jharkhand,234534,23432,10.009133
UP,654534,2345,279.118977


### 02. DataFrame as two-dimensional array


In [41]:
state.values

array([[3.23432000e+05, 1.23420000e+04, 2.62058013e+01],
       [8.38482300e+06, 1.23450000e+04, 6.79208019e+02],
       [2.34534000e+05, 2.34320000e+04, 1.00091328e+01],
       [6.54534000e+05, 2.34500000e+03, 2.79118977e+02]])

In [42]:
state.T  # swap row into column

Unnamed: 0,Bihar,Delhi,Jharkhand,UP
Population,323432.0,8384823.0,234534.0,654534.0
Area,12342.0,12345.0,23432.0,2345.0
Density,26.205801,679.208,10.009133,279.118977


state.values[0]     # extract the 1st row


state.iloc[0:3,0:3] # extract the row with respect to column(called slicing)

###  03. Additional indexing conventions


In [53]:
state['Bihar':"Delhi"] # print the slic of the index

state[0:2] # here integer slic index is used for print the table

state[state.Density>100] # Similarly, direct masking operations are also interpreted row-wise rather than column-wise:

Unnamed: 0,Population,Area,Density
Delhi,8384823,12345,679.208019
UP,654534,2345,279.118977


### 04. Operating on Data in Pandas
