# Pandas

Pandas contains high level data structures and manipulation tools to make data analysis fast and easy in Python.

In [7]:
import pandas as pd #I am importing pandas as pd
from pandas import Series, DataFrame # Series and Data Frame are two data structures available in python
import numpy as np

Series
Series is a one-dimensional array like object containing an array of data(any Numpy data type, and an associated array of data labels, called its index.

In [8]:
#mjp = Series([5,4,3,2,1])# a simple series
mjp = np.array([5,4,3,2,1])
mjp = pd.Series(mjp)
print(mjp)        # A series is represented by index on the left and values on the right
print(mjp.values) # similar to dictionary. ".values" command returns values in a series 

0    5
1    4
2    3
3    2
4    1
dtype: int64
[5 4 3 2 1]


In [10]:
print(mjp.index) # returns the index values of the series

RangeIndex(start=0, stop=5, step=1)


In [11]:
jeeva = Series([5,4,3,2,1,-7,-29], index =['a','b','c','d','e','f','h']) # The index is specified
print(jeeva) # try jeeva.index and jeeva.values
print(jeeva['a']) # selecting a particular value from a Series, by using index

a     5
b     4
c     3
d     2
e     1
f    -7
h   -29
dtype: int64
5


In [12]:
jeeva['d'] = 9 # change the value of a particular element in series
print(jeeva)
jeeva[['a','b','c']] # select a group of values

a     5
b     4
c     3
d     9
e     1
f    -7
h   -29
dtype: int64


a    5
b    4
c    3
dtype: int64

In [13]:
print(jeeva[jeeva>0]) # returns only the positive values
print(jeeva *2) # multiplies 2 to each element of a series

a    5
b    4
c    3
d    9
e    1
dtype: int64
a    10
b     8
c     6
d    18
e     2
f   -14
h   -58
dtype: int64


In [14]:
import numpy as np
np.mean(jeeva) # you can apply numpy functions to a Series

-2.0

In [17]:
print('b' in jeeva) # checks whether the index is present in Series or not
print('z' in jeeva)

True
False


In [18]:
player_salary ={'Rooney': 50000, 'Messi': 75000, 'Ronaldo': 85000, 'Fabregas':40000, 'Van persie': 67000} 
new_player = Series(player_salary)# converting a dictionary to a series
print(new_player) # the series has keys of a dictionary

Rooney        50000
Messi         75000
Ronaldo       85000
Fabregas      40000
Van persie    67000
dtype: int64


In [19]:
players =['Klose', 'Messi', 'Ronaldo', 'Van persie', 'Ballack'] 
player_1 =Series(player_salary, index= players)
print(player_1) # I have changed the index of the Series. 
#Since, no value was not found for Klose and Ballack, it appears as NAN

Klose             NaN
Messi         75000.0
Ronaldo       85000.0
Van persie    67000.0
Ballack           NaN
dtype: float64


In [20]:
pd.isnull(player_1)#checks for Null values in player_1, pd denotes a pandas dataframe

Klose          True
Messi         False
Ronaldo       False
Van persie    False
Ballack        True
dtype: bool

In [21]:
player_1.name ='Bundesliga players' # name for the Series
player_1.index.name='Player names' #name of the index
player_1

Player names
Klose             NaN
Messi         75000.0
Ronaldo       85000.0
Van persie    67000.0
Ballack           NaN
Name: Bundesliga players, dtype: float64

In [22]:
player_1.index =['Neymar', 'Hulk', 'Pirlo', 'Buffon', 'Anderson'] # is used to alter the index of Series
player_1 

Neymar          NaN
Hulk        75000.0
Pirlo       85000.0
Buffon      67000.0
Anderson        NaN
Name: Bundesliga players, dtype: float64

# Data Frame
Data frame is a spread sheet like structure, containing ordered collection of columns. Each column can have different value type. Data frame has both row index and column index.

In [24]:
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = pd.DataFrame(states) # creating a data frame
india

Unnamed: 0,State,Population,Language
0,Gujarat,36,Gujarati
1,Tamil Nadu,44,Tamil
2,Andhra,67,Telugu
3,Karnataka,89,Kannada
4,Kerala,34,Malayalam


In [25]:
DataFrame(states, columns=['State', 'Language', 'Population']) # change the sequence of column index

Unnamed: 0,State,Language,Population
0,Gujarat,Gujarati,36
1,Tamil Nadu,Tamil,44
2,Andhra,Telugu,67
3,Karnataka,Kannada,89
4,Kerala,Malayalam,34


In [28]:
new_farme = DataFrame(states, columns=['State', 'Language', 'Population', 'Per Capita Income'], index =['a','b','c','d','e'])
#if you pass a column that isnt in states, it will appear with Na values
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,
c,Andhra,Telugu,67,
d,Karnataka,Kannada,89,
e,Kerala,Malayalam,34,


In [30]:
print(new_farme.columns)
print(new_farme['State']) # retrieveing data like dictionary

Index(['State', 'Language', 'Population', 'Per Capita Income'], dtype='object')
a       Gujarat
b    Tamil Nadu
c        Andhra
d     Karnataka
e        Kerala
Name: State, dtype: object


In [31]:
new_farme.Population # like Series

a    36
b    44
c    67
d    89
e    34
Name: Population, dtype: int64

In [34]:
new_farme.ix[3] # rows can be retrieved using .ix function
# here I have retrieved 3rd row

AttributeError: 'DataFrame' object has no attribute 'ix'

In [35]:
 new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,
c,Andhra,Telugu,67,
d,Karnataka,Kannada,89,
e,Kerala,Malayalam,34,


In [36]:
new_farme['Per Capita Income'] = 99 # the empty per capita income column can be assigned a value
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,99
b,Tamil Nadu,Tamil,44,99
c,Andhra,Telugu,67,99
d,Karnataka,Kannada,89,99
e,Kerala,Malayalam,34,99


In [37]:
new_farme['Per Capita Income'] = np.arange(5) # assigning a value to the last column
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,0
b,Tamil Nadu,Tamil,44,1
c,Andhra,Telugu,67,2
d,Karnataka,Kannada,89,3
e,Kerala,Malayalam,34,4


In [38]:
series = Series([44,33,22], index =['b','c','d'])
new_farme['Per Capita Income'] = series
#when assigning list or arrays to a column, the values lenght should match the length of the DataFrame
new_farme # again the missing values are displayed as NAN

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,44.0
c,Andhra,Telugu,67,33.0
d,Karnataka,Kannada,89,22.0
e,Kerala,Malayalam,34,


In [None]:
new_farme['Development'] = new_farme.State == 'Gujarat'# assigning a new column
print new_farme
del new_farme['Development'] # will delete the column 'Development'
new_farme

# Selection, Indexing and FilteringÂ¶


In [None]:
var = Series(['Python', 'Java', 'c', 'c++', 'Php'], index =[5,4,3,2,1])
var

In [None]:
print var[5]
print var[2:4]

In [None]:
var[[3,2,1]]

In [None]:
var[var == 'Php']

In [None]:
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = DataFrame(states, columns =['State', 'Population', 'Language'])
india

In [None]:
india[['Population', 'Language']] # retrieve data from data frame

In [None]:
india[india['Population'] > 50] # returns data for population greater than 50

In [None]:
india[:3] # first three rows

In [None]:
# for selecting specific rows and columns, you can use ix function
import pandas as pd
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = DataFrame(states, columns =['State', 'Population', 'Language'], index =['a', 'b', 'c', 'd', 'e'])
india

In [None]:
india.ix[['a','b'], ['State','Language']] # this is how you select subset of rows