# Pandas Basics

In [1]:
import numpy as np
import pandas as pd

# Series

In [2]:
#series from list
first_series = pd.Series(list('abcdef'))

In [3]:
print(first_series)

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


In [4]:
#series from ndarray
np_cities = np.array(['calicut','cochin','bangalore','chennai','mumbai','delhi'])
s_cities = pd.Series(np_cities) #pass ndarray as argument
print(s_cities)

0      calicut
1       cochin
2    bangalore
3      chennai
4       mumbai
5        delhi
dtype: object


In [5]:
#series from dict with user defined index
dict_country_gdp = pd.Series([2256.231,1236.234,8569.741,2369.587,1594.856,5987.698,2375.987,1298.365],index=['india','iceland','dubai','usa','uae','singapore','bhutan','nepal'])
print(dict_country_gdp)

india        2256.231
iceland      1236.234
dubai        8569.741
usa          2369.587
uae          1594.856
singapore    5987.698
bhutan       2375.987
nepal        1298.365
dtype: float64


In [6]:
#series from scalar
scalar_series = pd.Series(5.,index=['a','b','c','d','e'])
scalar_series

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

# Accessing series data

In [7]:
#direct accessing
dict_country_gdp[0]

2256.231

In [8]:
#Accessing first 5 countries from series
dict_country_gdp[0:5]

india      2256.231
iceland    1236.234
dubai      8569.741
usa        2369.587
uae        1594.856
dtype: float64

In [9]:
#look up a country by name or index
dict_country_gdp.loc['usa'] #using user defined

2369.587

In [10]:
#look up by the position
dict_country_gdp.iloc[0] #using predefined index

2256.231

# vectorized operations in series

In [11]:

first_vector_series = pd.Series([1,2,3,4],index=['a','b','c','d'])
second_vector_series = pd.Series([10,20,30,40],index=['a','b','c','d'])

In [12]:
first_vector_series + second_vector_series

a    11
b    22
c    33
d    44
dtype: int64

In [13]:
second_vector_series = pd.Series([10,20,40],index=['a','d','c'])

In [14]:
first_vector_series + second_vector_series

a    11.0
b     NaN
c    43.0
d    24.0
dtype: float64

In [15]:
second_vector_series = pd.Series([10,20,30,40],index=['a','b','d','f'])
sum_vectors = first_vector_series + second_vector_series
sum_vectors

a    11.0
b    22.0
c     NaN
d    34.0
f     NaN
dtype: float64

# DataFrame

In [16]:
#create dataframe from dict of equal length list
olympic_data_list = {'Hostcity':['London','Baijing','Athens','Sydney','Atlanta'],
                    'year':[2012,2008,2004,2000,1996],
                    'No. of participating countries':[205,204,201,200,197]
                    }

In [17]:
df_olympic_data = pd.DataFrame(olympic_data_list) #pass list to the dataframe
df_olympic_data

Unnamed: 0,Hostcity,year,No. of participating countries
0,London,2012,205
1,Baijing,2008,204
2,Athens,2004,201
3,Sydney,2000,200
4,Atlanta,1996,197


In [18]:
#create dataframe from dict of dict

olympic_data_dict = {'London':{2012:205},'Beijing':{2008:204}}
df_olympic_data_dict = pd.DataFrame(olympic_data_dict)
df_olympic_data_dict

Unnamed: 0,London,Beijing
2008,,204.0
2012,205.0,


In [19]:
df_olympic_data.Hostcity

0     London
1    Baijing
2     Athens
3     Sydney
4    Atlanta
Name: Hostcity, dtype: object

In [20]:
df_olympic_data.describe

<bound method NDFrame.describe of   Hostcity  year  No. of participating countries
0   London  2012                             205
1  Baijing  2008                             204
2   Athens  2004                             201
3   Sydney  2000                             200
4  Atlanta  1996                             197>

In [21]:
df_olympic_data.describe()

Unnamed: 0,year,No. of participating countries
count,5.0,5.0
mean,2004.0,201.4
std,6.324555,3.209361
min,1996.0,197.0
25%,2000.0,200.0
50%,2004.0,201.0
75%,2008.0,204.0
max,2012.0,205.0


In [22]:
#create df from dict of series

olympic_series_participation = pd.Series([241,200,201,235,240],index=[2012,2008,2004,2000,1996])
olympic_series_country = pd.Series(['London','Beijing','Athens','Sydney','Atlanta'],
                                  index=[2012,2008,2004,2000,1996])

In [23]:
df_olympic_series = pd.DataFrame({'Host Cities':olympic_series_country,
                                  'NO of Participating Countries':olympic_series_participation
                                 })

In [24]:
df_olympic_series

Unnamed: 0,Host Cities,NO of Participating Countries
2012,London,241
2008,Beijing,200
2004,Athens,201
2000,Sydney,235
1996,Atlanta,240


In [25]:
#dataframe from ndarray

np_array = np.array([2012,2008,2004,2006])
dict_ndarray = {'year':np_array}

In [26]:
df_ndarray = pd.DataFrame(dict_ndarray)

In [27]:
df_ndarray

Unnamed: 0,year
0,2012
1,2008
2,2004
3,2006


In [28]:
#dataframe from df

df_from_df = pd.DataFrame(df_olympic_series)
df_from_df

Unnamed: 0,Host Cities,NO of Participating Countries
2012,London,241
2008,Beijing,200
2004,Athens,201
2000,Sydney,235
1996,Atlanta,240


# Handling Null value 

In [29]:
sum_vectors

a    11.0
b    22.0
c     NaN
d    34.0
f     NaN
dtype: float64

In [30]:
#drop NoN
dropna_s = sum_vectors.dropna()

In [31]:
dropna_s

a    11.0
b    22.0
d    34.0
dtype: float64

In [32]:
#fillna fills the null values
fillna_s = sum_vectors.fillna(0)
fillna_s

a    11.0
b    22.0
c     0.0
d    34.0
f     0.0
dtype: float64

In [33]:
fillna_mean = sum_vectors.fillna(sum_vectors.mean())
fillna_mean

a    11.000000
b    22.000000
c    22.333333
d    34.000000
f    22.333333
dtype: float64

In [34]:
sum_vectors.median()

22.0

In [35]:
sum_vectors.max()

34.0

In [36]:
sum_vectors.describe()

count     3.000000
mean     22.333333
std      11.503623
min      11.000000
25%      16.500000
50%      22.000000
75%      28.000000
max      34.000000
dtype: float64

In [37]:
df1 = pd.DataFrame(np.random.randn(4,4),index=['a','b','c','d'],columns=['one','two','three','four'])
print(df1)


        one       two     three      four
a -2.190407 -1.065153  1.016171  0.529247
b -1.177577 -0.236251  1.076982  1.422578
c -0.211276  0.052162 -0.491279  0.593539
d  1.318818  0.612284  0.677481 -0.372788


In [38]:
df1 = df1.reindex(['a','b','c','d','e','f','g','h','i','j','k'])
df1

Unnamed: 0,one,two,three,four
a,-2.190407,-1.065153,1.016171,0.529247
b,-1.177577,-0.236251,1.076982,1.422578
c,-0.211276,0.052162,-0.491279,0.593539
d,1.318818,0.612284,0.677481,-0.372788
e,,,,
f,,,,
g,,,,
h,,,,
i,,,,
j,,,,


In [39]:
df1['two'].isnull() #checking if coloumn with a null value

a    False
b    False
c    False
d    False
e     True
f     True
g     True
h     True
i     True
j     True
k     True
Name: two, dtype: bool

In [40]:
df1.info() #understanding the non null values

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, a to k
Data columns (total 4 columns):
one      4 non-null float64
two      4 non-null float64
three    4 non-null float64
four     4 non-null float64
dtypes: float64(4)
memory usage: 760.0+ bytes


In [41]:
#multi indexing

X = pd.MultiIndex . from_tuples ([( 'A' , 'cat'),( 'B' , 'dog'),
                                 ('A' , 'cat'),( 'B' , 'dog')],
                                names = ['exp' , 'animal'])
Y = pd. MultiIndex . from_product ([('bar' , 'baz' , 'foo' , 'qux'),
                                   ('one' , 'two')],
                                  names= ['first' , 'second' ])

df = pd.DataFrame(np.random.randn(8,4),index=Y, columns=X)
df2 = df.iloc[[0,1,2,4,5,7]]
print(df)
df2

exp                  A         B         A         B
animal             cat       dog       cat       dog
first second                                        
bar   one    -1.272216  0.646307 -1.508163 -1.691205
      two    -0.635418 -0.249534 -1.202287  0.452899
baz   one     0.463832 -2.724349  1.270149  0.575801
      two     0.142565  2.287170  1.764573 -1.329702
foo   one     1.946546  1.057615  1.806929 -1.525793
      two     1.026815 -0.026301  1.534341 -0.301522
qux   one    -1.030522  0.928350 -0.790363  0.704139
      two    -1.449515 -0.041831 -0.366282  1.029307


Unnamed: 0_level_0,exp,A,B,A,B
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,-1.272216,0.646307,-1.508163,-1.691205
bar,two,-0.635418,-0.249534,-1.202287,0.452899
baz,one,0.463832,-2.724349,1.270149,0.575801
foo,one,1.946546,1.057615,1.806929,-1.525793
foo,two,1.026815,-0.026301,1.534341,-0.301522
qux,two,-1.449515,-0.041831,-0.366282,1.029307


In [42]:
#replacing dataframe values
df = pd.DataFrame({'one':[10,20,30,40,50,2000],
                  'two':[1000,0,30,40,50,60]})
print(df)
print()
print("The replaced data frame is...")
print(df.replace({1000:10,2000:60}))

    one   two
0    10  1000
1    20     0
2    30    30
3    40    40
4    50    50
5  2000    60

The replaced data frame is...
   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
