# Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Demo frame: two text columns plus a numeric column with one missing value
# (the NaN makes 'col-3' print as floats).
df = pd.DataFrame(
    {
        'col-1': ['Item-1', 'Item-2', 'Item-3', 'Item-4'],
        'col-2': ['Gold', 'Silver', 'Bronze', 'Gold'],
        'col-3': [1, 2, np.nan, 4],
    }
)

In [4]:
# Show the frame; the missing entry in 'col-3' is rendered as NaN.
print(df)

    col-1   col-2  col-3
0  Item-1    Gold    1.0
1  Item-2  Silver    2.0
2  Item-3  Bronze    NaN
3  Item-4    Gold    4.0


## Creating a DataFrame

In [5]:
# Column-oriented input: each key is a column label, each list holds that column's values.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}

In [6]:
# Build a DataFrame from the dict of lists: the dict keys become the column labels.
df = pd.DataFrame(data)

In [7]:
# Display the frame: one row per list position, labelled 0-3.
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [8]:
# The source dict is still a plain dict of lists after the DataFrame was built from it.
print(data)

{'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}


In [10]:
# Build the frame from a dict of Series: each Series supplies one column.
dict1 = {
    'Name': pd.Series(['Tom', 'Jack', 'Steve', 'Ricky', 'Vin', 'James', 'Smith']),
    'Age': pd.Series([28, 34, 29, 42, 32, 43, 45]),
    'Rating': pd.Series([4.23, 4.1, 3.4, 4.5, 5.2, 3.4, 3.2]),
}
df = pd.DataFrame(dict1)
print(df)

    Name  Age  Rating
0    Tom   28    4.23
1   Jack   34    4.10
2  Steve   29    3.40
3  Ricky   42    4.50
4    Vin   32    5.20
5  James   43    3.40
6  Smith   45    3.20


In [11]:
# Transpose: swap rows and columns, so each original row becomes a column
# and the column labels become the row labels.
print(df.T)

           0     1      2      3    4      5      6
Name     Tom  Jack  Steve  Ricky  Vin  James  Smith
Age       28    34     29     42   32     43     45
Rating  4.23   4.1    3.4    4.5  5.2    3.4    3.2


In [12]:
# axes: a list holding the row index first, then the column index.
print(df.axes)

[RangeIndex(start=0, stop=7, step=1), Index(['Name', 'Age', 'Rating'], dtype='object')]


In [13]:
# dtypes: the data type of each column.
print(df.dtypes)

Name       object
Age         int64
Rating    float64
dtype: object


In [14]:
# shape: (number of rows, number of columns).
print(df.shape)

(7, 3)


In [15]:
# values: the frame's contents as an array of rows, without the index or column labels.
print(df.values)

[['Tom' 28 4.23]
 ['Jack' 34 4.1]
 ['Steve' 29 3.4]
 ['Ricky' 42 4.5]
 ['Vin' 32 5.2]
 ['James' 43 3.4]
 ['Smith' 45 3.2]]


In [16]:
# head: the first rows of the frame (five when called with no argument).
print(df.head())

    Name  Age  Rating
0    Tom   28    4.23
1   Jack   34    4.10
2  Steve   29    3.40
3  Ricky   42    4.50
4    Vin   32    5.20


In [17]:
# head(n): only the first n rows (here n = 2).
print(df.head(2))

   Name  Age  Rating
0   Tom   28    4.23
1  Jack   34    4.10


In [18]:
# tail: the last rows of the frame (five when called with no argument).

print(df.tail())

    Name  Age  Rating
2  Steve   29     3.4
3  Ricky   42     4.5
4    Vin   32     5.2
5  James   43     3.4
6  Smith   45     3.2


In [19]:
# tail(n): only the last n rows (here n = 2).
print(df.tail(2))

    Name  Age  Rating
5  James   43     3.4
6  Smith   45     3.2


## Statistics

In [20]:
# Re-display the frame that the statistics in this section are computed on.
print(df)

    Name  Age  Rating
0    Tom   28    4.23
1   Jack   34    4.10
2  Steve   29    3.40
3  Ricky   42    4.50
4    Vin   32    5.20
5  James   43    3.40
6  Smith   45    3.20


In [21]:
# Column-wise sum. The string column 'Name' is concatenated rather than added,
# which is why the result Series has dtype object.
print(df.sum())

Name      TomJackSteveRickyVinJamesSmith
Age                                  253
Rating                             28.03
dtype: object


In [22]:
# Row-wise totals (axis=1) over the numeric columns only (Age + Rating).
# numeric_only=True is required: without it pandas >= 2.0 tries to add the
# string 'Name' value to the numbers in each row and raises TypeError
# (older versions silently dropped the string column).
print(df.sum(axis=1, numeric_only=True))

0    32.23
1    38.10
2    32.40
3    46.50
4    37.20
5    46.40
6    48.20
dtype: float64


In [23]:
# mean(): column-wise average of the numeric columns.
# numeric_only=True skips the string 'Name' column; without it pandas >= 2.0
# raises TypeError instead of silently dropping that column.
print(df.mean(numeric_only=True))

Age       36.142857
Rating     4.004286
dtype: float64


In [24]:
# std(): column-wise standard deviation of the numeric columns.
# numeric_only=True skips the string 'Name' column; without it pandas >= 2.0
# raises TypeError instead of silently dropping that column.
print(df.std(numeric_only=True))

Age       7.057586
Rating    0.720274
dtype: float64


In [25]:
# Show the frame again for reference before summarising it.
print(df)

    Name  Age  Rating
0    Tom   28    4.23
1   Jack   34    4.10
2  Steve   29    3.40
3  Ricky   42    4.50
4    Vin   32    5.20
5  James   43    3.40
6  Smith   45    3.20


In [26]:
# describe(): summary statistics (count, mean, std, min, quartiles, max).
# With no arguments only the numeric columns (Age, Rating) are summarised here.
print(df.describe())

             Age    Rating
count   7.000000  7.000000
mean   36.142857  4.004286
std     7.057586  0.720274
min    28.000000  3.200000
25%    30.500000  3.400000
50%    34.000000  4.100000
75%    42.500000  4.365000
max    45.000000  5.200000


In [27]:
# `include` chooses which columns are summarised: ['object'], ['number'] or 'all'.
# For the object (string) column the summary is count / unique / top / freq.
print(df.describe(include=['object']))

       Name
count     7
unique    7
top     Tom
freq      1


In [28]:
# Numeric columns only: on this frame the table matches the default describe() output.
print(df.describe(include=['number']))

             Age    Rating
count   7.000000  7.000000
mean   36.142857  4.004286
std     7.057586  0.720274
min    28.000000  3.200000
25%    30.500000  3.400000
50%    34.000000  4.100000
75%    42.500000  4.365000
max    45.000000  5.200000


In [30]:
# Every column in one table; statistics that do not apply to a column's type show as NaN.
print(df.describe(include='all'))

       Name        Age    Rating
count     7   7.000000  7.000000
unique    7        NaN       NaN
top     Tom        NaN       NaN
freq      1        NaN       NaN
mean    NaN  36.142857  4.004286
std     NaN   7.057586  0.720274
min     NaN  28.000000  3.200000
25%     NaN  30.500000  3.400000
50%     NaN  34.000000  4.100000
75%     NaN  42.500000  4.365000
max     NaN  45.000000  5.200000


## Series Data Structure

In [32]:
# Empty Series. The dtype is given explicitly: a bare pd.Series() emits a
# deprecation warning on pandas 1.x and defaults to dtype object on pandas >= 2.0,
# so it would not reproduce the float64 output shown below.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


In [33]:
# Series from a NumPy array: no index is passed, so the positions 0..3 are used as labels.
data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data=data)
print(s)

0    a
1    b
2    c
3    d
dtype: object


In [34]:
# Series from a dict: the keys become the index labels, the values the data.
data = {'a': 1.0, 'b': 2.0, 'c': 3.0}
s = pd.Series(data=data)
print(s)

a    1.0
b    2.0
c    3.0
dtype: float64


In [37]:
# Series with explicit string labels instead of the default integer positions.
s = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'e'],
)
print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [38]:
# Look up a single value by its index label.
print(s['a'])

1


In [39]:
# Retrieving multiple elements: indexing with a list of labels returns a
# Series containing just those entries.

print(s[['a','b','e']])

a    1
b    2
e    5
dtype: int64
