In [1]:
import numpy as np
import pandas as pd

In [2]:
# --------------------------------------------------Series---------------------------------------------------------

# Raw values and the labels that the following cells pair with them.
marks, subject = [98, 87, 99], ['p', 'c', 'm']

# No index is supplied, so the Series gets the default 0..n-1 integer labels.
s = pd.Series(data=marks)
print(s)

0    98
1    87
2    99
dtype: int64


In [3]:
# The constructor's positional order is (data, index); spelled out as keywords here.
s = pd.Series(data=marks, index=subject)
print(s)

p    98
c    87
m    99
dtype: int64


In [4]:
# Building a Series from a dictionary: keys become the index, values the data.
dic = dict(p=98, c=87, m=99)
s = pd.Series(data=dic)
print(s)

p    98
c    87
m    99
dtype: int64


In [5]:
# A Series is not limited to numbers: any Python object (here, three built-in
# functions) can be stored, in which case the dtype is `object`.
s = pd.Series(data=[sum, max, min])
print(s)

0    <built-in function sum>
1    <built-in function max>
2    <built-in function min>
dtype: object


In [6]:
# Using an index

s = pd.Series([4, 5, 6], index=['p', 'c', 'm'])

# s['c'] looks the value up by label. Positional access goes through .iloc:
# passing an integer to [] on a Series whose index is not integer-typed is
# deprecated, because [] is meant to be label-based.
print(s['c'], s.iloc[1])

5 5


In [7]:
# ----------------------------------------------Data Frames--------------------------------------------------------

index = list('ABCD')
column = list('XYZ')
data = [
    [1, -2, 3],
    [-2, 3, -4],
    [3, 4, -5],
    [4, 5, -6],
]

# One inner list per row; rows are labelled by `index`, columns by `column`.
df = pd.DataFrame(data=data, index=index, columns=column)
df

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4
C,3,4,-5
D,4,5,-6


In [8]:
# Selecting a single column by name

print(df['X'])                   # a single label returns a Series

A    1
B   -2
C    3
D    4
Name: X, dtype: int64


In [9]:
df[['X', 'Y']]                   # a list of labels returns a DataFrame

Unnamed: 0,X,Y
A,1,-2
B,-2,3
C,3,4
D,4,5


In [10]:
df['new'] = df['X'] + df['Y']    # create a new column as the element-wise sum of X and Y
df

Unnamed: 0,X,Y,Z,new
A,1,-2,3,-1
B,-2,3,-4,1
C,3,4,-5,7
D,4,5,-6,9


In [11]:
# Dropping columns: drop() returns a new frame, so rebind the name rather than
# mutating the existing object with inplace=True.
df = df.drop(columns=['new'])
df

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4
C,3,4,-5
D,4,5,-6


In [12]:
# Dropping rows by label: drop() returns a new frame, so rebind the name rather
# than mutating the existing object with inplace=True.
df = df.drop(index=['D'])
df

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4
C,3,4,-5


In [13]:
# Selecting rows by label

df.loc['A']           # a single label returns the row as a Series

X    1
Y   -2
Z    3
Name: A, dtype: int64

In [14]:
df.loc[['A', 'B']]    # a list of labels returns a DataFrame

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4


In [15]:
df.iloc[0]            # position-based row access: the first row, returned as a Series

X    1
Y   -2
Z    3
Name: A, dtype: int64

In [16]:
df.iloc[[0,1]]        # a list of positions returns a DataFrame

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4


In [17]:
# Selecting a single value by (row label, column label)

print(df.loc['A', 'Z'])

3


In [18]:
# Selecting a sub DataFrame: a list of row labels and a list of column labels

df.loc[['A','B'], ['X','Y']]

Unnamed: 0,X,Y
A,1,-2
B,-2,3


In [19]:
# Conditional selection: comparing the frame with a scalar yields a boolean
# frame of the same shape

df > 0

Unnamed: 0,X,Y,Z
A,True,False,True
B,False,True,False
C,True,True,False


In [20]:
df[df>0]              # values that are not > 0 (negatives and zero) are replaced by NaN

Unnamed: 0,X,Y,Z
A,1.0,,3.0
B,,3.0,
C,3.0,4.0,


In [25]:
df                    # df itself still holds the original integer values

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4
C,3,4,-5


In [26]:
df[df['X']>0]         # keep only the rows whose X value is positive

Unnamed: 0,X,Y,Z
A,1,-2,3
C,3,4,-5


In [27]:
# Selecting a column after a condition: pass the row mask and the column label
# to a single .loc call instead of chaining two [] lookups (df[mask]['Y']),
# which builds an intermediate frame first.
df.loc[df['X'] > 0, 'Y']

A   -2
C    4
Name: Y, dtype: int64

In [28]:
df                    # the full frame, shown for reference

Unnamed: 0,X,Y,Z
A,1,-2,3
B,-2,3,-4
C,3,4,-5


In [29]:
df[(df['X']>0) & (df['Y']!=4)]      # multiple conditions: combine the masks with & and parenthesize each comparison

Unnamed: 0,X,Y,Z
A,1,-2,3


In [30]:
df['name'] = 'Ram Nam Vam'.split()   # one name per row, obtained by splitting the string on whitespace
df

Unnamed: 0,X,Y,Z,name
A,1,-2,3,Ram
B,-2,3,-4,Nam
C,3,4,-5,Vam


In [31]:
df.set_index('name')               # returns a new frame indexed by 'name'; the result is not assigned, so df keeps its index

Unnamed: 0_level_0,X,Y,Z
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ram,1,-2,3
Nam,-2,3,-4
Vam,3,4,-5


In [32]:
# ----------------------------------------------Missing Data--------------------------------------------------------

# Drop the string column first: `df > 0` compares every column with 0, and
# comparing strings with an int raises TypeError.
df = df.drop(columns='name')
# Keep only positive entries; everything else becomes NaN, which gives the
# cells in this section missing values to work with.
df = df[df > 0]
df

Unnamed: 0,X,Y,Z
A,1.0,,3.0
B,,3.0,
C,3.0,4.0,


In [33]:
df['W'] = [2,3,5]          # add a column W that has no missing values
df.loc['D'] = [5,3,1,4]    # assigning to a new label appends row D (values in column order X, Y, Z, W)
df

Unnamed: 0,X,Y,Z,W
A,1.0,,3.0,2
B,,3.0,,3
C,3.0,4.0,,5
D,5.0,3.0,1.0,4


In [34]:
df.dropna()             # drop every row that contains a NaN; options: axis, how, thresh, subset, inplace

Unnamed: 0,X,Y,Z,W
D,5.0,3.0,1.0,4


In [35]:
df.dropna(axis=1)       # drop every column that contains a NaN

Unnamed: 0,W
A,2
B,3
C,5
D,4


In [36]:
df.dropna(how='all')    # drop a row only if all of its values are NaN (how is 'any' or 'all')

Unnamed: 0,X,Y,Z,W
A,1.0,,3.0,2
B,,3.0,,3
C,3.0,4.0,,5
D,5.0,3.0,1.0,4


In [37]:
df.dropna(thresh=3)     # keep only the rows with at least 3 non-NaN values

Unnamed: 0,X,Y,Z,W
A,1.0,,3.0,2
C,3.0,4.0,,5
D,5.0,3.0,1.0,4


In [38]:
# Filling the NaN values with a constant

df.fillna(0)

Unnamed: 0,X,Y,Z,W
A,1.0,0.0,3.0,2
B,0.0,3.0,0.0,3
C,3.0,4.0,0.0,5
D,5.0,3.0,1.0,4


In [39]:
df['X'].fillna(df['X'].mean())   # fill X's NaN with the mean of its non-missing values

A    1.0
B    3.0
C    3.0
D    5.0
Name: X, dtype: float64

In [40]:
# ----------------------------------------------GroupBy----------------------------------------------------------

# Two sales records for each of three companies.
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [41]:
# Group the rows by company, then average each group's numeric columns.
# numeric_only=True skips the string column `Person`; without it pandas >= 2.0
# raises TypeError when it tries to average strings.
grp = df.groupby('Company')
grp.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [42]:
grp.describe()        # per-company summary statistics (count, mean, std, min, quartiles, max)

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [43]:
grp.describe().transpose()   # the same statistics with one column per company

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [44]:
# ------------------------------------Merging, Joining, Concatenating--------------------------------------------

# Two frames with the same columns A-D and disjoint row labels (0-3 and 4-7).
# Every cell holds its column letter followed by its row label, e.g. 'B2'.
df1 = pd.DataFrame({col: [col + str(row) for row in [0, 1, 2, 3]] for col in 'ABCD'},
                   index=[0, 1, 2, 3])

df2 = pd.DataFrame({col: [col + str(row) for row in [4, 5, 6, 7]] for col in 'ABCD'},
                   index=[4, 5, 6, 7])

In [45]:
df1    # rows labelled 0-3

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [46]:
df2    # rows labelled 4-7

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [47]:
pd.concat([df1, df2])    # default axis=0: df2's rows are stacked below df1's

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [48]:
pd.concat([df1, df2], axis=1)    # side by side, aligned on row label; the labels do not overlap, so the gaps are NaN

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [49]:
# ------------------------------------------------Operations------------------------------------------------------

In [50]:
df    # the Company / Person / Sales frame built in the GroupBy section

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [51]:
df['Sales'].unique()        # array of the distinct values

array([200, 120, 340, 124, 243, 350])

In [52]:
df['Sales'].nunique()       # number of distinct values

6

In [53]:
df['Sales'].value_counts()  # how many times each value occurs

120    1
350    1
340    1
124    1
243    1
200    1
Name: Sales, dtype: int64

In [54]:
# Applying functions

def mul(x):
    """Return x multiplied by 2."""
    return x*2

df['Sales'].apply(mul)    # calls mul on each element of the column

0    400
1    240
2    680
3    248
4    486
5    700
Name: Sales, dtype: int64

In [85]:
d = pd.DataFrame([[1,2,3], [2,3,4]], index=['A', 'B'], columns=['X', 'Y', 'Z'])

print(d)

# axis=1 hands each row to the function as a Series, so this subtracts every
# row's own mean from its values.
centered = d.apply(lambda a: a - a.mean(), axis=1)
centered

   X  Y  Z
A  1  2  3
B  2  3  4


Unnamed: 0,X,Y,Z
A,-1.0,0.0,1.0
B,-1.0,0.0,1.0


In [71]:
# NOTE(review): the saved output of this cell already lacks the 'Company'
# column and its execution count (71) is out of sequence, so it was run after
# the cell that deletes 'Company'. On a fresh top-to-bottom run the column
# would still be shown here.
df

Unnamed: 0,Person,Sales
0,Sam,200
1,Charlie,120
2,Amy,340
3,Vanessa,124
4,Carl,243
5,Sarah,350


In [57]:
# Deleting a column (del only handles columns; rows are removed with df.drop)

del df['Company']    # mutates df in place; re-running this cell raises KeyError because the column is already gone
df

Unnamed: 0,Person,Sales
0,Sam,200
1,Charlie,120
2,Amy,340
3,Vanessa,124
4,Carl,243
5,Sarah,350


In [58]:
# Column and index labels

df.columns

Index(['Person', 'Sales'], dtype='object')

In [59]:
df.index    # the default RangeIndex, since no index was supplied when the frame was built

RangeIndex(start=0, stop=6, step=1)

In [60]:
df.sort_values(by='Sales')    # ascending by default; returns a sorted copy in which rows keep their original labels

Unnamed: 0,Person,Sales
1,Charlie,120
3,Vanessa,124
0,Sam,200
4,Carl,243
2,Amy,340
5,Sarah,350


In [61]:
df.dropna()          # drop rows containing NaN; this frame has none, so every row is kept

Unnamed: 0,Person,Sales
0,Sam,200
1,Charlie,120
2,Amy,340
3,Vanessa,124
4,Carl,243
5,Sarah,350
