In [26]:
import pandas as pd

# Left-hand example frame: key column 'x1' plus an integer column 'x2'.
adf = pd.DataFrame({
    'x1': ['A', 'B', 'C'],
    'x2': [1, 2, 3],
})

In [27]:
adf

Unnamed: 0,x1,x2
0,A,1
1,B,2
2,C,3


In [28]:
# Right-hand example frame: shares the key column 'x1' with adf and adds 'x3'.
bdf = pd.DataFrame({
    'x1': ['A', 'B', 'D'],
    'x3': ['T', 'F', 'T'],
})

In [29]:
bdf

Unnamed: 0,x1,x3
0,A,T
1,B,F
2,D,T


In [30]:
# Left join: keep every row of adf and attach matching bdf columns;
# adf rows without a match in bdf get a missing value in 'x3'.
adf.merge(bdf, how='left')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F
2,C,3,


In [31]:
# Right join: keep every row of bdf and attach matching adf columns;
# bdf rows without a match in adf get a missing value in 'x2'.
adf.merge(bdf, how='right')

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,D,,T


In [32]:
# Inner join: keep only the rows whose key appears in both adf and bdf.
# 'inner' is also the default when `how` is not given.
adf.merge(bdf, how='inner')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F


In [33]:
# No `how` given, so the default inner join is used.
adf.merge(bdf)

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F


In [34]:
# Outer join: the union of the rows of adf and bdf.
# 'x2' is shown as floating point because the unmatched row ('D') gets NaN,
# and NaN cannot be stored in an integer column, so pandas upcasts it to float.
adf.merge(bdf, how="outer")

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,C,3.0,
3,D,,T


In [35]:
# Appending pandas DataFrames:
# stack adf and bdf on top of each other (row-wise concatenation).
# Columns missing from one frame are filled with NaN; sort=True sorts the column labels.

pd.concat(objs=[adf, bdf], axis=0, sort=True)

Unnamed: 0,x1,x2,x3
0,A,1.0,
1,B,2.0,
2,C,3.0,
0,A,,T
1,B,,F
2,D,,T


In [36]:
# Column-wise concatenation: place the frames side by side, aligned on the index.
# axis selects the direction: rows (0 / 'index') or columns (1 / 'columns').
pd.concat([adf, bdf], axis='columns')

Unnamed: 0,x1,x2,x1.1,x3
0,A,1,A,T
1,B,2,B,F
2,C,3,D,T


In [37]:
# when two dataframes have exactly the same columns

In [38]:
# Two single-row frames that share exactly the same columns.
df = pd.DataFrame([[1, 2]], columns=['a', 'b'])
df2 = pd.DataFrame([[5, 6]], columns=['a', 'b'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack rows. ignore_index=True
# renumbers the result 0..n-1 instead of keeping each frame's own index.
df = pd.concat([df, df2], ignore_index=True)

In [39]:
df

Unnamed: 0,a,b
0,1,2
1,5,6


In [40]:
# pandas’ merge and concat can be used to combine subsets of a DataFrame, or even data from different files.
# The join method combines DataFrames based on index or column.
# Joining two DataFrames can be done in multiple ways (left, right, inner, and outer) depending on what data must be in the final DataFrame.

In [41]:
# slicing dataframe based on largest value for a specific column

In [42]:
import numpy as np

# Small mixed-type frame; column 'c' contains one missing value (NaN).
df = pd.DataFrame({
    'a': [1, 10, 8, 11, -1],
    'b': ['a', 'b', 'c', 'd', 'e'],
    'c': [1.0, 2.0, np.nan, 3.0, 4.0],
})

In [43]:
df

Unnamed: 0,a,b,c
0,1,a,1.0
1,10,b,2.0
2,8,c,
3,11,d,3.0
4,-1,e,4.0


In [44]:
# Return the three rows with the largest values in column 'a', largest first.
# Whole rows are returned, so 'b' and 'c' come from those same rows.
df.nlargest(n=3, columns='a')

Unnamed: 0,a,b,c
3,11,d,3.0
1,10,b,2.0
2,8,c,


In [45]:
# Count the missing entries in column 'c' (isna is an alias of isnull).
df['c'].isna().sum()

1

In [46]:
# Count the missing entries in each column of the dataframe (one total per column).
df.isna().sum()

a    0
b    0
c    1
dtype: int64

In [47]:
# using the melt method in Pandas


In [48]:
# Build the example data: one row per weekday, one column of values per person.

data = {
    'weekday': [
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    ],
    'Person 1': [12, 6, 5, 8, 11, 6, 4],
    'Person 2': [10, 6, 11, 5, 8, 9, 12],
    'Person 3': [8, 5, 7, 3, 7, 11, 15],
}
# Column order follows the dict's key order: weekday first, then the persons.
df = pd.DataFrame(data, columns=list(data))

In [49]:
df

Unnamed: 0,weekday,Person 1,Person 2,Person 3
0,Monday,12,10,8
1,Tuesday,6,6,5
2,Wednesday,5,11,7
3,Thursday,8,5,3
4,Friday,11,8,7
5,Saturday,6,9,11
6,Sunday,4,12,15
