# Working with Data

## Concatenate & Concat

In [4]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

### 1、Numpy -- concatenate

In [2]:
# Two 3x3 arrays: the integers 0..8 laid out row by row, and an all-ones float array.
arr1 = np.arange(9).reshape((3, 3))
arr2 = np.ones(shape=(3, 3))
arr2

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [3]:
# Join the arrays along axis=1 (side by side): the result has 3 rows and 6 columns.
np.concatenate((arr1, arr2), axis=1)

array([[0., 1., 2., 1., 1., 1.],
       [3., 4., 5., 1., 1., 1.],
       [6., 7., 8., 1., 1., 1.]])

In [4]:
# Join the arrays along axis=0 (stacked vertically): the result has 6 rows and 3 columns.
np.concatenate((arr1, arr2), axis=0)

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

### 2、Pandas -- concat

In [5]:
# Build two Series whose indexes share exactly one label ('T').
ser1 = Series(data=[0, 1, 2], index=list('TUV'))
ser2 = Series(data=[3, 4, 5], index=list('XYT'))

# concat stacks along axis=0 by default; the duplicated 'T' label is kept as-is.
pd.concat([ser1, ser2])

T    0
U    1
V    2
X    3
Y    4
T    5
dtype: int64

In [6]:
# With axis=1 each Series becomes a column and the result is a DataFrame.
# Rows are aligned on index labels, so the shared label 'T' yields a single
# row with a value in both columns; labels present in only one Series get NaN.
pd.concat([ser1, ser2], axis=1, sort=False)

Unnamed: 0,0,1
T,0.0,5.0
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [7]:
# Restrict the result to a chosen set of row labels, in a chosen order.
# The `join_axes` argument of pd.concat was deprecated in pandas 0.25 and
# removed in 1.0, so the inputs are reindexed to the wanted labels before
# concatenating. Reindexing the inputs (rather than the result) keeps column 0
# as integers, because ser1 has a value for every requested label.
row_labels = ['U', 'V', 'T']
pd.concat([ser.reindex(row_labels) for ser in (ser1, ser2)], axis=1)

Unnamed: 0,0,1
U,1,
V,2,
T,0,5.0


In [8]:
# Tag each input with a key: the keys become the outer level of a
# hierarchical index on the concatenated result.
pd.concat([ser1, ser2], keys=['cat1', 'cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
      T    5
dtype: int64

In [10]:
# concat behaves the same way for DataFrames: columns are matched by name and
# a column missing from one of the inputs is filled with NaN for its rows.
dframe1 = DataFrame(np.random.randn(4, 3), columns=list('XYZ'))
dframe2 = DataFrame(np.random.randn(3, 3), columns=list('YQX'))
pd.concat([dframe1, dframe2], sort=False)

Unnamed: 0,X,Y,Z,Q
0,0.147398,2.439993,0.285204,
1,-0.93704,-0.963371,0.192508,
2,-0.077461,0.331864,0.120953,
3,-1.459874,-0.962567,0.917093,
0,-0.172481,0.614341,,-0.248273
1,-0.299031,1.717903,,-0.468955
2,-1.45063,1.815572,,0.58042


In [12]:
# When the original row labels carry no meaning, ignore_index=True discards
# them and numbers the rows of the combined DataFrame consecutively from 0.
pd.concat([dframe1, dframe2], sort=False, ignore_index=True)

Unnamed: 0,X,Y,Z,Q
0,0.147398,2.439993,0.285204,
1,-0.93704,-0.963371,0.192508,
2,-0.077461,0.331864,0.120953,
3,-1.459874,-0.962567,0.917093,
4,-0.172481,0.614341,,-0.248273
5,-0.299031,1.717903,,-0.468955
6,-1.45063,1.815572,,0.58042


## Combining

In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

In [5]:
# First Series: numeric values alternating with NaN.
ser1 = Series([2, np.nan, 4, np.nan, 6, np.nan],
              index=['Q', 'R', 'S', 'T', 'U', 'V'])

# Second Series: 0.0 .. len(ser1)-1 on the same labels as ser1.
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
              index=['Q', 'R', 'S', 'T', 'U', 'V'])
# Blank out the last element *by position*. With a string index, `ser2[-1]`
# relies on pandas' deprecated positional fallback for integer keys; treated
# as a label it would append a new entry -1 instead of overwriting 'V'.
ser2.iloc[-1] = np.nan

print(ser1)
ser2

Q    2.0
R    NaN
S    4.0
T    NaN
U    6.0
V    NaN
dtype: float64


Q    0.0
R    1.0
S    2.0
T    3.0
U    4.0
V    NaN
dtype: float64

In [7]:
# Element-wise choice: take the value from ser2 where ser1 is NaN, otherwise
# keep the value from ser1. np.where returns a plain ndarray, so the index is
# re-attached explicitly to get a Series back.
filled = np.where(pd.isnull(ser1), ser2, ser1)
print(filled)
Series(filled, index=ser1.index)

[ 2.  1.  4.  3.  6. nan]


Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    NaN
dtype: float64

In [9]:
# combine_first produces the same result in a single pandas call
ser1.combine_first(ser2)

# Values come from the calling Series (ser1); wherever it holds NaN, the value from the argument (ser2) is used instead

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    NaN
dtype: float64

In [10]:
# The same idea with DataFrames. The two frames differ in both rows and
# columns: dframe_odds has 4 rows and a 'Z' column, dframe_evens has 5 rows
# and no 'Z' column.
dframe_odds = DataFrame({
    'X': [1.0, np.nan, 3.0, np.nan],
    'Y': [np.nan, 5.0, np.nan, 7.0],
    'Z': [np.nan, 9.0, np.nan, 11.0],
})
dframe_evens = DataFrame({
    'X': [2.0, 4.0, np.nan, 6.0, 8.0],
    'Y': [np.nan, 10.0, 12.0, 14.0, 16.0],
})

print(dframe_odds)
dframe_evens

     X    Y     Z
0  1.0  NaN   NaN
1  NaN  5.0   9.0
2  3.0  NaN   NaN
3  NaN  7.0  11.0


Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [11]:
# Prefer the values of dframe_odds; wherever it has NaN (or has no such row), fall back to dframe_evens.
# The result spans the rows and columns of both frames.
dframe_odds.combine_first(dframe_evens)

Unnamed: 0,X,Y,Z
0,1.0,,
1,4.0,5.0,9.0
2,3.0,12.0,
3,6.0,7.0,11.0
4,8.0,16.0,
