<a href="https://colab.research.google.com/github/plus2net/numpy/blob/main/numpy_3_pandas_interop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![alt text](https://www.plus2net.com/images/top2.jpg)        Read more on [Pandas Interoperability Guide ](https://www.plus2net.com/python/numpy-pandas-interop.php) | [ Numpy ](https://www.plus2net.com/python/numpy.php)

In [14]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [1.5, np.nan, 3.2],
    'c': pd.Categorical(['x','y','x'])
})

A = df.to_numpy()       # may become object if columns are mixed
print(A.dtype)

# Column subset with homogeneous types -> better numeric dtype
A_num = df[['a','b']].to_numpy(dtype='float64')
print(A_num.dtype, A_num.shape)

object
float64 (3, 2)


In [15]:
X = np.arange(6).reshape(3,2)
df2 = pd.DataFrame(X, columns=['col1','col2'], index=['r1','r2','r3'])
s = pd.Series(np.array([10, 20, 30]), index=df2.index, name='bonus')
print(df2)
print(s)

    col1  col2
r1     0     1
r2     2     3
r3     4     5
r1    10
r2    20
r3    30
Name: bonus, dtype: int64


In [16]:
dfA = pd.DataFrame({'x':[1,2,3]}, index=['a','b','c'])
dfB = pd.DataFrame({'x':[10,20,30]}, index=['b','c','d'])

# Pandas aligns by index labels
print((dfA + dfB))
#   rows 'a' and 'd' become NaN due to misalignment

# NumPy would add positionally if you convert to arrays:
print(dfA.to_numpy() + dfB.to_numpy())  # shapes must match

      x
a   NaN
b  12.0
c  23.0
d   NaN
[[11]
 [22]
 [33]]


In [17]:
df = pd.DataFrame({'i':[1, None, 3], 'f':[1.0, np.nan, 3.2]})
print(df.dtypes)              # Int64 (nullable) and float64
A = df.to_numpy()
print(A.dtype)                # may be object to hold mixed types

# Strategy: convert subsets with a numeric dtype
A_float = df[['f']].to_numpy(dtype='float64')  # preserves NaN

i    float64
f    float64
dtype: object
float64


In [18]:
df = pd.DataFrame({'x': np.arange(5), 'y': np.arange(5,10)})
xz = (df['x'].to_numpy() - df['x'].mean()) / df['x'].std()
df['x_z'] = xz
print(df.head())

   x  y       x_z
0  0  5 -1.264911
1  1  6 -0.632456
2  2  7  0.000000
3  3  8  0.632456
4  4  9  1.264911


In [19]:
X = np.random.randn(4,3)
mu = X.mean(axis=0)
sd = X.std(axis=0)

stats = pd.DataFrame({'mean': mu, 'std': sd}, index=['c1','c2','c3'])
print(stats)

        mean       std
c1  0.703543  0.585464
c2 -0.387941  0.925212
c3 -0.722612  0.593090


In [20]:
dates = pd.date_range('2025-01-01', periods=3, freq='D')
df = pd.DataFrame({'when': dates, 'val':[1,2,3]})
print(df['when'].dtype)                   # datetime64[ns]
dt64 = df['when'].to_numpy()              # NumPy datetime64[ns]
print(dt64.dtype)

# Categorical -> use .cat.codes if you need numeric arrays
cats = pd.Categorical(['low','med','high'])
arr_codes = pd.Series(cats).cat.codes.to_numpy()

datetime64[ns]
datetime64[ns]


In [21]:
# Example: compute standardized column with NumPy, join back:
vals = df[['val']].to_numpy(dtype='float64').ravel()
z = (vals - vals.mean()) / vals.std()
df['z_val'] = z

In [22]:
rec = np.array([(1, 1.5), (2, 2.5)], dtype=[('id','i4'),('score','f8')])
df_rec = pd.DataFrame.from_records(rec)
print(df_rec)

   id  score
0   1    1.5
1   2    2.5


In [23]:
import numpy as np, pandas as pd

# 1) Convert a mixed-type DataFrame; keep numeric block as float64
df = pd.DataFrame({'a':[1,2,3], 'b':[1.1, np.nan, 3.3], 'c':['x','y','x']})
num = df[['a','b']].to_numpy(dtype='float64')
print(num.dtype, num.shape)

# 2) Create two frames with different indexes; show alignment and fix with reindex
A = pd.DataFrame({'v':[1,2,3]}, index=['r1','r2','r3'])
B = pd.DataFrame({'v':[10,20,30]}, index=['r2','r3','r4'])
print(A + B)                      # alignment -> NaNs
print((A.reindex(B.index) + B))   # aligned on B's index

# 3) Standardize each numeric column using NumPy and assign back
X = df[['a','b']]
arr = X.to_numpy(dtype='float64')
Z = (arr - arr.mean(axis=0)) / arr.std(axis=0)
df[['a_z','b_z']] = Z
print(df.head())

float64 (3, 2)
       v
r1   NaN
r2  12.0
r3  23.0
r4   NaN
       v
r2  12.0
r3  23.0
r4   NaN
   a    b  c       a_z  b_z
0  1  1.1  x -1.224745  NaN
1  2  NaN  y  0.000000  NaN
2  3  3.3  x  1.224745  NaN
