In [49]:
import numpy as np

# load CSV data
data = np.genfromtxt("AI_Developer_Performance_Extended_1000.csv",
    delimiter=",")

print("Shape:", data.shape)
print("First 5 rows:\n", data[:5])

Shape: (1001, 13)
First 5 rows:
 [[  nan   nan   nan   nan   nan   nan   nan   nan   nan   nan   nan   nan
    nan]
 [  7.  416.    9.    7.    6.    5.9  92.   34.    7.   99.   10.5  20.
    3. ]
 [  4.  269.   16.   13.    5.    5.1  85.   36.    2.  100.    9.5  17.
    8. ]
 [ 11.  439.    3.    0.    2.    6.2  38.   79.    2.   55.   18.3  35.
    2. ]
 [  8.  472.   15.    9.    4.    4.2  26.   94.    5.   30.   12.6  28.
    4. ]]


In [50]:
# total NaNs
np.isnan(data).sum()

np.int64(13)

In [51]:
# column-wise NaNs
np.isnan(data).sum(axis=0)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [52]:
# row-wise NaNs
np.isnan(data).sum(axis=1)

array([13,  0,  0, ...,  0,  0,  0], shape=(1001,))

In [53]:
clean_data = data[~np.isnan(data).any(axis=1)]

print("Original shape:", data.shape)
print("Clean shape:", clean_data.shape)


Original shape: (1001, 13)
Clean shape: (1000, 13)


In [54]:
# first row
first_row = clean_data[0]

# first column (feature)
first_column = clean_data[:, 0]

# first 10 rows, first 3 columns
subset = clean_data[:10, :3]

print(first_row)
print(first_column[:5])
print(subset)

[  7.  416.    9.    7.    6.    5.9  92.   34.    7.   99.   10.5  20.
   3. ]
[ 7.  4. 11.  8.  5.]
[[  7. 416.   9.]
 [  4. 269.  16.]
 [ 11. 439.   3.]
 [  8. 472.  15.]
 [  5. 265.  19.]
 [  7. 196.  14.]
 [ 10. 516.   4.]
 [  3. 234.   5.]
 [  7. 440.  16.]
 [ 11. 297.   0.]]


In [55]:
# add 10 to all values
added = clean_data + 10

# multiply by 2
scaled = clean_data * 2

# square of values
squared = clean_data ** 2

print("Added sample:\n", added[:5])
print("Scaled sample:\n", scaled[:5])
print("Squared sample:\n", squared[:5])

Added sample:
 [[ 17.  426.   19.   17.   16.   15.9 102.   44.   17.  109.   20.5  30.
   13. ]
 [ 14.  279.   26.   23.   15.   15.1  95.   46.   12.  110.   19.5  27.
   18. ]
 [ 21.  449.   13.   10.   12.   16.2  48.   89.   12.   65.   28.3  45.
   12. ]
 [ 18.  482.   25.   19.   14.   14.2  36.  104.   15.   40.   22.6  38.
   14. ]
 [ 15.  275.   29.   26.   15.   18.1  92.   43.   16.   92.   17.   35.
   19. ]]
Scaled sample:
 [[ 14.  832.   18.   14.   12.   11.8 184.   68.   14.  198.   21.   40.
    6. ]
 [  8.  538.   32.   26.   10.   10.2 170.   72.    4.  200.   19.   34.
   16. ]
 [ 22.  878.    6.    0.    4.   12.4  76.  158.    4.  110.   36.6  70.
    4. ]
 [ 16.  944.   30.   18.    8.    8.4  52.  188.   10.   60.   25.2  56.
    8. ]
 [ 10.  530.   38.   32.   10.   16.2 164.   66.   12.  164.   14.   50.
   18. ]]
Squared sample:
 [[4.90000e+01 1.73056e+05 8.10000e+01 4.90000e+01 3.60000e+01 3.48100e+01
  8.46400e+03 1.15600e+03 4.90000e+01 9.80100e+03 1.1025

In [56]:
col_mean = np.nanmean(clean_data, axis=0)
row_sum  = np.nansum(clean_data, axis=0)
col_min  = np.nanmin(clean_data, axis=0)
col_max  = np.nanmax(clean_data, axis=0)

print("Column Mean:", col_mean[:5])
print("Row Sum (first 5):", row_sum[:5])
print("Column Min:", col_min[:5])
print("Column Max:", col_max[:5])

Column Mean: [  5.84  356.234   9.876   7.153   2.961]
Row Sum (first 5): [  5840. 356234.   9876.   7153.   2961.]
Column Min: [ 1. 26.  0.  0.  0.]
Column Max: [ 11. 993.  19.  19.   6.]


In [57]:
# normalize columns (feature scaling)
normalized = (clean_data - col_mean) / np.nanstd(clean_data, axis=0)

print("Normalized data sample:\n", normalized[:5])


Normalized data sample:
 [[ 0.36744203  0.31780072 -0.15121299 -0.02799383  1.50425657 -0.393242
   1.61178529 -1.08524762  1.55143197  1.49122492  0.30048946  0.25361258
  -0.54759816]
 [-0.58283908 -0.46385952  1.05710997  1.06980326  1.00927251 -0.94925756
   1.29004414 -0.98911442 -0.58434993  1.53698206  0.1335416  -0.02335784
   1.22571195]
 [ 1.63448352  0.4401013  -1.18691838 -1.30875709 -0.47567969 -0.18473617
  -0.87021789  1.07774923 -0.58434993 -0.52208887  1.60268279  1.6384647
  -0.90226019]
 [ 0.6842024   0.61557605  0.8844924   0.33793854  0.51428844 -1.57477506
  -1.42177416  1.79874817  0.69711921 -1.66601717  0.65107997  0.99220038
  -0.19293614]
 [-0.26607871 -0.48512918  1.57496266  1.6187018   1.00927251  1.13580078
   1.15215507 -1.13331421  1.12427559  0.71335368 -0.28382806  0.71522995
   1.58037398]]


In [58]:
import time

# convert to list
data_list = data.tolist()

# NumPy timing
start = time.time()
np.sum(data)
print("NumPy Time:", time.time() - start)

# Python list timing
start = time.time()
total = 0
for row in data_list:
    for val in row:
        total += val
print("Python List Time:", time.time() - start)


NumPy Time: 0.0
Python List Time: 0.0025177001953125


In [None]:
# save
np.save("Project_1_Numpy_Data_Explorer", clean_data)

# load
loaded_data = np.load("Project_1_Numpy_Data_Explorer.npy")

print("Loaded shape:", loaded_data.shape)

Loaded shape: (1000, 13)
