In [3]:
import numpy as np

# Hard-coded sample data held as NumPy arrays for the analysis cells that follow.
# Vote counts are plain integers.
votes = np.array([775, 787, 918, 88, 166, 286, 2556, 324, 504, 402])

# Costs are kept as raw text: every entry still carries literal single quotes
# inside the string, so this array has a string dtype rather than a numeric one.
raw_cost_labels = [
    "'800.0'", "'800.0'", "'800.0'", "'300.0'", "'600.0'",
    "'600.0'", "'600.0'", "'700.0'", "'550.0'", "'500.0'",
]
costs = np.array(raw_cost_labels)

print("Votes (Array):", votes)
print("Costs (Array):", costs)

Votes (Array): [ 775  787  918   88  166  286 2556  324  504  402]
Costs (Array): ["'800.0'" "'800.0'" "'800.0'" "'300.0'" "'600.0'" "'600.0'" "'600.0'"
 "'700.0'" "'550.0'" "'500.0'"]


In [8]:
ratings = [4,4,3,4,5,4]

In [10]:
ratings

[4, 4, 3, 4, 5, 4]

In [9]:
ratings*2

[4, 4, 3, 4, 5, 4, 4, 4, 3, 4, 5, 4]

In [11]:
votes * 2

array([1550, 1574, 1836,  176,  332,  572, 5112,  648, 1008,  804])

In [14]:
import time
import numpy as np

# Build the same 1,000,000 integers twice -- once as a Python list and once as
# a NumPy array -- so both summation strategies operate on identical data.
large_list = list(range(1_000_000))
large_array = np.arange(1_000_000)

# Elapsed time is measured with time.perf_counter() rather than time.time():
# perf_counter is monotonic and uses the highest-resolution clock available,
# whereas time.time() is a wall clock that can be adjusted while the cell runs
# and may be too coarse for millisecond-scale measurements.

# Timing the built-in sum on the Python list
start = time.perf_counter()
sum_list = sum(large_list)
end = time.perf_counter()
list_time = end - start

# Timing np.sum on the NumPy array
start = time.perf_counter()
sum_array = np.sum(large_array)
end = time.perf_counter()
array_time = end - start

print("List sum time:", list_time, "seconds")
print("Array sum time:", array_time, "seconds")

List sum time: 0.008781194686889648 seconds
Array sum time: 0.0012989044189453125 seconds


In [13]:
# Deliberate failure: the built-in range() only accepts integer arguments, so a
# step of 0.5 raises TypeError (see the output below). For fractional steps use
# np.arange(1, 10, 0.5) instead.
range(1,10,0.5)

TypeError: 'float' object cannot be interpreted as an integer

In [15]:
# NOTE: despite the "subset" names, no slicing is applied here -- both names
# refer to the complete votes and costs arrays (10 elements each).
subset_votes = votes
subset_costs = costs

# Create a 2D array with one row per restaurant and 2 columns
# (each row: [vote_count, cost]); with the 10-element inputs the shape is (10, 2).
# Because costs still holds strings at this point, NumPy stores every element
# of the combined array as text -- the vote counts become strings as well
# (visible in the printed output).
two_d_data = np.array([
    subset_votes,
    subset_costs
]).T  # transpose so that each row corresponds to a single restaurant

# Report the contents plus the basic shape attributes of the combined array.
print("2D Array:\n", two_d_data)
print("Shape:", two_d_data.shape)
print("Dimensions:", two_d_data.ndim)
print("Size:", two_d_data.size)

2D Array:
 [['775' "'800.0'"]
 ['787' "'800.0'"]
 ['918' "'800.0'"]
 ['88' "'300.0'"]
 ['166' "'600.0'"]
 ['286' "'600.0'"]
 ['2556' "'600.0'"]
 ['324' "'700.0'"]
 ['504' "'550.0'"]
 ['402' "'500.0'"]]
Shape: (10, 2)
Dimensions: 2
Size: 20


In [21]:
votes.shape

(10,)

In [22]:
print("Column dtype before conversion:", costs.dtype)

Column dtype before conversion: <U7


In [23]:
print("Column dtype before conversion:", votes.dtype)

Column dtype before conversion: int64


In [24]:
# STEP 1: Start from a string version of the data.
# On the first run costs already holds strings, so the values are unchanged.
# If the cell is re-run after costs has become float, this converts the numbers
# back to text so the np.char calls below keep working instead of failing on a
# non-string array.
cost_text = costs.astype(str)

# STEP 2: Strip formatting characters. np.char.replace is the element-wise
# counterpart of the str.replace method used in plain Python.
cost_text = np.char.replace(cost_text, ',', '')  # Remove commas
cost_text = np.char.replace(cost_text, "'", '')  # Remove single quotes

# STEP 3: Convert the cleaned strings to float
costs = cost_text.astype(float)

# STEP 4: Confirm the dtype
print("Array dtype after conversion:", costs.dtype)
print("Cleaned costs:", costs)

Array dtype after conversion: float64
Cleaned costs: [800. 800. 800. 300. 600. 600. 600. 700. 550. 500.]


In [25]:
# An array built from a mix of ints and strings gets a single common dtype;
# as the printed dtype shows, every element ends up stored as text.
mixed_values = [100, '200', 300, 'Hundred']
mixed_data = np.array(mixed_values)

print("Mixed data array:", mixed_data)
print("Mixed data dtype:", mixed_data.dtype)

Mixed data array: ['100' '200' '300' 'Hundred']
Mixed data dtype: <U21


In [26]:
# Deliberate failure: 'Hundred' cannot be parsed as an integer, so casting the
# string array to int raises ValueError (see the output below) and mixed_int is
# not assigned by this cell.
mixed_int = mixed_data.astype(int)

ValueError: invalid literal for int() with base 10: np.str_('Hundred')

In [27]:
mixed_data = np.array([100, '200', 300, '400'])
print("Mixed data array:", mixed_data)
print("Mixed data dtype:", mixed_data.dtype)

Mixed data array: ['100' '200' '300' '400']
Mixed data dtype: <U21


In [28]:
mixed_int = mixed_data.astype(int)

In [29]:
mixed_int

array([100, 200, 300, 400])

In [30]:
mixed_data.dtype

dtype('<U21')

In [31]:
mixed_data = mixed_data.astype(int)

In [32]:
mixed_data.dtype

dtype('int64')

In [33]:
mixed_data

array([100, 200, 300, 400])

In [34]:
mixed_data = np.array([100, '200', 300, '400'])

In [37]:
mixed_int = mixed_data.astype(int)

In [36]:
mixed_data.dtype

dtype('<U21')

In [38]:
votes

array([ 775,  787,  918,   88,  166,  286, 2556,  324,  504,  402])

In [39]:
votes[1]

np.int64(787)

In [40]:
votes[-1]

np.int64(402)

In [41]:
two_d_data[: , : ]

array([['775', "'800.0'"],
       ['787', "'800.0'"],
       ['918', "'800.0'"],
       ['88', "'300.0'"],
       ['166', "'600.0'"],
       ['286', "'600.0'"],
       ['2556', "'600.0'"],
       ['324', "'700.0'"],
       ['504', "'550.0'"],
       ['402', "'500.0'"]], dtype='<U21')

In [42]:
two_d_data[:4 , : ]

array([['775', "'800.0'"],
       ['787', "'800.0'"],
       ['918', "'800.0'"],
       ['88', "'300.0'"]], dtype='<U21')

In [43]:
two_d_data[:, :1 ]

array([['775'],
       ['787'],
       ['918'],
       ['88'],
       ['166'],
       ['286'],
       ['2556'],
       ['324'],
       ['504'],
       ['402']], dtype='<U21')

In [44]:
votes

array([ 775,  787,  918,   88,  166,  286, 2556,  324,  504,  402])

In [45]:
votes < 800

array([ True,  True, False,  True,  True,  True, False,  True,  True,
        True])

In [49]:
votes[votes< 2550]

array([775, 787, 918,  88, 166, 286, 324, 504, 402])

In [46]:
votes[1]

np.int64(787)

In [50]:
# Hard-coded sample of 50 restaurants: vote counts and the matching costs,
# listed in the same order so that position i in both arrays is one restaurant.
sample_votes = np.array([
    775, 787, 918, 88, 166, 286, 2556, 324, 504, 402,
    150, 164, 424, 918, 90, 133, 144, 93, 62, 180,
    62, 148, 219, 506, 172, 415, 230, 1647, 4884, 133,
    286, 540, 2556, 36, 244, 804, 679, 245, 345, 618,
    1047, 627, 354, 244, 163, 808, 1720, 868, 520, 299,
])
sample_costs = np.array([
    800.0, 800.0, 800.0, 300.0, 600.0, 600.0, 600.0, 700.0, 550.0, 500.0,
    600.0, 500.0, 450.0, 800.0, 650.0, 800.0, 700.0, 300.0, 400.0, 500.0,
    600.0, 550.0, 600.0, 500.0, 750.0, 500.0, 650.0, 600.0, 750.0, 200.0,
    500.0, 800.0, 600.0, 400.0, 300.0, 450.0, 850.0, 300.0, 400.0, 750.0,
    450.0, 450.0, 800.0, 800.0, 800.0, 850.0, 400.0, 1200.0, 300.0, 300.0,
])

# Pair the two 1D arrays column-wise: one row per restaurant, with votes in
# column 0 and costs in column 1.
restaurants_data = np.column_stack([sample_votes, sample_costs])

print("2D Array (votes, costs):\n", restaurants_data)
print("Shape:", restaurants_data.shape)
print("Dimensions:", restaurants_data.ndim)  # 2D

2D Array (votes, costs):
 [[ 775.  800.]
 [ 787.  800.]
 [ 918.  800.]
 [  88.  300.]
 [ 166.  600.]
 [ 286.  600.]
 [2556.  600.]
 [ 324.  700.]
 [ 504.  550.]
 [ 402.  500.]
 [ 150.  600.]
 [ 164.  500.]
 [ 424.  450.]
 [ 918.  800.]
 [  90.  650.]
 [ 133.  800.]
 [ 144.  700.]
 [  93.  300.]
 [  62.  400.]
 [ 180.  500.]
 [  62.  600.]
 [ 148.  550.]
 [ 219.  600.]
 [ 506.  500.]
 [ 172.  750.]
 [ 415.  500.]
 [ 230.  650.]
 [1647.  600.]
 [4884.  750.]
 [ 133.  200.]
 [ 286.  500.]
 [ 540.  800.]
 [2556.  600.]
 [  36.  400.]
 [ 244.  300.]
 [ 804.  450.]
 [ 679.  850.]
 [ 245.  300.]
 [ 345.  400.]
 [ 618.  750.]
 [1047.  450.]
 [ 627.  450.]
 [ 354.  800.]
 [ 244.  800.]
 [ 163.  800.]
 [ 808.  850.]
 [1720.  400.]
 [ 868. 1200.]
 [ 520.  300.]
 [ 299.  300.]]
Shape: (50, 2)
Dimensions: 2


In [51]:
restaurants_data.size

100

In [53]:
resturants_reshaped_data = restaurants_data.reshape(25,4)

In [54]:
resturants_reshaped_data.shape

(25, 4)

In [55]:
 # Deliberate failure: a (25, 5) shape needs 125 elements but the array holds
 # 100, so this call raises ValueError (see the output below).
 restaurants_data.reshape(25,5)

ValueError: cannot reshape array of size 100 into shape (25,5)

In [56]:
restaurants_data.reshape(25,-1)

array([[ 775.,  800.,  787.,  800.],
       [ 918.,  800.,   88.,  300.],
       [ 166.,  600.,  286.,  600.],
       [2556.,  600.,  324.,  700.],
       [ 504.,  550.,  402.,  500.],
       [ 150.,  600.,  164.,  500.],
       [ 424.,  450.,  918.,  800.],
       [  90.,  650.,  133.,  800.],
       [ 144.,  700.,   93.,  300.],
       [  62.,  400.,  180.,  500.],
       [  62.,  600.,  148.,  550.],
       [ 219.,  600.,  506.,  500.],
       [ 172.,  750.,  415.,  500.],
       [ 230.,  650., 1647.,  600.],
       [4884.,  750.,  133.,  200.],
       [ 286.,  500.,  540.,  800.],
       [2556.,  600.,   36.,  400.],
       [ 244.,  300.,  804.,  450.],
       [ 679.,  850.,  245.,  300.],
       [ 345.,  400.,  618.,  750.],
       [1047.,  450.,  627.,  450.],
       [ 354.,  800.,  244.,  800.],
       [ 163.,  800.,  808.,  850.],
       [1720.,  400.,  868., 1200.],
       [ 520.,  300.,  299.,  300.]])

In [57]:
restaurants_data.reshape(-1,5)

array([[ 775.,  800.,  787.,  800.,  918.],
       [ 800.,   88.,  300.,  166.,  600.],
       [ 286.,  600., 2556.,  600.,  324.],
       [ 700.,  504.,  550.,  402.,  500.],
       [ 150.,  600.,  164.,  500.,  424.],
       [ 450.,  918.,  800.,   90.,  650.],
       [ 133.,  800.,  144.,  700.,   93.],
       [ 300.,   62.,  400.,  180.,  500.],
       [  62.,  600.,  148.,  550.,  219.],
       [ 600.,  506.,  500.,  172.,  750.],
       [ 415.,  500.,  230.,  650., 1647.],
       [ 600., 4884.,  750.,  133.,  200.],
       [ 286.,  500.,  540.,  800., 2556.],
       [ 600.,   36.,  400.,  244.,  300.],
       [ 804.,  450.,  679.,  850.,  245.],
       [ 300.,  345.,  400.,  618.,  750.],
       [1047.,  450.,  627.,  450.,  354.],
       [ 800.,  244.,  800.,  163.,  800.],
       [ 808.,  850., 1720.,  400.,  868.],
       [1200.,  520.,  300.,  299.,  300.]])

In [58]:
restaurants_data[:, 1] < 500

array([False, False, False,  True, False, False, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False,  True,  True,  True,
       False,  True,  True, False,  True,  True, False, False, False,
       False,  True, False,  True,  True])

In [60]:
np.sum(restaurants_data[:, 0])

np.float64(30583.0)

In [62]:
np.sum(restaurants_data, axis = 0)

array([30583., 29350.])

In [63]:
np.where(restaurants_data[:, 1] > 500 )

(array([ 0,  1,  2,  4,  5,  6,  7,  8, 10, 13, 14, 15, 16, 20, 21, 22, 24,
        26, 27, 28, 31, 32, 36, 39, 42, 43, 44, 45, 47]),)

In [64]:
restaurants_data.shape

(50, 2)

In [65]:
# Select the complete rows (votes, cost) of every restaurant whose cost
# (column 1) is above 500. np.where returns the indices of the matching rows,
# and those indices are then used to pick the rows out of the 2D array.
# To filter on several conditions at once, AND the parenthesised comparisons
# inside np.where, e.g. (cost_mask) & (votes_mask).
restaurants_data[np.where(restaurants_data[:, 1] > 500)]

array([[ 775.,  800.],
       [ 787.,  800.],
       [ 918.,  800.],
       [ 166.,  600.],
       [ 286.,  600.],
       [2556.,  600.],
       [ 324.,  700.],
       [ 504.,  550.],
       [ 150.,  600.],
       [ 918.,  800.],
       [  90.,  650.],
       [ 133.,  800.],
       [ 144.,  700.],
       [  62.,  600.],
       [ 148.,  550.],
       [ 219.,  600.],
       [ 172.,  750.],
       [ 230.,  650.],
       [1647.,  600.],
       [4884.,  750.],
       [ 540.,  800.],
       [2556.,  600.],
       [ 679.,  850.],
       [ 618.,  750.],
       [ 354.,  800.],
       [ 244.,  800.],
       [ 163.,  800.],
       [ 808.,  850.],
       [ 868., 1200.]])