In [1]:
import numpy as np

### Numpy vectorization 
* the process of converting an algorithm from operating on a single value at a time to operating on a set of values (vector) at one time.

In [5]:
# Example

a = np.array([1,4,5,6,9])
a* 4 # Here the algorithm multiplication is vectorized and can act on a set of values (in our case it is the list)

array([ 4, 16, 20, 24, 36])

### Numpy is faster than lists
* Numpy internally uses C arrays.

In [9]:
# Faster Execution speed
%timeit np.array(range(100000))

3.47 ms ± 30.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Slower as compared numpy storage
%timeit [range(100000)]

121 ns ± 1.39 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


### Initializing array using python list

In [45]:
arr1 = np.array([4, 11, 53, 2, 9, ])
arr1


array([ 4, 11, 53,  2,  9])

### Creates 2-D array of float data type
array([1., 2., 3.],
 [4., 5., 6.]])

In [46]:
arr2 = np.array([[1.,2.,3.],[4.,5.,6.]],dtype=float)

### Initializing numpy array range. Similar to
python range().
Step size can be float.

In [47]:
# Start, stop , step
np.arange(2,10,1.5)

array([2. , 3.5, 5. , 6.5, 8. , 9.5])

### Returns evenly spaced numbers over specified interval

In [48]:
# Start, stop , step
np.linspace(0,100,5) 

array([  0.,  25.,  50.,  75., 100.])

### Creates an array with all elements as 0
similar func: np.ones()

In [49]:
print(np.zeros((3)))

print(np.zeros((2,3)))



[0. 0. 0.]
[[0. 0. 0.]
 [0. 0. 0.]]


### Generates array with elements belonging
to continuous uniform distribution
Range: [0, 1)

In [50]:
# Shape can be mentioned inside
np.random.rand(3,2)

array([[0.32627133, 0.46321883],
       [0.72873441, 0.58350453],
       [0.20584249, 0.23265144]])

### number of dimensions of the array.

In [51]:
# shape of array
np.ndim(a)

1

### datatype of array.

In [52]:
arr2.dtype

dtype('float64')

### access element present at that index.
Index start from 0

In [53]:
arr1[2]
print(arr2)
arr2[1, 2]


[[1. 2. 3.]
 [4. 5. 6.]]


6.0

### Negative index based indexing

In [54]:
arr1[-1] # Last element

9

## Accessing Sequence(Slicing)

Slice out and get part of the numpy array.
Can use negative indexes for slicing as
well.
Slicing returns View not copy.

In [55]:
print(arr1)
arr1[3:]

[ 4 11 53  2  9]


array([2, 9])

In [56]:
arr1[:4]

array([ 4, 11, 53,  2])

In [57]:
arr1[1: 4: 2]

array([11,  2])

In [58]:
arr1[-4: -1]

array([11, 53,  2])

In [61]:
print(arr2)
arr2[:1, :]
# fetches first row
# [[2., 7. , 11.]]

[[1. 2. 3.]
 [4. 5. 6.]]


array([[1., 2., 3.]])

In [62]:
arr2[:, 2:] # arr[row slicing, column slicing]

array([[3.],
       [6.]])

## Indexing based on condition.
Masking creates a copy of the array not a
view.

In [65]:
arr1[arr1 > 8]

array([11, 53,  9])

In [66]:
arr1[(arr1 >5) & (arr1 <=11)]

array([11,  9])

## Operations

Arithmetic

In [70]:
a = np.array([1, 2, 3, 4])
b = np.array([1, 1, 2, 2])
a + b 

array([2, 3, 5, 6])

Element wise Addition and Subtraction

In [71]:
a - b

array([0, 1, 1, 2])

Returns matrix mul. of arrays provided the
condition for matrix multiplication is
satisfied.

In [75]:
mat1

array([[2],
       [1]])

In [76]:
mat2

array([[2, 4]])

In [74]:

mat1 = np.array([[2], [1]])
mat2 = np.array([[2, 4]])

np.matmul(mat1, mat2)

array([[4, 8],
       [2, 4]])

In [77]:
np.ones((2,3,4),dtype=np.int16)

array([[[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]],

       [[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]], dtype=int16)

In [91]:
arr = np.array([[1,2 ,3],[4 ,5 ,6],[7 ,8 ,9]])

arr.T[::,::-1]

array([[7, 4, 1],
       [8, 5, 2],
       [9, 6, 3]])

In [107]:
birds = np.array(['spoonbills',  'plovers',  'plovers',  'plovers',  'plovers',  'Cranes',  'plovers',  'plovers',  'Cranes',  'spoonbills'])
age = np.array([5.5, 6.0, 3.5, 1.5, 3.0, 4.0, 3.5, 2.0, 5.5, 6.0])

In [109]:
birds[age.argsort()]

array(['plovers', 'plovers', 'plovers', 'plovers', 'plovers', 'Cranes',
       'spoonbills', 'Cranes', 'plovers', 'spoonbills'], dtype='<U10')

In [114]:
arr = np.array([[1, 2, 3],
 [4, 5, 6],
 [7, 8, 9]])

In [126]:
arr[::2,:].flatten()

array([1, 2, 3, 7, 8, 9])

In [112]:
[1, 2, 3, 7, 8, 9]

[1, 2, 3, 7, 8, 9]

In [130]:
A = np.array([2, 0, 1, 9, 1, 1, 1, 0, 3, 5])
list(map(lambda x:x+1,A))
A+1
[x+1 for x in A]


[3, 1, 2, 10, 2, 2, 2, 1, 4, 6]

In [132]:
#a. Write the code using np.arange() to get all even numbers between 21 and 70, 
np.arange(22,71,2)
#a. Write the code using np.arange() to get all odd numbers between 20 and 71,
np.arange(21,72,2)



array([21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53,
       55, 57, 59, 61, 63, 65, 67, 69, 71])

In [138]:
np.round(np.arange(5, 7 ,0.5),2)

array([5. , 5.5, 6. , 6.5])

In [145]:
end = 1
n = 10
start = 5
k=3
for i in range(0,10):
    end = start + (i * k)
np.arange(start,end+k,k)

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32])

In [149]:
import numpy as np
print(np.sort(np.array(['Ram','Astha','Raghavendra'])))

['Astha' 'Raghavendra' 'Ram']


In [150]:
arr1 = np.array(['Ram','Astha','Brahat'])
arr2 = np.array(['Shyam','Kalyan','Naveen'])
arr1 > arr2

array([False, False, False])

In [155]:
arr = np.array([[1 ,2, 3],[4 ,5 ,6], [7 ,8 ,9]])

arr.T[:,::-1]

array([[7, 4, 1],
       [8, 5, 2],
       [9, 6, 3]])

In [156]:
birds = np.array(['spoonbills',  'plovers',  'plovers',  'plovers',  'plovers',  'Cranes',  'plovers',  'plovers',  'Cranes',  'spoonbills'])
age = np.array([5.5, 6.0, 3.5, 1.5, 3.0, 4.0, 3.5, 2.0, 5.5, 6.0])


In [158]:
indices = np.argsort(age)
birds[indices]

array(['plovers', 'plovers', 'plovers', 'plovers', 'plovers', 'Cranes',
       'spoonbills', 'Cranes', 'plovers', 'spoonbills'], dtype='<U10')

In [165]:
arr = np.array([[1, 2, 3] ,[4, 5, 6],[7,8,9]])
arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [170]:
arr[::2,::].flatten()

array([1, 2, 3, 7, 8, 9])

In [171]:
import numpy as np
X = np.arange(12).reshape(3,4)


In [184]:
arr = np.array([[5, 3, 9],[2, 1 ,4],[7 ,6 ,8]])
arr

array([[5, 3, 9],
       [2, 1, 4],
       [7, 6, 8]])

In [197]:
j =2
arr[arr[:,j-1].argsort()]

array([[2, 1, 4],
       [5, 3, 9],
       [7, 6, 8]])

In [201]:

np.argmax(np.array([6, 1, 6, 5, 7, 6, 0, 9, 0, 7]))

7

In [202]:
np.array(['sparrow', 'peacock', 'parrot', 'owl', 'peacock', 'macaw', 'macaw', 'parrot', 'macaw', 'peacock'] )[7]

'parrot'

In [204]:
np.ones((5,5))

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [244]:
arr = np.array([[ 0,  1,  2,  3],  
 [ 4,  5,  6,  7],   
 [ 8,  9, 10, 11],   
 [12, 13, 14, 15],   
 [16, 17, 18, 19]])

In [245]:
arr = arr[1:4]
arr

array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [255]:
arr = arr[::,-3::]

In [256]:
arr = arr[::-1]

In [257]:
arr

array([[13, 14, 15],
       [ 9, 10, 11],
       [ 5,  6,  7]])

In [260]:
np.identity(3) * k

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [278]:
arr = np.array([[1,1, 1 ,1 ,1]  ,
 [0, 2 ,2 ,9, 3]  ,
 [6 ,0 ,2 ,5 ,2]  ,
 [2, 4 ,3 ,5 ,5]]  )


In [279]:
arr2 = [[8, 6, 6 ,8 ,3]  ,
 [2 ,7 ,0 ,3 ,1]  ,
 [3 ,2 ,1 ,5, 2]  ,
 [7 ,0 ,7 ,6 ,8]  ,
 [1, 5 ,6 ,4 ,5]] 

In [280]:
arr_out =np.matmul(arr,arr2)
arr_out

array([[21, 20, 20, 26, 19],
       [76, 33, 83, 82, 93],
       [91, 50, 85, 96, 72],
       [73, 71, 80, 93, 81]])

In [275]:
arr1 = np.array([1,2,3])
arr2 = np.array([9,8,7])
np.dot(arr1, arr2)

46

In [277]:
arr1 = np.array([[1,2], [3,4]])
arr2 = np.array([[1], [2]])
np.dot(arr1, arr2)

 

array([[ 5],
       [11]])

In [None]:
[1,2][1,1]
[3,4]

In [295]:
arr = np.arange(10)
np.split(arr,[6,7])

[array([0, 1, 2, 3, 4, 5]), array([6]), array([7, 8, 9])]

In [298]:
arr =np.array([[0, 1, 2, 3],
 [4, 5, 6, 7],
 [8, 9, 10, 11],
 [12, 13, 14, 15],
 [16, 17, 18, 19],
 [20, 21, 22, 23]])



In [308]:
np.hsplit(arr,[2,3]) 

[array([[ 0,  1],
        [ 4,  5],
        [ 8,  9],
        [12, 13],
        [16, 17],
        [20, 21]]),
 array([[ 2],
        [ 6],
        [10],
        [14],
        [18],
        [22]]),
 array([[ 3],
        [ 7],
        [11],
        [15],
        [19],
        [23]])]

In [310]:
x = np.array([[200,200,200],[300,300,300],[400,400,400]])
x

array([[200, 200, 200],
       [300, 300, 300],
       [400, 400, 400]])

In [312]:
v = np.array([200,300,400])
v



array([200, 300, 400])

In [None]:
[0 ]      
[10 ]
[20 ]

In [None]:
[10 ,11 ,12]
[10, 11, 12]
[10, 11, 12]

In [None]:
[2 2 2 2]

[2 3 4 5]
[1 7 3 5]
[2 8 6 9]

In [None]:
a = 2,4,3


In [327]:
import pandas as pd
df = pd.DataFrame({'roll_no': [1,2,1,3,1,3,3,3,2,2,1,2], 'subject': ['NN','DL','ML','Prob','DL','ML','DL','NN','NN','Prob','Prob','ML']
              ,'marks':[97,63,63,71,64,90,66,46,74,62,94,67] })


max_marks = df[['subject','marks']].groupby(['subject']).max().reset_index()

max_marks.merge(df,on=['subject','marks'])

Unnamed: 0,subject,marks,roll_no
0,DL,66,3
1,ML,90,3
2,NN,97,1
3,Prob,94,1


In [328]:
df = pd.DataFrame({'ord_no':[70001,70009,70002,70004,70007,70005,70008,70010,70003,70012,70011,70013],'purch_amt':[150.5,270.65,65.26,110.5,948.5,2400.6,5760,1983.43,2480.4,250.45, 75.29,3045.6],'ord_date': ['2012-10-05','2012-09-10','2012-10-05','2012-08-17','2012-09-10','2012-07-27','2012-09-10','2012-10-10','2012-10-10','2012-06-27','2012-08-17','2012-04-25'],'customer_id':[3005,3001,3002,3009,3005,3007,3002,3004,3009,3008,3003,3002],'salesman_id': [5002,5005,5001,5003,5002,5001,5001,5006,5003,5002,5007,5001]})
df[['salesman_id','customer_id','ord_no']].groupby(['salesman_id','custo)mer_id']).count().reset_index(

In [340]:
df = pd.DataFrame({'Name': {0: 'Julie Dsouza', 1: 'Kelly Sebastian', 2: 'Daniel Fernandez', 3: 'Julie Dsouza', 4: 'John Jacob'}, 'Product': {0: 'Apple Airpods Headphones', 1: 'Apple Airpods Headphones', 2: '27in 4K Gaming Monitor', 3: 'Bose SoundSport Headphones', 4: 'Wired Headphones'}, 'Quantity Ordered': {0: 1, 1: 1, 2: 1, 3: 1, 4: 2}, 'Price Each': {0: 150.0, 1: 150.0, 2: 389.99, 3: 99.99, 4: 11.99}, 'Order Date': {0: '01/22/19 21:20', 1: '01/24/19 8:13', 2: '01/26/19 12:16', 3: '01/01/19 10:30', 4: '01/17/19 13:33'}, 'Purchase Address': {0: '868 Willow St, Los Angeles, CA 90001', 1: '442 Cedar St, Portland, OR 97035', 2: '741 10th St, Los Angeles, CA 90001', 3: '867 Willow St, Los Angeles, CA 90001', 4: '946 Walnut St, Boston, MA 02215'}})
x = 2
y = 100
df.groupby(['Name']).filter(lambda x: x["Quantity Ordered"].sum()>=x and x["Price Each"].mean()>=y)

TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'

In [461]:
runner_df = pd.DataFrame({'distance': {0: 82,   1: 101,   2: 84,   3: 106,   4: 93,   5: 86,   6: 87,   7: 82,   8: 92,   9: 86,   10: 98,   11: 84},  'runner': {0: 'runner1',   1: 'runner2',   2: 'runner3',   3: 'runner4',   4: 'runner1',   5: 'runner2',   6: 'runner3',   7: 'runner4',   8: 'runner1',   9: 'runner2',   10: 'runner3',   11: 'runner4'}})


In [359]:
runner_df.groupby('runner').transform('mean')


Unnamed: 0,distance,mean
0,89.0,89.0
1,91.0,91.0
2,89.666667,89.666667
3,90.666667,90.666667
4,89.0,89.0
5,91.0,91.0
6,89.666667,89.666667
7,90.666667,90.666667
8,89.0,89.0
9,91.0,91.0


In [357]:
runner_df['runner'][runner_df['distance'] > runner_df['mean']].value_counts()

runner
runner1    2
runner2    1
runner4    1
runner3    1
Name: count, dtype: int64

In [364]:
ORG

In [365]:
df

Unnamed: 0,name,age
0,Ram,kid
1,Shyam,adult
2,Mukesh,senior
3,Suresh,adult


In [382]:
df = pd.DataFrame({
  "Accessories": ["Laptop", "Laptop", "Ipad", "Ipad", "Tablet", "Laptop"],
  "customer": ["Andrew", np.NaN, "Tom", "Andrew", "Tobey", "Peter"],
  "quantity": [1, 2, 2, 3, 1, 2],
})

df.groupby(['Accessories','customer']).quantity.sum()

Accessories  customer
Ipad         Andrew      3
             Tom         2
Laptop       Andrew      1
             Peter       2
Tablet       Tobey       1
Name: quantity, dtype: int64

In [378]:
df[df.isnull().any(axis=1)] #each row

Unnamed: 0,Accessories,customer,quantity


In [379]:
df.drop(df[df.isna().all(axis=1)].index, inplace=True)
df

Unnamed: 0,Accessories,customer,quantity
0,Laptop,Andrew,1
1,Laptop,,2
2,Ipad,Tom,2
3,Ipad,Andrew,3
4,Tablet,Tobey,1
5,Laptop,Peter,2


In [383]:
df.isna().sum().sum()

1

In [387]:
df.loc[[3]]

Unnamed: 0,Accessories,customer,quantity
3,Ipad,Andrew,3


In [388]:
df = pd.DataFrame({'access_id': {0: 'ORG6684',   1: '4564',   2: 'ORG6995',   3: '2130',   4: '5839',   5: 'ORG1281',   6: 'ORG2651',   7: 'ORG9870',   8: 'ORG4089',   9: 'ORG3794'}})

In [397]:
df
df[~(df["access_id"].str[0:3] == 'ORG')]['access_id'].count()

3

In [399]:
df = pd.DataFrame({'name': ['elon', 'suzlon', 'keylon', 'dusk'], 'username': ['user_spaceboyelon_2022', 'user_suzlon123', 'tothemoon123_2022', 'user_duskmusk_2022']})
df

Unnamed: 0,name,username
0,elon,user_spaceboyelon_2022
1,suzlon,user_suzlon123
2,keylon,tothemoon123_2022
3,dusk,user_duskmusk_2022


In [403]:
df[df["username"].str.startswith("user_") & df["username"].str.endswith("_2022")]["name"].count()

2

In [404]:
df = pd.DataFrame({'Date':["2015-12-06", "2011-12-27", "2015-09-07", "2012-12-21", "2020-02-13", "2015-06-09"], 'RID':[498, 721, 375, 464, 813, 853], 'Phy':[22, 45, 1, 65, 22, 17], 'Chem':[52, 56, 32, 50, 24, 61], 'Math':[63, 37, 68, 62, 43 ,42]})

In [407]:
df["Date"] = pd.to_datetime(df["Date"])

In [438]:
df["month"] = df["Date"].dt.month_name().str[0:3]
df.month = df["month"].str.upper()
month = df["month"].value_counts().index[0]

In [437]:
freq = df[df["month"] == df["month"].value_counts().index[0]]["month"].count()

3

In [454]:
df_score = df[df["month"] == month][["Phy","Chem","Math","month"]]

In [460]:
np.round(df_score["Phy"].mean(),2)
np.round(df_score["Chem"].mean(),2)
np.round(df_score["Math"].mean(),2)


54.0

In [469]:
runner_df = pd.DataFrame({'distance': {0: 82,   1: 101,   2: 84,   3: 106,   4: 93,   5: 86,   6: 87,   7: 82,   8: 92,   9: 86,   10: 98,   11: 84},  'runner': {0: 'runner1',   1: 'runner2',   2: 'runner3',   3: 'runner4',   4: 'runner1',   5: 'runner2',   6: 'runner3',   7: 'runner4',   8: 'runner1',   9: 'runner2',   10: 'runner3',   11: 'runner4'}})

runner_df['mean'] = runner_df.groupby("runner").transform('mean')
runner_df[runner_df["distance"] > runner_df["mean"]]["runner"].value_counts()


In [476]:
runner_df[runner_df["distance"] > runner_df["mean"]]["runner"].value_counts()


SyntaxError: invalid syntax (638260391.py, line 1)

In [129]:
import pandas as pd
df = pd.read_csv('data1.csv')
df_new = df[["Invoice","Customer ID", "Quantity","Price"]]
df_new["Purchase_Price"] = df_new["Quantity"] * df_new["Price"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new["Purchase_Price"] = df_new["Quantity"] * df_new["Price"]


In [135]:
df_new[df_new["Purchase_Price"] == max(df_new["Purchase_Price"])]


Unnamed: 0,Invoice,Customer ID,Quantity,Price,Purchase_Price
1065882,581483,16446.0,80995,2.08,168469.6


In [None]:
df[df["Country"]=="United Kingdom"].groupby('StockCode').nunique().reset_index()["StockCode"].count()

In [119]:
mod=df.groupby('Description').sum()
mod[mod['Quantity']==max(mod['Quantity'])].reset_index()["Description"]

0    WORLD WAR 2 GLIDERS ASSTD DESIGNS
Name: Description, dtype: object

In [96]:
df["total_quantity"] = df["count"] * df["Quantity"]


KeyError: 'count'

In [63]:
df.sort_values("total_quantity",ascending=False)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,count,total_quantity
587080,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.04,12346.0,United Kingdom,260.0,19295900.0
686003,550461,85123A,WHITE HANGING HEART T-LIGHT HOLDER,1930,2011-04-18 13:20:00,2.40,15749.0,United Kingdom,5817.0,11226810.0
578170,540815,85123A,WHITE HANGING HEART T-LIGHT HOLDER,1930,2011-01-11 12:55:00,2.55,15749.0,United Kingdom,5817.0,11226810.0
816710,562439,84879,ASSORTED COLOUR BIRD ORNAMENT,2880,2011-08-04 18:06:00,1.45,12931.0,United Kingdom,2958.0,8519040.0
452200,532358,84879,ASSORTED COLOUR BIRD ORNAMENT,2880,2010-11-11 17:05:00,1.45,12931.0,United Kingdom,2958.0,8519040.0
...,...,...,...,...,...,...,...,...,...,...
1060783,581199,84581,,-2,2011-12-07 18:26:00,0.00,,United Kingdom,,
1060787,581203,23406,,15,2011-12-07 18:31:00,0.00,,United Kingdom,,
1060793,581209,21620,,6,2011-12-07 18:35:00,0.00,,United Kingdom,,
1062442,581234,72817,,27,2011-12-08 10:33:00,0.00,,United Kingdom,,


In [181]:
# 22 June 2011 
df["date"] = pd.to_datetime(df["InvoiceDate"])
df[df["date"].dt.date == "2011-06-22"]



Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,date


In [166]:
df_new

Unnamed: 0,Customer ID,InvoiceDate,month_name,date,year
0,13085.0,2009-12-01 07:45:00,December,1,2009
1,13085.0,2009-12-01 07:45:00,December,1,2009
2,13085.0,2009-12-01 07:45:00,December,1,2009
3,13085.0,2009-12-01 07:45:00,December,1,2009
4,13085.0,2009-12-01 07:45:00,December,1,2009
...,...,...,...,...,...
1067366,12680.0,2011-12-09 12:50:00,December,9,2011
1067367,12680.0,2011-12-09 12:50:00,December,9,2011
1067368,12680.0,2011-12-09 12:50:00,December,9,2011
1067369,12680.0,2011-12-09 12:50:00,December,9,2011
