In [None]:
import numpy as np
import pandas as pd

# Operations on Data in Pandas

In [None]:
# Seeded generator: every run of the notebook reproduces the same draws.
rng = np.random.RandomState(seed=42)
# Four random integers from [0, 10) in a Series with the default 0..3 index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [None]:
ser2=pd.Series(rng.randint(0,10,4),index=[1,2,3,4])
ser2

1    6
2    9
3    2
4    6
dtype: int64

In [None]:
ser+ser2

0     NaN
1     9.0
2    16.0
3     6.0
4     NaN
dtype: float64

In [None]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,7,4,3,7
1,7,2,5,4
2,1,7,5,1


In [None]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [None]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-0.707107,1.224647e-16,0.707107,-0.7071068
1,-0.707107,1.0,-0.707107,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068


In [None]:
# Two Series keyed by state name. Their indexes only partially overlap
# (Alaska is missing from `population`, New York from `area`), which is what
# the alignment examples that follow rely on.
area = pd.Series(
    [1723337, 695662, 423967],
    index=['Alaska', 'Texas', 'California'],
    name='area',
)
population = pd.Series(
    [38332521, 26448193, 19651127],
    index=['California', 'Texas', 'New York'],
    name='population',
)

In [None]:
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [None]:
# Union of the two indexes: the labels the result of `population / area` is
# aligned on. Use the explicit set method; the `|` operator on an Index is
# deprecated as a set operation and acts as an element-wise logical OR in
# newer pandas, which fails on string labels.
area.index.union(population.index)


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [None]:
# Two integer Series whose indexes overlap only on labels 1 and 2; written as
# label -> value mappings to make the alignment explicit.
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})
# Labels present in only one operand (0 and 3) come out as NaN.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [None]:
A.add(B)

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [None]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

## Index alignment in DataFrame
A similar type of alignment takes place for both columns and indices when performing operations on DataFrames:

In [None]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,19,14
1,6,11


In [None]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,7,2,0
1,3,1,7
2,3,1,5


In [None]:
A + B

Unnamed: 0,A,B,C
0,21.0,21.0,
1,7.0,14.0,
2,,,


Notice that indices are aligned correctly irrespective of their order in the two objects, and indices in the result are sorted. As was the case with Series, we can use the associated object's arithmetic method and pass any desired fill_value to be used in place of missing entries. Here we'll fill with the mean of all values in A (computed by first stacking the rows of A):

In [None]:
fill=A.stack().mean()

In [None]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,21.0,21.0,12.5
1,7.0,14.0,19.5
2,13.5,15.5,17.5


In [None]:
A.add(B,axis="index",fill_value=fill)

Unnamed: 0,A,B,C
0,21.0,21.0,12.5
1,7.0,14.0,19.5
2,13.5,15.5,17.5


The following table lists Python operators and their equivalent Pandas object methods:

Python Operator	Pandas Method(s)  


|Python Operator|Pandas methods|  
|:---:|:---:|  
|+|	add()|  
|-|	sub(), subtract()|  
|*|	mul(), multiply()|  
|/|	truediv(), div(), divide()|  
|//|	floordiv()|  
|%|	mod()|  
|**|	pow()|  

In [None]:
A.subtract(B,fill_value=0)

Unnamed: 0,A,B,C
0,17.0,7.0,0.0
1,5.0,8.0,-7.0
2,-1.0,-3.0,-5.0


In [None]:
median=A.stack().median()

In [None]:
A.mul(B,fill_value=median)

Unnamed: 0,A,B,C
0,38.0,98.0,0.0
1,6.0,33.0,87.5
2,12.5,37.5,62.5


In [None]:
A.mul(B,axis=0,fill_value=median)

Unnamed: 0,A,B,C
0,38.0,98.0,0.0
1,6.0,33.0,87.5
2,12.5,37.5,62.5


In [None]:
A.truediv(B,fill_value=median)

Unnamed: 0,A,B,C
0,9.5,2.0,inf
1,6.0,3.666667,1.785714
2,12.5,4.166667,2.5


In [None]:
A.div(B,fill_value=median)

Unnamed: 0,A,B,C
0,9.5,2.0,inf
1,6.0,3.666667,1.785714
2,12.5,4.166667,2.5


In [None]:
A.divide(B,fill_value=median)

Unnamed: 0,A,B,C
0,9.5,2.0,inf
1,6.0,3.666667,1.785714
2,12.5,4.166667,2.5


In [None]:
A.floordiv(B,fill_value=median)

Unnamed: 0,A,B,C
0,9.0,2.0,inf
1,6.0,3.0,1.0
2,12.0,4.0,2.0


In [None]:
A.mod(B,fill_value=median)

Unnamed: 0,A,B,C
0,1.0,0.0,
1,0.0,2.0,5.5
2,0.5,0.5,2.5


In [None]:
A.pow(B,fill_value=0)

Unnamed: 0,A,B,C
0,361.0,105413504.0,1.0
1,6.0,1331.0,0.0
2,0.0,0.0,0.0


## Ufuncs: Operations Between DataFrame and Series  
When performing operations between a `DataFrame` and a `Series`, the `index` and `column` alignment is similarly maintained. Operations between a `DataFrame` and a `Series` are similar to operations between a two-dimensional and one-dimensional `NumPy array`. Consider one common operation, where we find the difference of a two-dimensional array and one of its rows:

In [None]:
A = rng.randint(10, size=(3, 4))
A

array([[4, 8, 6, 1],
       [3, 8, 1, 9],
       [8, 9, 4, 1]])

In [None]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-1,  0, -5,  8],
       [ 4,  1, -2,  0]])

According to NumPy's broadcasting rules ([see Computation on Arrays: Broadcasting](https://jakevdp.github.io/PythonDataScienceHandbook/02.05-computation-on-arrays-broadcasting.html)), subtraction between a two-dimensional array and one of its rows is applied row-wise.

In Pandas, the convention similarly operates row-wise by default:

In [None]:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-1,0,-5,8
2,4,1,-2,0


If you would instead like to operate column-wise, you can use the object methods mentioned earlier, while specifying the axis keyword:

In [None]:
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,-4,0,-2,-7
1,-5,0,-7,1
2,-1,0,-5,-8


In [None]:
df

Unnamed: 0,Q,R,S,T
0,4,8,6,1
1,3,8,1,9
2,8,9,4,1


In [None]:
halfrow = df.iloc[0, ::2]
halfrow

Q    4
S    6
Name: 0, dtype: int64

In [None]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-1.0,,-5.0,
2,4.0,,-2.0,


This preservation and alignment of indices and columns means that operations on data in Pandas will always maintain the data context, which prevents the types of silly errors that might come up when working with heterogeneous and/or misaligned data in raw NumPy arrays.

# Operations on String Data in Pandas

In [None]:

# Sample strings for the `.str` accessor demos below: plain names, a name
# containing a space, one with a symbol, a missing value and a digit string.
s = pd.Series(
    data=[
        'Tom',
        'William Rick',
        'John',
        'Alber@t',
        np.nan,
        '1234',
        'SteveSmith',
    ]
)

print(s)

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [None]:
print (s.str.lower())

0             tom
1    william rick
2            john
3         alber@t
4             NaN
5            1234
6      stevesmith
dtype: object


In [None]:
print (s.str.upper())

In [None]:
print (s.str.len())

0     3.0
1    12.0
2     4.0
3     7.0
4     NaN
5     4.0
6    10.0
dtype: float64


In [None]:
print (s.str.strip())

0             Tom
1    William Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [None]:
print (s.str.split())

0              [Tom]
1    [William, Rick]
2             [John]
3          [Alber@t]
4                NaN
5             [1234]
6       [SteveSmith]
dtype: object


In [None]:
print (s.str.cat(sep="_"))

Tom_William Rick_John_Alber@t_1234_SteveSmith


In [None]:
print (s.str.get_dummies())

   1234  Alber@t  John  SteveSmith  Tom  William Rick
0     0        0     0           0    1             0
1     0        0     0           0    0             1
2     0        0     1           0    0             0
3     0        1     0           0    0             0
4     0        0     0           0    0             0
5     1        0     0           0    0             0
6     0        0     0           1    0             0


In [None]:
print (s.str.get_dummies().sum())

1234            1
Alber@t         1
John            1
SteveSmith      1
Tom             1
William Rick    1
dtype: int64


In [None]:
print (s.str.contains(' '))

0    False
1     True
2    False
3    False
4      NaN
5    False
6    False
dtype: object


In [None]:
# Literal (non-regex) replacement of every 'a' with '@'. `regex=False` is
# spelled out because the default for `regex` differs between pandas versions.
print(s.str.replace('a', '@', regex=False))

0             Tom
1    Willi@m Rick
2            John
3         Alber@t
4             NaN
5            1234
6      SteveSmith
dtype: object


In [None]:
print (s.str.repeat(3))

0                               TomTomTom
1    William RickWilliam RickWilliam Rick
2                            JohnJohnJohn
3                   Alber@tAlber@tAlber@t
4                                     NaN
5                            123412341234
6          SteveSmithSteveSmithSteveSmith
dtype: object


In [None]:
print (s.str.count('m'))

In [None]:
print (s.str.startswith('A'))

0    False
1    False
2    False
3     True
4      NaN
5    False
6    False
dtype: object


In [None]:
print (s.str.endswith('n'))

0    False
1    False
2     True
3    False
4      NaN
5    False
6    False
dtype: object


In [None]:
print (s.str.findall('m'))

0    [m]
1    [m]
2     []
3     []
4    NaN
5     []
6    [m]
dtype: object


In [None]:
print (s.str.swapcase())

0             tOM
1    wILLIAM rICK
2            jOHN
3         aLBER@T
4             NaN
5            1234
6      sTEVEsMITH
dtype: object


In [None]:
print (s.str.islower())

0    False
1    False
2    False
3    False
4      NaN
5    False
6    False
dtype: object


In [None]:
print (s.str.isupper())

In [None]:
print (s.str.isnumeric())

0    False
1    False
2    False
3    False
4      NaN
5     True
6    False
dtype: object


In [None]:
print (s.str.isalnum())

0     True
1    False
2     True
3    False
4      NaN
5     True
6     True
dtype: object


In [None]:
s.str.center(10)

0         Tom    
1    William Rick
2         John   
3       Alber@t  
4             NaN
5         1234   
6      SteveSmith
dtype: object

In [None]:
s.str.count('a')

0    0.0
1    1.0
2    0.0
3    0.0
4    NaN
5    0.0
6    0.0
dtype: float64

In [None]:
# `str.extract` requires a regex with at least one capture group; calling it
# with no pattern raises TypeError. Here the group captures the first run of
# ASCII letters in each string; rows with no match (or NaN) yield NaN.
s.str.extract(r'([A-Za-z]+)')

In [None]:
s.str.find('a')

0   -1.0
1    5.0
2   -1.0
3   -1.0
4    NaN
5   -1.0
6   -1.0
dtype: float64

In [None]:
# Take characters 1..4 of every string. A slice literal cannot be written
# inside a list (`[1:5]` in `get([...])` is a SyntaxError) and `str.get`
# takes a single position, so use `str.slice` (equivalent to `s.str[1:5]`).
s.str.slice(1, 5)


2. For every URLH that appears in both crawls, calculate the price difference (based on `available_price`), if any, between yesterday's and today's crawls (scraped data). A URLH may be duplicated within a crawl; in that case use the first valid record (the first one with `http_status` 200).

# URL hashes seen in each crawl, in record order (duplicates kept).
today_urlh = [record['urlh'] for record in status_list_today]
yesterday_urlh = [record['urlh'] for record in status_list_yesterday]
# URL hashes present in both crawls; the set intersection removes duplicates.
overlapped_urlh = list(set(today_urlh).intersection(yesterday_urlh))

In [None]:
# Price difference (today - yesterday) for every URL hash found in both crawls.
#
# Inputs (defined in earlier cells): `status_list_today` and
# `status_list_yesterday`, iterables of record dicts that each carry a 'urlh'
# key and a price key.
# NOTE(review): the lists are presumably already filtered to http_status == 200
# records, as the original comment in this cell stated -- confirm upstream.
# NOTE(review): the original used two different spellings for the price key
# ('Availabale_price' / 'Available_price'); the task statement names it
# `available_price`, which is used here -- confirm against the crawl schema.
PRICE_KEY = 'available_price'


def _first_record_by_urlh(records):
    """Map each urlh to the FIRST record carrying it (later duplicates are ignored)."""
    first_seen = {}
    for record in records:
        first_seen.setdefault(record['urlh'], record)
    return first_seen


today_urlh = [record['urlh'] for record in status_list_today]
yesterday_urlh = [record['urlh'] for record in status_list_yesterday]

today_by_urlh = _first_record_by_urlh(status_list_today)
yesterday_by_urlh = _first_record_by_urlh(status_list_yesterday)

# URL hashes present in both crawls, in order of first appearance today.
overlapped_urlh = [urlh for urlh in today_by_urlh if urlh in yesterday_by_urlh]

# Records selected for the comparison: one per overlapping urlh and crawl.
updated_today_list = [today_by_urlh[urlh] for urlh in overlapped_urlh]
updated_yesterday_list = [yesterday_by_urlh[urlh] for urlh in overlapped_urlh]

# urlh -> today's price minus yesterday's price.
dict_ = {}
for today_rec, yesterday_rec in zip(updated_today_list, updated_yesterday_list):
    urlh = today_rec['urlh']
    # NOTE(review): float() assumes prices may be stored as numeric strings --
    # confirm; non-numeric values will raise ValueError/TypeError here.
    price_diff = float(today_rec[PRICE_KEY]) - float(yesterday_rec[PRICE_KEY])
    print(f'price diff for urlh {urlh} from todays available prime and yesterdays availabale priece is: ', price_diff)
    dict_[urlh] = price_diff



In [None]:
# Titles of the 10 most expensive and the 10 cheapest items in today's crawl.
#
# `today_list` (defined in an earlier cell) is an iterable of record dicts with
# 'price' and 'title' keys; prices are converted with int() for ordering.
# The records themselves are sorted, instead of sorting bare prices and then
# searching the records again by `str(price)`. That lookup printed duplicates
# when prices tied, could print more than 10 titles, and missed prices whose
# text does not round-trip through int() (e.g. "0100").
records_by_price = sorted(today_list, key=lambda record: int(record['price']))

prices = [int(record['price']) for record in records_by_price]
max_10 = prices[-10:]
min_10 = prices[:10]

# Titles with max prices (ascending, ending with the most expensive):
for record in records_by_price[-10:]:
    print("Title", record['title'], "--> Price: ", int(record['price']))
# Titles with min prices (ascending, starting with the cheapest):
for record in records_by_price[:10]:
    print("Title", record['title'], "--> Price: ", int(record['price']))
 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fcf2399d-084b-4173-af36-20a4a45218a8' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>