# 30 Days of Pandas - Extra Questions

In [1]:
import pandas as pd

### 2738. Count Occurences in Text

Write a solution to find the number of files that have at least one occurrence of the words 'bull' and 'bear' as a standalone word, respectively, disregarding any instances where it appears without space on either side (e.g. 'bullet', 'bears', 'bull.', or 'bear' at the beginning or end of a sentence will not be considered) 

Return the word 'bull' and 'bear' along with the corresponding number of occurrences in any order.

##### Output
| word | count |
|------|-------|
| bull | 3     |
| bear | 2     |


In [25]:
data = {
    'file_name': ['draft1.txt', 'draft2.txt', 'draft3.txt'],
    'content': [
        'The stock exchange predicts a bull market which would make many investors happy.',
        ('The stock exchange predicts a bull market which would make many investors happy, '
         'but analysts warn of possibility of too much optimism and that in fact we are '
         'awaiting a bear market.'),
        ('The stock exchange predicts a bull market which would make many investors happy, '
         'but analysts warn of possibility of too much optimism and that in fact we are '
         'awaiting a bear market. As always predicting the future market is an uncertain '
         'game and all investors should follow their instincts and best practices.')
    ]
}

# Create the DataFrame
pd.set_option('display.max_colwidth', None)
files_df = pd.DataFrame(data)
files_df

Unnamed: 0,file_name,content
0,draft1.txt,The stock exchange predicts a bull market which would make many investors happy.
1,draft2.txt,"The stock exchange predicts a bull market which would make many investors happy, but analysts warn of possibility of too much optimism and that in fact we are awaiting a bear market."
2,draft3.txt,"The stock exchange predicts a bull market which would make many investors happy, but analysts warn of possibility of too much optimism and that in fact we are awaiting a bear market. As always predicting the future market is an uncertain game and all investors should follow their instincts and best practices."


In [26]:
# note: \b is a word boundary in regular expressions. It matches positions where a word starts or ends, 
# ensuring "bull" and "bear" are matched only when they appear as standalone words (e.g., not as part of "bullet" or "bears").

# find files with 'bull' and 'bear' as a standalone word, we use regular expression
bull_count = files_df['content'].str.contains(r'\bbull\b', regex=True).sum()
bear_count = files_df['content'].str.contains(r'\bbear\b', regex=True).sum()

# display the result
word_df = pd.DataFrame({
    'word' : ['bull', 'bear'],
    'count' : [bull_count, bear_count]})

word_df

Unnamed: 0,word,count
0,bull,3
1,bear,2


### 2082. The Number of Rich Customers

Write a solution to report the number of customers who had at least one bill with an amount strictly greater than 500.

##### Output
| rich_count |
|------------|
| 2          |

In [2]:
data = {
    'bill_id': [6, 8, 4, 11, 13],
    'customer_id': [1, 1, 2, 3, 3],
    'amount': [549, 834, 394, 657, 257]
}

# Create the DataFrame
store_df = pd.DataFrame(data)
store_df

Unnamed: 0,bill_id,customer_id,amount
0,6,1,549
1,8,1,834
2,4,2,394
3,11,3,657
4,13,3,257


In [3]:
# get the total amount paid by customers
customer_totals = store_df.groupby('customer_id')['amount'].sum().reset_index()

# filter the bill amount strictly greater than 500
rich_customers = customer_totals[customer_totals['amount'] > 500]

# print as per the required output
pd.DataFrame({'rich_count': [rich_customers.shape[0]]})

Unnamed: 0,rich_count
0,2


### 1173. Immediate Food Delivery
If the customer’s preferred delivery date is the same as the order date, then the order is called immediate; otherwise, it is called scheduled. Write a solution to find the percentage of immediate orders in the table, rounded to 2 decimal places. The result format is in the following example.

##### Output
| immediate_percentage |
|----------------------|
| 33.33                |

In [79]:
data = {
    'delivery_id': [1, 2, 3, 4, 5, 6],
    'customer_id': [1, 5, 1, 3, 4, 2],
    'order_date': pd.to_datetime(['2019-08-01', '2019-08-02', '2019-08-11', '2019-08-24', '2019-08-21', '2019-08-11']),
    'customer_pref_delivery_date': pd.to_datetime(['2019-08-02', '2019-08-02', '2019-08-11', '2019-08-26', '2019-08-22', '2019-08-13'])
}

# Create the DataFrame
delivery_df = pd.DataFrame(data)
delivery_df

Unnamed: 0,delivery_id,customer_id,order_date,customer_pref_delivery_date
0,1,1,2019-08-01,2019-08-02
1,2,5,2019-08-02,2019-08-02
2,3,1,2019-08-11,2019-08-11
3,4,3,2019-08-24,2019-08-26
4,5,4,2019-08-21,2019-08-22
5,6,2,2019-08-11,2019-08-13


In [102]:
# initial approach

# find the count of immediate orders
immediate_orders = delivery_df[delivery_df['order_date'] == delivery_df['customer_pref_delivery_date']].reset_index()
immediate_orders_count = immediate_orders['customer_pref_delivery_date'].count()

# find the total order count
total_orders = delivery_df['customer_pref_delivery_date'].count()

# calculate percentage
pd.DataFrame({'immediate_percentage' : [((immediate_orders_count/total_orders)*100).round(2)]})

Unnamed: 0,immediate_percentage
0,33.33


In [105]:
# another approach suing mean()

immediate_percentage = (delivery_df['order_date'] == delivery_df['customer_pref_delivery_date']).mean()
pd.DataFrame({'immediate_percentage' : [immediate_percentage.round(2)]})

Unnamed: 0,immediate_percentage
0,0.33


### 1322 Ads Performance

A company is running Ads and wants to calculate the performance of each Ad. Performance of the Ad is measured using Click-Through Rate (CTR) where:

\
CTR = 
\begin{cases} 
0, & \text{if Ad total clicks + Ad total views = 0} \\
\frac{\text{Ad total clicks}}{\text{Ad total clicks} + \text{Ad total views}} \times 100, & \text{otherwise}
\end{cases}



Write a solution to find the ctr of each Ad. Round ctr to two decimal points. Return the result table ordered by ctr in descending order and ad_id in ascending order in case of a tie

##### Output
| ad_id | ctr   |
|-------|-------|
| 1     | 66.67 |
| 3     | 50.00 |
| 2     | 33.33 |
| 5     | 0.00  |


In [136]:
data = {
    'ad_id': [1, 2, 3, 5, 1, 2, 3, 1, 2, 1],
    'user_id': [1, 2, 3, 5, 7, 7, 5, 4, 11, 2],
    'action': ['Clicked', 'Clicked', 'Viewed', 'Ignored', 'Ignored', 'Viewed', 'Clicked', 'Viewed', 'Viewed', 'Clicked']
}

# Creating the DataFrame
ads_df = pd.DataFrame(data)
ads_df

Unnamed: 0,ad_id,user_id,action
0,1,1,Clicked
1,2,2,Clicked
2,3,3,Viewed
3,5,5,Ignored
4,1,7,Ignored
5,2,7,Viewed
6,3,5,Clicked
7,1,4,Viewed
8,2,11,Viewed
9,1,2,Clicked


In [128]:
# pivot the table
ads_df = ads_df.pivot_table(
    index = 'ad_id',
    columns='action',
    aggfunc = 'size',
    fill_value = 0).reset_index()

# calculate the CTR
ads_df['ctr'] = ((ads_df.get('Clicked', 0) / 
                      (ads_df.get('Clicked', 0) + ads_df.get('Viewed', 0))) * 100).round(2)

# fill in the missing values
ads_df = ads_df[['ad_id', 'ctr']].fillna(0)

# display the result
ads_df.sort_values(by=['ctr', 'ad_id'], ascending=[False, True])

action,ad_id,ctr
0,1,66.67
2,3,50.0
1,2,33.33
3,5,0.0


In [139]:
# another approach

# Count the total clicks and views for each ad_id
clicks = ads_df[ads_df['action'] == 'Clicked'].groupby('ad_id').size()
views = ads_df[ads_df['action'] == 'Viewed'].groupby('ad_id').size()
ignores = ads_df[ads_df['action'] == 'Ignored'].groupby('ad_id').size()

# Create a DataFrame with all ad_ids and their corresponding clicks and views
ctr_df = pd.DataFrame({
    'total_clicks': clicks,
    'total_views': views,
    'total_ignores' : ignores
}).fillna(0)  # Fill NaNs with 0 where there are no clicks or views

# Calculate CTR for each ad_id
ctr_df['ctr'] = ((ctr_df['total_clicks'] / (ctr_df['total_clicks'] + ctr_df['total_views'])) * 100).round(2)

# Reset index to make 'ad_id' a column
ctr_df = ctr_df.reset_index()

# Fill CTR as 0 where there are no clicks and views (if any)
ctr_df['ctr'] = ctr_df['ctr'].fillna(0)

# Display the result
ctr_df.sort_values(by=['ctr', 'ad_id'], ascending=[False, True])[['ad_id', 'ctr']]

Unnamed: 0,ad_id,ctr
0,1,66.67
2,3,50.0
1,2,33.33
3,5,0.0
