# From SQL to pandas challenge 11

In [None]:
# import libraries
import pandas as pd

# load data
# This code is made to load our data stored on Google Drive
def gd_path(file_id):
    """Build the Google Drive direct-download URL for a file id.

    Parameters
    ----------
    file_id
        Identifier of the file on Google Drive; it is inserted verbatim
        into the ``id`` query parameter.

    Returns
    -------
    str
        URL of the form ``https://drive.google.com/uc?export=download&id=<file_id>``.
    """
    url_template = "https://drive.google.com/uc?export=download&id={}"
    return url_template.format(file_id)

# Google Drive file id of each table, keyed by table name
files_id = dict(
    titleauthor="1F1JOiYXStWacOBca6coNVfyVtoST7ZgD",
    titles="1PLdn50N9GRa53ZbuVWo0l47F_IXdvlEm",
    sales="1fzFc9rwYmVIPaGOFmhLVxCi3kg19vNU2",
    roysched="1zPRZPoFPEMKyrNR5VSENeYFHGCBZmxbs",
    publishers="1s9E8_AVOziTrowb3wyh2jg3PV763VOyq",
    employee="1h9mUjsVqpP74b1w0x7KOw37n_n9Ulkt5",
    authors="1fEF89Nhe61EebAljKlwFwfEuokK0o6aJ",
    stores="1f-GCgip7O93CpbAkYvOsc21eKnSOSHsQ",
)

# Read data from Google Drive
def _read_table(table_name):
    """Download the named table from Google Drive as a ';'-separated CSV."""
    return pd.read_csv(gd_path(files_id[table_name]), sep=";")


# One DataFrame per table, loaded in the same order as before
sales = _read_table("sales")
titles = _read_table("titles")
publishers = _read_table("publishers")
employee = _read_table("employee")
authors = _read_table("authors")
titleauthor = _read_table("titleauthor")
roysched = _read_table("roysched")
stores = _read_table("stores")

## 1. Using LEFT JOIN: in which cities has "Is Anger the Enemy?" been sold?

In [None]:
# Attach store details and book titles to every sale, then keep only the
# rows for "Is Anger the Enemy?" with the quantity, store name and city.
(
    pd.merge(sales, stores[["stor_id", "stor_name", "city"]], how="left", on="stor_id")
    .merge(titles[["title_id", "title"]], how="left", on="title_id")
    .loc[
        lambda joined: joined["title"] == "Is Anger the Enemy?",
        ["qty", "stor_name", "city"],
    ]
)

Unnamed: 0,qty,stor_name,city
1,3,Eric the Read Books,Seattle
3,75,Barnum's,Tustin
4,10,News & Brews,Los Gatos
8,20,Doc-U-Mat: Quality Laundry and Books,Remulade


In [None]:
# Short aliases for the two tables being joined
p = publishers
t = titles

# Left join keeps every publisher row, matched to its titles on pub_id
published_titles = pd.merge(p, t, how="left", on="pub_id")

# Show the title together with the city column for the requested book
published_titles.loc[
    published_titles["title"].eq("Is Anger the Enemy?"),
    ["title", "city"],
]

Unnamed: 0,title,city
1,Is Anger the Enemy?,Boston


### Hint:

In SQL the syntax is:

```sql
SELECT p.city
FROM publishers AS p
LEFT JOIN titles AS t
ON p.pub_id = t.pub_id
WHERE t.title = 'Is Anger the Enemy?';
```

## 2. Select all the book titles that have a link to the employee Howard Snyder

(He works for the publisher that has published those books.)

In [None]:
# Pair every employee with the titles that share their pub_id
hs = pd.merge(employee, titles, how="left", on="pub_id")

# Keep only Howard Snyder's rows and the columns of interest
hs.query('fname == "Howard" and lname == "Snyder"')[["title", "type", "fname", "lname"]]

Unnamed: 0,title,type,fname,lname
56,You Can Combat Computer Stress!,business,Howard,Snyder
57,Is Anger the Enemy?,psychology,Howard,Snyder
58,Life Without Fear,psychology,Howard,Snyder
59,Prolonged Data Deprivation: Four Case Studies,psychology,Howard,Snyder
60,Emotional Security: A New Algorithm,psychology,Howard,Snyder


### Hint:

In SQL the syntax is:

```sql
SELECT t.title
FROM employee e
JOIN titles t
ON e.pub_id = t.pub_id
WHERE e.fname = 'Howard'
AND e.lname = 'Snyder';
```

## 3. Using the `merge` of your choice: Select the book title with highest number of sales (qty)

In [None]:
# Add the title information to every sales row
sal_titl = pd.merge(sales, titles, on="title_id", how="left")

# Total quantity per title, then keep the single best seller
sal_titl.groupby("title")[["qty"]].sum().nlargest(1, "qty")

Unnamed: 0_level_0,qty
title,Unnamed: 1_level_1
Is Anger the Enemy?,108


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, SUM(qty)
FROM sales AS s 
JOIN titles t
ON s.title_id = t.title_id
GROUP BY t.title_id
ORDER BY SUM(qty) desc
LIMIT 1;
```

## 4. Select all book titles and the full name of their author(s).

- If a book has multiple authors, all authors must be displayed (in
  multiple rows).

- Books with no authors and authors with no books should not be displayed.

In [None]:
# Inner joins drop titles without authors and authors without titles
df = (
    titles
    .merge(titleauthor, how="inner", on="title_id")
    .merge(authors, how="inner", on="au_id")
    .loc[:, ["title", "au_fname", "au_lname"]]
)
df

Unnamed: 0,title,au_fname,au_lname
0,The Busy Executive's Database Guide,Marjorie,Green
1,You Can Combat Computer Stress!,Marjorie,Green
2,The Busy Executive's Database Guide,Abraham,Bennet
3,Cooking with Computers: Surreptitious Balance ...,Michael,O'Leary
4,"Sushi, Anyone?",Michael,O'Leary
5,Cooking with Computers: Surreptitious Balance ...,Stearns,MacFeather
6,Computer Phobic AND Non-Phobic Individuals: Be...,Stearns,MacFeather
7,Straight Talk About Computers,Dean,Straight
8,Silicon Valley Gastronomic Treats,Innes,del Castillo
9,The Gourmet Microwave,Michel,DeFrance


### Hint:

In SQL the syntax is:

```sql
SELECT
    t.title,
    a.au_fname,
    a.au_lname
FROM titles t
INNER JOIN titleauthor ta 
ON t.title_id = ta.title_id
INNER JOIN authors a 
ON ta.au_id = a.au_id;
```

## 5. Select the full name of authors of Psychology books

   Bonus hint: if you want to prevent duplicates but allow authors with shared
   last names to be displayed, you can concatenate the first and last names
   with CONCAT(), and use the DISTINCT clause on the concatenated names.

In [None]:
# Link every author to the titles they wrote (default inner joins)
au_tita_tit = authors.merge(titleauthor, on='au_id').merge(titles, on='title_id')

# Keep psychology books only. The two name columns are selected by label
# instead of by position (previously `.iloc[:, 1:3]`), so the cell no longer
# depends on the column order of the authors CSV; only these two columns are
# used below.
au_tita_tit_psy = au_tita_tit.loc[
    au_tita_tit['type'] == 'psychology',
    ['au_fname', 'au_lname'],
]

# "First Last" for every (author, psychology title) pair
authors_full_name = au_tita_tit_psy["au_fname"] + " " + au_tita_tit_psy["au_lname"]

# An author with several psychology titles appears once
authors_full_name.drop_duplicates()

0          Johnson White
14     Charlene Locksley
19    Stearns MacFeather
20          Livia Karsen
22           Anne Ringer
23         Albert Ringer
dtype: object

In [None]:
# Authors joined to their titles, with a combined "First Last" column
abt = (
    pd.merge(authors, titleauthor, how="inner", on="au_id")
    .merge(titles, how="inner", on="title_id")
    .assign(full_name=lambda joined: joined["au_fname"] + " " + joined["au_lname"])
)

# Rows whose book type is psychology
psy_aut = abt.loc[abt["type"] == "psychology"]

# Distinct author names, in order of first appearance
psy_aut["full_name"].unique()

array(['Johnson White', 'Charlene Locksley', 'Stearns MacFeather',
       'Livia Karsen', 'Anne Ringer', 'Albert Ringer'], dtype=object)

### Hint:

In SQL the syntax is:

```sql
SELECT DISTINCT CONCAT(a.au_fname, " ", a.au_lname) AS full_name
FROM authors a
INNER JOIN titleauthor ta ON a.au_id = ta.au_id
INNER JOIN titles t ON ta.title_id = t.title_id
WHERE t.type = "Psychology";
```

## 6. Explore the table roysched and try to grasp the meaning of each column. 
   The notes below will help:
   
   - "Royalty" means the percentage of the sale price paid to the author(s).
   
   - Sometimes, the royalty may be smaller for the first few sales (which have
     to cover the publishing costs to the publisher) but higher for the sales 
     above a certain threshold.
     
   - In the "roysched" table each title_id can appear multiple times, with
     different royalty values for each range of sales.
     
   - Select all rows for a particular title_id, for example "BU1111", and explore
     the data.

### Hint:

In SQL the syntax is:

```sql
SELECT * FROM roysched WHERE title_id = "BU1111";
```

## 7. Select all the book titles and the maximum royalty they can reach.

Display only titles that are present in the roysched table.

In [None]:
# Short alias for the royalty schedule table
r = roysched

# Join on the `titles` table directly instead of the alias `t`, which was
# created in an unrelated earlier cell; this cell now only depends on the
# data-loading cell. The inner join keeps only titles present in roysched.
royalty_for_titles = titles.merge(r, on='title_id', how='inner')

# `royalty_y` is the merge-suffixed royalty column; presumably both tables
# have a `royalty` column and `_y` marks the one from roysched (confirm with
# `royalty_for_titles.columns`).
(
    royalty_for_titles
    .groupby(["title"])
    .agg({"royalty_y": "max"})
    .sort_values(by='royalty_y', ascending=False)
)

Unnamed: 0_level_0,royalty_y
title,Unnamed: 1_level_1
Cooking with Computers: Surreptitious Balance Sheets,24
"Onions, Leeks, and Garlic: Cooking Secrets of the Mediterranean",24
Straight Talk About Computers,24
The Gourmet Microwave,24
You Can Combat Computer Stress!,24
Fifty Years in Buckingham Palace Kitchens,22
Silicon Valley Gastronomic Treats,20
But Is It User Friendly?,18
Computer Phobic AND Non-Phobic Individuals: Behavior Variations,18
Is Anger the Enemy?,16


### Hint:

In SQL the syntax is:

```sql
SELECT t.title, MAX(r.royalty) max_royalty
FROM titles t
INNER JOIN roysched r 
ON t.title_id = r.title_id
GROUP BY t.title
ORDER BY max_royalty DESC;
```