# Pandas exercise

## Dependencies

In [8]:
import pandas as pd

## Constants

In [9]:
# Relative path of the directory holding the exercise CSV files
# (similarity_scores.csv and types.csv are read from it below).
DATASET_PATH = "../dataset/pandas-exercise"

## Read Data

In [10]:
import pandas as pd

# Pairwise similarity scores. The ids are read as strings so that values such
# as "0001" keep their leading zeros; the score is parsed as a float.
df = pd.read_csv(
    f"{DATASET_PATH}/similarity_scores.csv",
    dtype={"prod": str, "reco": str, "score": float},
)

# Product -> type lookup; every column is read as a string.
df_types = pd.read_csv(
    f"{DATASET_PATH}/types.csv",
    dtype=str,
)

```
similarity_scores.csv
prod,reco,score
0001,0002,0.5679236176242679
0001,0003,0.49684981277365553
0001,0004,0.6356807392029673
0001,0005,0.36400725936301465
...
```

```
types.csv
prod,type
0001,type_4
0002,type_1
0003,type_5
0004,type_5
0005,type_3
```

## Exercises

### Exercise 1
For each product, find the top k most similar products, where k is a parameter (fix it to 100 for this exercise). The products in each list should be sorted by similarity score in descending order.

In [11]:
def get_top_k_recommendations(df: pd.DataFrame, k: int) -> pd.DataFrame:
    """Collect, for every product, its ``k`` highest-scoring recommendations.

    Args:
        df: Pairwise scores with the columns ``prod``, ``reco`` and ``score``.
        k: Maximum number of recommendations to keep per product. A product
            with fewer than ``k`` rows keeps all of them.

    Returns:
        A frame with one row per distinct ``prod`` (sorted by ``prod``), an
        index named ``id`` and the columns ``prod`` and ``recos``. ``recos``
        is the list of ``reco`` values ordered by descending ``score``.
    """
    # A single stable sort followed by a per-group ``head`` avoids
    # ``groupby.apply``: that form runs a Python lambda once per product and
    # makes recent pandas versions warn that ``apply`` operated on the
    # grouping columns. The stable sort keeps tied scores in input order.
    ranked = df.sort_values("score", ascending=False, kind="stable")

    # ``head`` with a negative count would drop rows from the end of each
    # group, so clamp to keep "k <= 0" meaning "no recommendations".
    top_k = ranked.groupby("prod", sort=False).head(max(k, 0))

    # groupby preserves row order inside each group, so every list stays in
    # descending-score order while the products themselves come out sorted.
    return (
        top_k.groupby("prod")["reco"]
        .agg(list)
        .reset_index(name="recos")
        .rename_axis("id")
    )

# Exercise 1 fixes the number of recommendations per product to 100.
k = 100
df_recos = get_top_k_recommendations(df, k)
# Bare last expression so the notebook renders the resulting frame.
df_recos

  df_recos = df.groupby(["prod"]).apply(lambda x: x.nlargest(k, "score"))["reco"]


Unnamed: 0_level_0,prod,recos
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0001,"[0476, 0718, 0676, 0140, 0155, 0306, 0192, 044..."
1,0002,"[0334, 0592, 0728, 0053, 0828, 0518, 0430, 054..."
2,0003,"[0452, 0034, 0186, 0950, 0028, 0123, 0125, 082..."
3,0004,"[0806, 0516, 0854, 0957, 0198, 0484, 0637, 070..."
4,0005,"[0907, 0302, 0172, 0905, 0734, 0690, 0017, 051..."
...,...,...
995,0996,"[0534, 0481, 0571, 0723, 0613, 0962, 0707, 052..."
996,0997,"[0281, 0193, 0917, 0890, 0423, 0875, 0579, 015..."
997,0998,"[0778, 0542, 0155, 0010, 0054, 0938, 0605, 081..."
998,0999,"[0289, 0112, 0074, 0263, 0221, 0810, 0284, 077..."


#### Expected Output
```
id   prod	recos
0	0001	[0476, 0718, 0676, 0140, 0155, 0306, 0192, 044...
1	0002	[0334, 0592, 0728, 0053, 0828, 0518, 0430, 054...
2	0003	[0452, 0034, 0186, 0950, 0028, 0123, 0125, 082...
3	0004	[0806, 0516, 0854, 0957, 0198, 0484, 0637, 070...
4	0005	[0907, 0302, 0172, 0905, 0734, 0690, 0017, 051...
...	...	...
995	0996	[0534, 0481, 0571, 0723, 0613, 0962, 0707, 052...
996	0997	[0281, 0193, 0917, 0890, 0423, 0875, 0579, 015...
997	0998	[0778, 0542, 0155, 0010, 0054, 0938, 0605, 081...
998	0999	[0289, 0112, 0074, 0263, 0221, 0810, 0284, 077...
999	1000	[0645, 0589, 0421, 0426, 0787, 0232, 0740, 069...
1000 rows × 2 columns
```

### Exercise 2
Filter out from the previous dataframe those recommendations whose type differs from the product's type, while keeping the order of similarity scores. The dataframe should have the same number of rows as the output from exercise 1.

In [12]:
def filter_recos_by_type(
    df_recos: pd.DataFrame, df_types: pd.DataFrame, keep_empty: bool = False
) -> pd.DataFrame:
    """Keep only the recommendations that share their product's type.

    Args:
        df_recos: One row per product, with the columns ``prod`` and ``recos``
            (a list of recommended product ids).
        df_types: Product-to-type lookup with the columns ``prod`` and
            ``type``. If a product is listed more than once, its first row is
            used.
        keep_empty: When ``False`` (the default), products left without any
            same-type recommendation are dropped and the rows come out sorted
            by ``prod``. When ``True``, every product of ``df_recos`` is
            returned, in ``df_recos`` order, with an empty list where nothing
            matched, so the row count equals that of ``df_recos``.

    Returns:
        A frame with an index named ``id`` and the columns ``prod`` and
        ``recos``. The surviving recommendations keep their original relative
        order inside each list.
    """
    # Look types up per row rather than aligning the result of a second merge
    # by position: an inner merge drops recommendations that are missing from
    # ``df_types``, which shifts its row numbers and attaches types to the
    # wrong recommendations.
    type_by_prod = df_types.drop_duplicates(subset="prod").set_index("prod")["type"]

    # One row per (product, recommendation) pair, in list order.
    pairs = df_recos.explode("recos").reset_index(drop=True)
    prod_type = pairs["prod"].map(type_by_prod)
    reco_type = pairs["recos"].map(type_by_prod)

    # Ids without a known type never count as a match.
    same_type = prod_type.eq(reco_type) & prod_type.notna()

    # Regroup; groupby keeps the row order inside each product's list.
    kept = pairs[same_type].groupby("prod")["recos"].agg(list)

    if keep_empty:
        # Re-add the products that lost every recommendation, as empty lists.
        all_prods = pd.Index(df_recos["prod"].unique(), name="prod")
        kept = kept.reindex(all_prods).map(
            lambda recos: recos if isinstance(recos, list) else []
        )

    return kept.rename_axis("prod").reset_index(name="recos").rename_axis("id")

# Exercise 2: keep only the recommendations whose type matches the product's.
filter_recos_by_type(df_recos, df_types)

Unnamed: 0_level_0,prod,recos
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0001,"[0476, 0718, 0140, 0446, 0198, 0412, 0406, 005..."
1,0002,"[0728, 0545, 0091, 0642, 0473, 0124, 0729, 020..."
2,0003,"[0821, 0499, 0114, 0572, 0973, 0046, 0411, 051..."
3,0004,"[0484, 0953, 0042, 0653, 0705, 0337, 0132, 077..."
4,0005,"[0302, 0172, 0017, 0621, 0068, 0984, 0018, 068..."
...,...,...
995,0996,"[0481, 0962, 0588, 0974, 0286, 0212, 0060, 037..."
996,0997,"[0281, 0917, 0217, 0260, 0136, 0139, 0141, 001..."
997,0998,"[0542, 0155, 0223, 0219, 0825, 0612, 0092, 010..."
998,0999,"[0284, 0546, 0212, 0915, 0459, 0896, 0274, 040..."


#### Expected Output
```
id prod	recos
0	0001	[0476, 0718, 0140, 0446, 0198, 0412, 0406, 005...
1	0002	[0728, 0545, 0091, 0642, 0473, 0124, 0729, 020...
2	0003	[0821, 0499, 0114, 0572, 0973, 0046, 0411, 051...
3	0004	[0484, 0953, 0042, 0653, 0705, 0337, 0132, 077...
4	0005	[0302, 0172, 0017, 0621, 0068, 0984, 0018, 068...
...	...	...
995	0996	[0481, 0962, 0588, 0974, 0286, 0212, 0060, 037...
996	0997	[0281, 0917, 0217, 0260, 0136, 0139, 0141, 001...
997	0998	[0542, 0155, 0223, 0219, 0825, 0612, 0092, 010...
998	0999	[0284, 0546, 0212, 0915, 0459, 0896, 0274, 040...
999	1000	[0645, 0232, 0869, 0888, 0954, 0621, 0931, 084...
1000 rows × 2 columns
```

### Exercise 3
Fix k to 3 in the first exercise, run everything again, and now remove rows where filtered recommendations are empty.

In [13]:
def filter_empty_recommendations(df_recos: pd.DataFrame) -> pd.DataFrame:
    """Type-filter the recommendations and drop products left with none.

    The type lookup is the notebook-level ``df_types`` frame, so the cell that
    loads it must have been run first.

    Args:
        df_recos: One row per product, with the columns ``prod`` and ``recos``
            (a list of recommended product ids).

    Returns:
        The frame produced by ``filter_recos_by_type`` without the rows whose
        ``recos`` list is empty, re-indexed from 0 with the index named
        ``id``.
    """
    df_filtered_recos = filter_recos_by_type(df_recos, df_types)

    # Remove empty lists explicitly instead of relying on the type filter
    # happening to omit products that have no same-type recommendation.
    has_recos = df_filtered_recos["recos"].map(len) > 0
    return df_filtered_recos[has_recos].reset_index(drop=True).rename_axis("id")

# Exercise 3: rebuild the lists with k = 3, then type-filter them and keep
# only the products that still have at least one recommendation.
filter_empty_recommendations(get_top_k_recommendations(df, 3))

  df_recos = df.groupby(["prod"]).apply(lambda x: x.nlargest(k, "score"))["reco"]


Unnamed: 0_level_0,prod,recos
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0001,"[0476, 0718]"
1,0002,[0728]
2,0005,"[0302, 0172]"
3,0009,"[0504, 0775]"
4,0013,[0975]
...,...,...
433,0994,"[0151, 0387]"
434,0996,[0481]
435,0997,"[0281, 0917]"
436,0998,"[0542, 0155]"


#### Expected Output
```
id prod	recos
0	0001	[0476, 0718]
1	0002	[0728]
2	0005	[0302, 0172]
3	0009	[0504, 0775]
4	0013	[0975]
...	...	...
433	0994	[0151, 0387]
434	0996	[0481]
435	0997	[0281, 0917]
436	0998	[0542, 0155]
437	1000	[0645]
438 rows × 2 columns
```