In [2]:
# pip install pandas numpy requests scipy   (zipfile and io ship with the Python standard library)

In [1]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io

In [2]:
# --------------------------------------------------
# 0. Download MovieLens 100k
# --------------------------------------------------
# The archive is fetched over HTTP and unpacked from memory; nothing is
# written to disk.

url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting an
# error page reach ZipFile and fail with an unrelated message.
response = requests.get(url, timeout=60)
response.raise_for_status()

z = zipfile.ZipFile(io.BytesIO(response.content))

# Each archive member is opened through a context manager so that its file
# handle is closed as soon as pandas has parsed it.
with z.open("ml-100k/u.data") as ratings_file:
    ratings = pd.read_csv(
        ratings_file,
        sep="\t",
        names=["user_id", "movie_id", "rating", "timestamp"],
    )

# Only the first two columns of u.item (id and title) are loaded.
with z.open("ml-100k/u.item") as movies_file:
    movies = pd.read_csv(
        movies_file,
        sep="|",
        encoding="latin-1",
        header=None,
        usecols=[0, 1],
        names=["movie_id", "title"],
    )


In [3]:
# --------------------------------------------------
# 1. Dataset description
# --------------------------------------------------
from datetime import datetime

# Earliest timestamp, 80% quantile of the timestamps and latest timestamp,
# one per line, each converted with datetime.fromtimestamp.
print(
    *(
        datetime.fromtimestamp(ts)
        for ts in (
            ratings['timestamp'].min(),
            ratings['timestamp'].quantile(0.8),
            ratings['timestamp'].max(),
        )
    ),
    sep="\n",
)

# Summary statistics of the rating column (displayed as the cell result).
ratings['rating'].describe()

1997-09-20 05:05:10
1998-03-07 03:21:09
1998-04-23 01:10:38


count    100000.000000
mean          3.529860
std           1.125674
min           1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [4]:
# --------------------------------------------------
# 2.  Create simple OK-KO rating
# --------------------------------------------------
#   1 if rating >= 4
#   0 otherwise
#
# The threshold is applied to the whole column at once rather than in a
# Python-level loop; the boolean result is then cast to 0/1 integers.

ratings['simple_rating'] = (ratings['rating'] >= 4).astype('int64')


In [5]:

# --------------------------------------------------
# 3. Compute  Statistics
# --------------------------------------------------

# The ratings are divided by timestamp at the 80% quantile:
#   subset A = ratings at or before the cut (oldest 80%)
#   subset B = ratings after the cut (newest 20%)
ts_split = ratings['timestamp'].quantile(0.8)

# Per-movie mean of the raw rating and of the 0/1 rating on subset A.
df_A = (
    ratings.loc[ratings['timestamp'].le(ts_split)]
    .groupby("movie_id")
    .agg(
        rating_avg_A=pd.NamedAgg(column="rating", aggfunc="mean"),
        simple_rating_avg_A=pd.NamedAgg(column="simple_rating", aggfunc="mean"),
    )
    .reset_index()
)

# Same two per-movie means on subset B.
df_B = (
    ratings.loc[ratings['timestamp'].gt(ts_split)]
    .groupby("movie_id")
    .agg(
        rating_avg_B=pd.NamedAgg(column="rating", aggfunc="mean"),
        simple_rating_avg_B=pd.NamedAgg(column="simple_rating", aggfunc="mean"),
    )
    .reset_index()
)


In [6]:
# --------------------------------------------------
# 5. Merge and Keep Common Movies
# --------------------------------------------------

# Join the A and B statistics on movie_id, then attach the titles.
# No `how` is passed, so both joins use merge's default join type.
df = df_A.merge(df_B, on=["movie_id"]).merge(movies, on="movie_id")


In [7]:
# --------------------------------------------------
# 6. Create Rankings
# --------------------------------------------------
# Every rank uses method="first", so ties are broken by row order and each
# column is a permutation of 1..n. The ranks are cast to integers directly
# on the Series instead of through a Python loop.

# Ground truth: ranking average on old set
df["rating_A_rank"] = df["rating_avg_A"].rank(ascending=False, method="first").astype('int64')

# Alternative rankings
df["simple_rating_A_rank"] = df["simple_rating_avg_A"].rank(ascending=False, method="first").astype('int64')
df["simple_rating_B_rank"] = df["simple_rating_avg_B"].rank(ascending=False, method="first").astype('int64')
df["rating_B_rank"] = df["rating_avg_B"].rank(ascending=False, method="first").astype('int64')

df = df.sort_values('rating_A_rank')

# Number of ranked movies, taken from the frame itself rather than
# hard-coded, so the cell keeps working if the merged set changes size.
rank_len = df.shape[0]

# Reference ranking 1..n. Because df is now sorted by rating_A_rank, this
# list matches that column row by row. It is read as the global `aa` by the
# printing utilities defined further down.
aa = list(range(1, rank_len + 1))

# Baseline: random permutation of the reference ranking (seeded so the
# result is reproducible).
np.random.seed(42)
df["random"] = np.random.permutation(aa)

# Manual ranking in which only the top item is badly misplaced: the first
# row receives rank n and every other row moves up by one position,
# i.e. [n, 1, 2, ..., n-1]. (This is a shift, not a first/last swap.)
df['wrong_first'] = [rank_len] + list(range(1, rank_len))

df.head(10)



Unnamed: 0,movie_id,rating_avg_A,simple_rating_avg_A,rating_avg_B,simple_rating_avg_B,title,rating_A_rank,simple_rating_A_rank,simple_rating_B_rank,rating_B_rank,random,wrong_first
116,119,5.0,1.0,4.0,0.5,Maya Lin: A Strong Clear Vision (1994),1,2,610,198,310,1382
1120,1189,5.0,1.0,5.0,1.0,Prefontaine (1997),2,7,55,7,742,1
1206,1293,5.0,1.0,5.0,1.0,Star Kid (1997),3,8,64,10,266,2
1207,1294,5.0,1.0,2.833333,0.5,Ayn Rand: A Sense of Life (1997),4,9,692,1063,824,3
1295,1450,5.0,1.0,3.0,0.0,Golden Earrings (1947),5,13,1338,991,779,4
403,408,4.542553,0.904255,4.222222,0.833333,"Close Shave, A (1995)",6,21,164,111,661,5
166,169,4.515152,0.89899,4.210526,0.842105,"Wrong Trousers, The (1993)",7,22,152,115,77,6
110,113,4.5,1.0,3.333333,0.333333,"Horseman on the Roof, The (Hussard sur le toit...",8,1,862,710,185,7
825,850,4.5,1.0,3.5,0.5,"Perfect Candidate, A (1996)",9,3,648,604,746,8
1294,1449,4.5,1.0,4.75,1.0,Pather Panchali (1955),10,12,87,21,487,9


In [9]:
# --------------------------------------------------
# 7. Utilities to print results
# --------------------------------------------------

from ranking_coefficients import standard_gamma_calc
from scipy import stats

def gamma_out_printer(bb, coeff_type, weighting_scheme, wtype, n_0):
    """Print the weighted coefficient of ``bb`` against the global ranking ``aa``.

    All arguments are forwarded unchanged to ``standard_gamma_calc``, together
    with the notebook-level variable ``aa`` (which must already be defined).
    The result is indexed with the keys ``"gamma"`` and ``"standard_gamma"``;
    both values are printed as percentages rounded to one decimal.

    Parameters
    ----------
    bb : ranking to compare with ``aa``.
    coeff_type : coefficient name passed to ``standard_gamma_calc``; it is
        also used as the label of the printed lines.
    weighting_scheme, wtype, n_0 : weighting options passed through as-is.

    Returns
    -------
    None
    """
    gamma_out = standard_gamma_calc(
        coeff_type=coeff_type,
        weighting_scheme=weighting_scheme,
        wtype=wtype,
        aa=aa,
        bb=bb,
        n_0=n_0,
    )

    print(f'\t wtype: {wtype}, n_0: {n_0}')
    print(f'{coeff_type}: {round(100*gamma_out["gamma"],1)}%')
    print(f'standardized {coeff_type}: {round(100*gamma_out["standard_gamma"],1)}%')

def coeff_prynter(bb, coeff_type):
    """Print the plain and the weighted ``coeff_type`` coefficient of ``bb`` vs ``aa``.

    ``aa`` is the notebook-level reference ranking. For ``'spearman'`` the
    plain value comes from ``stats.spearmanr`` and for ``'kendall'`` from
    ``stats.kendalltau``; any other name prints only the label. The weighted
    values are then printed through ``gamma_out_printer`` with the ``'add'``
    weighting scheme, once with ``wtype=1, n_0=0`` and once with
    ``wtype=2, n_0=1``.

    Returns None.
    """
    # Select the scipy routine for the unweighted coefficient, if any.
    if coeff_type == 'spearman':
        plain_coeff = stats.spearmanr
    elif coeff_type == 'kendall':
        plain_coeff = stats.kendalltau
    else:
        plain_coeff = None

    # Label first (no newline), then the value on the same line.
    print(f'standard {coeff_type}', end=' ')
    if plain_coeff is not None:
        print(f'{round(100*plain_coeff(bb, aa)[0],1)}%')

    print('\t\tWeighted coeff')
    weighting_scheme = 'add'
    print(f'\tweighting_scheme: {weighting_scheme}')
    for wtype, n_0 in ((1, 0), (2, 1)):
        gamma_out_printer(
            bb,
            coeff_type=coeff_type,
            weighting_scheme=weighting_scheme,
            wtype=wtype,
            n_0=n_0,
        )



In [10]:

# Report both coefficients for the random permutation column.
print('Baseline: random vector')
bb = [rank for rank in df['random']]

coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

Baseline: random vector
standard spearman 2.7%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: -33.1%
standardized spearman: 0.9%
	 wtype: 2, n_0: 1
spearman: -71.5%
standardized spearman: -14.1%
 
standard kendall 1.8%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: -27.5%
standardized kendall: 0.1%
	 wtype: 2, n_0: 1
kendall: -52.6%
standardized kendall: 1.5%


In [11]:

# NOTE(review): this cell repeats the previous random-baseline cell
# verbatim and prints the same numbers.
print('Baseline: random vector')
bb = [*df['random']]

coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

Baseline: random vector
standard spearman 2.7%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: -33.1%
standardized spearman: 0.9%
	 wtype: 2, n_0: 1
spearman: -71.5%
standardized spearman: -14.1%
 
standard kendall 1.8%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: -27.5%
standardized kendall: 0.1%
	 wtype: 2, n_0: 1
kendall: -52.6%
standardized kendall: 1.5%


In [12]:
# Report both coefficients for the rank of the 0/1 rating average on subset A.
print('Simple ranking')

bb = [rank for rank in df["simple_rating_A_rank"]]
coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

Simple ranking
standard spearman 95.5%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: 97.0%
standardized spearman: 97.5%
	 wtype: 2, n_0: 1
spearman: 71.8%
standardized spearman: 97.0%
 
standard kendall 82.9%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: 73.9%
standardized kendall: 74.8%
	 wtype: 2, n_0: 1
kendall: 29.0%
standardized kendall: 78.7%


In [13]:
# Report both coefficients for the rank of the raw rating average on subset B.
print('Full ranking new')

bb = [*df["rating_B_rank"]]

coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

Full ranking new
standard spearman 62.6%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: 17.0%
standardized spearman: 35.7%
	 wtype: 2, n_0: 1
spearman: -28.6%
standardized spearman: 37.6%
 
standard kendall 47.2%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: 2.1%
standardized kendall: 18.0%
	 wtype: 2, n_0: 1
kendall: -42.6%
standardized kendall: 14.0%


In [14]:
# Report both coefficients for the rank of the 0/1 rating average on subset B.
print('Simple ranking new')

bb = [rank for rank in df["simple_rating_B_rank"]]

coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

Simple ranking new
standard spearman 56.2%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: 5.8%
standardized spearman: 27.6%
	 wtype: 2, n_0: 1
spearman: -36.0%
standardized spearman: 30.2%
 
standard kendall 42.4%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: -3.6%
standardized kendall: 14.3%
	 wtype: 2, n_0: 1
kendall: -36.7%
standardized kendall: 20.9%


In [15]:
# Report both coefficients for the hand-made 'wrong_first' column.
# NOTE(review): that column is built as [n, 1, ..., n-1] (first row sent to
# the last rank, the rest shifted up), which is not a literal first/last
# swap -- confirm whether the printed label should be reworded.
print('First-last swapped')

bb = [*df['wrong_first']]

coeff_prynter(bb, coeff_type='spearman')
print(' ')
coeff_prynter(bb, coeff_type='kendall')

First-last swapped
standard spearman 99.6%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
spearman: 59.9%
standardized spearman: 67.8%
	 wtype: 2, n_0: 1
spearman: -1.6%
standardized spearman: 61.0%
 
standard kendall 99.7%
		Weighted coeff
	weighting_scheme: add
	 wtype: 1, n_0: 0
kendall: 75.5%
standardized kendall: 76.2%
	 wtype: 2, n_0: 1
kendall: 26.6%
standardized kendall: 77.2%
