# Import libraries

In [2]:
import pandas as pd
import math
import scipy.stats as st

# Declare variables / constants

In [3]:
# CSV file holding the reviews, and the only two columns read from it.
path_data_csv = "ProductReview.csv"
name_header_data = [
    "id",              # grouping key used by every aggregation below
    "reviews.rating",  # rating value attached to each review
]

# Read data

In [4]:
# Load only the columns listed in name_header_data; low_memory=False makes
# pandas read the file in one pass instead of in chunks.
original_data_df = pd.read_csv(
    path_data_csv,
    usecols=name_header_data,
    low_memory=False,
)
original_data_df.head()

Unnamed: 0,id,reviews.rating
0,1687903,5
1,1533324,5
2,1533324,5
3,1651734,1
4,1651734,1


# Count the number of positive ratings (rating >= 3) per product

In [118]:
# NOTE: despite the "gt3" in the names, the filter is inclusive: a rating of
# exactly 3 counts as positive.
data_rating_gt_3 = (
    original_data_df
    .loc[lambda reviews: reviews["reviews.rating"] >= 3]
    .groupby("id")
    .count()
    .rename(columns={"reviews.rating": "count_gt3"})
)
data_rating_gt_3.head()

Unnamed: 0_level_0,count_gt3
id,Unnamed: 1_level_1
913248,27
928287,1
928288,9
935372,1
938363,189


# Count the number of ratings

In [119]:
# Number of (non-null) ratings per id, regardless of the rating value.
data_rating_count = (
    original_data_df
    .groupby("id")
    .count()
    .rename(columns={"reviews.rating": "count_amount"})
)
data_rating_count.head()

Unnamed: 0_level_0,count_amount
id,Unnamed: 1_level_1
913248,28
928287,1
928288,10
935372,1
938363,224


# Merge positive rating counts with total rating counts

In [122]:
# Start from the total counts and left-join the positive counts, so that ids
# without a single positive rating are kept (with count_gt3 = 0) instead of
# being silently dropped, as an inner merge would do. Both frames are indexed
# by "id", so the join aligns on that index.
data_processing = (
    data_rating_count
    .join(data_rating_gt_3, how="left")
    .fillna({"count_gt3": 0})
    # The left join introduces NaN (hence float); restore the integer counts.
    .astype({"count_gt3": "int64"})
    # Keep the original column order: positive count first, then total count.
    .loc[:, ["count_gt3", "count_amount"]]
)
data_processing.head()

Unnamed: 0_level_0,count_gt3,count_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1
913248,27,28
928287,1,1
928288,9,10
935372,1,1
938363,189,224


# Declare the Wilson lower bound score function

In [123]:
def wilson_lower_bound(pos, n, confidence=0.95):
    """Return the lower bound of the Wilson score interval for a proportion.

    :param pos: Number of positive ratings
    :param n: Total number of ratings
    :param confidence: Two-sided confidence level, by default 0.95 (95 %)
    :return: Wilson lower bound score; 0 when there are no ratings at all
    """
    if n == 0:
        # Nothing to estimate, and every term below would divide by zero.
        return 0
    # Two-sided critical value of the standard normal distribution.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    # Observed share of positive ratings.
    phat = pos / n
    centre = phat + z * z / (2 * n)
    margin = z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)
    return (centre - margin) / (1 + z * z / n)

# Wilson lower bound score calculation

In [142]:
# Score every row from its positive count and its total count, then rank the
# rows from the highest to the lowest score.
data_processing["wilson_score"] = data_processing.apply(
    lambda product: wilson_lower_bound(
        product["count_gt3"],
        product["count_amount"],
    ),
    axis=1,
)
data_result = data_processing.sort_values("wilson_score", ascending=False)
data_result.head()

Unnamed: 0_level_0,count_gt3,count_amount,wilson_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1657994,8579,8606,0.995439
1538179,2476,2489,0.991084
1592625,2079,2093,0.988803
1487538,5460,5510,0.988057
1677119,315,315,0.987952


# Bayesian Approximation: count ratings per product and star value, then declare the scoring function

In [18]:
# One row per id, one column per rating value, each cell holding how many
# reviews gave that rating (0 where an id never received that rating).
pivot_data = (
    original_data_df
    .groupby(["id", "reviews.rating"])
    .agg({"reviews.rating": "count"})
    .rename(columns={"reviews.rating": "count"})
    .pivot_table(index="id", columns="reviews.rating", values="count")
    .fillna(0)
)
pivot_data.head(5)

reviews.rating,1,2,3,4,5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
913248,1.0,0.0,1.0,2.0,24.0
928287,0.0,0.0,0.0,0.0,1.0
928288,0.0,1.0,0.0,1.0,8.0
935372,0.0,0.0,0.0,0.0,1.0
938363,28.0,7.0,7.0,20.0,162.0


In [45]:
def bayesian_rating_products(row, confidence=0.95, score_label="bayesian_score"):
    """Lower-bound score of the average rating for a K-star rating system.

    Each star category gets one extra pseudo-vote, the expected rating is
    computed from those smoothed shares, and ``z`` standard deviations are
    subtracted from it.

    :param row: Counts of star ratings ordered from the lowest to the highest
        star, i.e. [3, 5, 6, 7, 10] means 3 votes for 1-star, 5 votes for
        2-star and so on. Usually a DataFrame row (pandas Series); any
        sequence accepted by ``pd.Series`` works.
    :param confidence: Two-sided confidence level, by default 0.95 (95 %)
    :param score_label: Label of an already computed score entry in ``row``.
        It is ignored, so the function gives the same result whether or not
        the score column has been added to the frame yet.
    :return: Score; 0 when there are no ratings at all
    """
    # Select the counts by label instead of blindly dropping the last entry:
    # on a frame that has no score column yet, the last entry is the count of
    # the highest star and must not be discarded.
    n = [count for label, count in pd.Series(row).items() if label != score_label]

    N = sum(n)
    if N == 0:
        return 0
    K = len(n)
    # Two-sided critical value of the standard normal distribution.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    first_part = 0.0   # expected rating under the smoothed shares
    second_part = 0.0  # expected squared rating under the smoothed shares
    for star, n_k in enumerate(n, start=1):
        first_part += star * (n_k + 1) / (N + K)
        second_part += star * star * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score

In [46]:
# Score every row, then rank the rows from the highest to the lowest score.
# NOTE: pivot_data is extended in place, so on a re-run of this cell the rows
# handed to the scoring function already contain the previous bayesian_score.
pivot_data["bayesian_score"] = pivot_data.apply(bayesian_rating_products, axis=1)
pivot_data = pivot_data.sort_values("bayesian_score", ascending=False)
pivot_data.head()

reviews.rating,1,2,3,4,5,bayesian_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1592625,13.0,1.0,16.0,244.0,1819.0,4.816589
1657994,16.0,11.0,71.0,1298.0,7210.0,4.810992
1677119,0.0,0.0,6.0,24.0,285.0,4.804581
1488756,18.0,10.0,17.0,114.0,1280.0,4.788321
1643269,2.0,1.0,11.0,75.0,494.0,4.756369
