In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal is to choose a brand of ramen using Bayesian inference 

In [None]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sn

import pymc3 as pm 

In [None]:
# Load the ramen ratings dataset from the Kaggle input directory.
RAMEN_CSV = "/kaggle/input/ramen-ratings/ramen-ratings.csv"
ramen = pd.read_csv(RAMEN_CSV)

# Preview the first and the last five rows in a single table.
pd.concat([ramen.head(5), ramen.tail(5)])

In [None]:
# Report the dimensions of the dataset.
n_rows, n_cols = ramen.shape
print('Rows:', n_rows)
print('Columns:', n_cols)

In [None]:
# Print a summary of the frame: its columns, non-null counts and dtypes.
ramen.info()

In [None]:
# Parse 'Stars' as numbers; entries that cannot be parsed are coerced to NaN.
stars_numeric = pd.to_numeric(ramen['Stars'], errors='coerce')
ramen['Stars'] = stars_numeric

In [None]:
# Count the missing values in each column.
missing_per_column = ramen.isnull().sum()
missing_per_column

In [None]:
# Remove the 'Top Ten' column.
ramen.drop(columns=['Top Ten'], inplace=True)

# Show every row that still has at least one missing value.
has_missing = ramen.isna().any(axis='columns')
ramen[has_missing]

In [None]:
# Drop the rows labelled 2152 and 2442, one after the other.
for row_label in (2152, 2442):
    ramen.drop(index=row_label, inplace=True)

In [None]:
# Impute the missing ratings with the mean rating.
# Only 'Stars' is filled: a frame-wide fillna would also write the numeric
# mean into any other column that has missing values (e.g. text columns).
ramen['Stars'] = ramen['Stars'].fillna(ramen['Stars'].mean())

In [None]:
# Number of ramen varieties per country, sorted in ascending order.
country_counts = ramen['Country'].value_counts().sort_values()

# Horizontal bar chart, one bar per country.
fig, ax = plt.subplots(figsize=(10, 8))
country_counts.plot.barh(ax=ax)
plt.show()

Japan is the country with the largest variety of ramens 

In [None]:
# Most used packaging: pie chart of the five most frequent styles.
style_counts = ramen['Style'].value_counts()
top_styles = style_counts.iloc[:5]

plt.pie(
    top_styles,
    labels=top_styles.index,
    autopct="%3.01f%%",
    radius=1.9,
    explode=[0, 0, 0, 0, 0.3],  # pull the fifth slice away from the centre
    textprops={'fontsize': 14},
)
plt.show()

In [None]:
# Histogram and density estimate of the star ratings.
plt.figure(figsize=(9, 6))
sn.distplot(ramen['Stars'], hist=True, bins=15, kde=True, color='k')

# Mark the mean, mode and median of the ratings with vertical lines.
stars = ramen['Stars']
markers = [
    (stars.mean(), 'b', 'mean'),
    (stars.mode(), 'r', 'mode'),
    (stars.median(), 'g', 'median'),
]
for position, line_colour, legend_label in markers:
    plt.vlines(position, 0, 0.7, colors=line_colour, label=legend_label)

plt.title("Distribution of Stars")
plt.xlabel('Stars')
plt.legend()
plt.show()

In [None]:
# Select the Japanese ramens rated 4 stars or higher (the 4.0 boundary is included)
jp = ramen.loc[(ramen['Country']=='Japan') & (ramen['Stars']>=4)]

In [None]:
# Count the selected Japanese ramens per brand and keep the four brands with
# the most of them (value_counts sorts in descending order).
# .head(4) replaces the original `x_jp[:4,]`, whose stray comma turned the
# key into a one-element tuple.
x_jp = jp['Brand'].value_counts().head(4)

sn.barplot(x=x_jp.index, y=x_jp.values, palette='Paired')
# The bars are counts of ramens rated >= 4, not scores, so label them as such.
plt.title('Top 4 Japanese brands by number of ramens rated 4 stars or higher')
plt.xlabel('Brand')
plt.ylabel('Number of ramens')
plt.show()

Nissin is the most popular brand; we will choose between the next two, Myojo and Maruchan.

In [None]:
# Myojo: its ramens and how many of them received each star rating,
# ordered by rating.
is_myojo = ramen['Brand'].eq("Myojo")
myojo = ramen[is_myojo]
myojo_rank = myojo['Stars'].value_counts().sort_index()

# Maruchan: same breakdown.
is_maruchan = ramen['Brand'].eq("Maruchan")
maruchan = ramen[is_maruchan]
maruchan_rank = maruchan['Stars'].value_counts().sort_index()

We will use ratings from 0.0 to 3.0 as negative, and ratings from 4.0 to 5.0 as positive. Ratings strictly between 3.0 and 4.0 are counted in neither group.

In [None]:
# Split each brand's ratings into negative (<= 3 stars) and positive
# (>= 4 stars) counts. Ratings strictly between 3 and 4 fall in neither group.

# Myojo
myo_neg = myojo_rank[myojo_rank.index <= 3].sum()
myo_pos = myojo_rank[myojo_rank.index >= 4].sum()

# Maruchan
ma_neg = maruchan_rank[maruchan_rank.index <= 3].sum()
ma_pos = maruchan_rank[maruchan_rank.index >= 4].sum()


In [None]:
# Grouped bar chart comparing positive and negative rating counts per brand.
categories = ['Positive', 'Negative']
positions = np.arange(len(categories))
bar_width = 0.3
half_width = bar_width / 2

fig, ax = plt.subplots(figsize=(7, 5))

# Maruchan bars sit to the left of each tick, Myojo bars to the right.
ax.bar(positions - half_width, [ma_pos, ma_neg], width=bar_width, color='red', label="Maruchan")
ax.bar(positions + half_width, [myo_pos, myo_neg], width=bar_width, color='yellowgreen', label='Myojo')

ax.set_ylabel('Ratings')
ax.set_xticks(positions)
ax.set_xticklabels(categories)
ax.legend()

plt.show()

To do Bayesian inference we use the Bayes formula

$P(positive|ratings) = \frac{P(ratings|positive) \cdot P(positive)}{P(ratings)}$,  

where

#### prior -> $P(positive)$

#### posterior -> $P(positive|ratings)$

#### likelihood -> $P(ratings|positive)$

#### evidence -> $P(ratings)$

The observed ratings are encoded as 1 (positive) or 0 (negative). In the code below, `trace` holds the samples drawn from the posterior.

In [None]:
# Build the observed outcomes for each brand: one 0 per negative rating and
# one 1 per positive rating, in random order.
# np.random.shuffle works in place and returns None, so assigning its result
# back to the variable replaces the data with None. np.random.permutation
# returns the shuffled array instead.

# Myojo
Myj = np.random.permutation(np.repeat([0, 1], [myo_neg, myo_pos]))

# Maruchan
Mch = np.random.permutation(np.repeat([0, 1], [ma_neg, ma_pos]))

In [None]:
with pm.Model() as projeto:
    # 1. Priors (Beta distribution)
    # Assume that both brands have a high probability of being rated
    # positively, because both are well rated in Japan.
    p_myo = pm.Beta('p_myo', 5, 2)
    p_mch = pm.Beta('p_mch', 5, 2)

    # 2. Likelihood (Bernoulli distribution): every observed rating is a 0/1
    # outcome whose success probability is the brand's p.
    # The variables keep their original 'posteriori_*' names.
    evidencia_myo = pm.Bernoulli('posteriori_myo', p_myo, observed=Myj)
    evidencia_mch = pm.Bernoulli('posteriori_mch', p_mch, observed=Mch)

    # Difference between the two probabilities, recorded alongside them.
    dif = pm.Deterministic('dif', p_myo - p_mch)

    # 3. Posterior: draw samples with the Metropolis sampler.
    # A fixed seed makes the draws repeatable from run to run.
    trace = pm.sample(draws=3000, step=pm.Metropolis(), random_seed=42)
    

In [None]:
# Three stacked panels: one histogram per brand and one for the difference.
plt.figure(figsize=(16, 10))

# (subplot position, trace variable, panel title) for the two brand panels.
posterior_panels = [
    (311, 'p_myo', "Myojo Posteriori"),
    (312, 'p_mch', "Maruchan Posteriori "),
]
for position, var_name, panel_title in posterior_panels:
    samples = trace[var_name]
    plt.subplot(position)
    plt.hist(samples, bins=50, histtype='barstacked', density=True)
    plt.xlim([0, 1.0])
    # Dashed red line at the mean of the samples.
    plt.vlines(samples.mean(), 0, 5, linestyles='--', color='red', label="Mean")
    plt.legend()
    plt.title(panel_title)

# Histogram of the sampled difference p_myo - p_mch.
plt.subplot(313)
plt.hist(trace['dif'], histtype='barstacked', density=True)
plt.title('Difference between distributions')

plt.show()

In [None]:
# Share of samples in which Myojo's probability of a positive rating is
# higher than Maruchan's (dif = p_myo - p_mch), followed by the reverse.
# The second label originally misspelled the brand as "Murchan".
print('Likely to like the brand Myojo : {:.2f}'.format(np.mean(trace['dif'] > 0)))
print('Likely to like the brand Maruchan: {:.2f}'.format(np.mean(trace['dif'] < 0)))

Conclusion: Myojo is the best choice