In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the review table from the CSV file (path is relative to the notebook's
# working directory).
vine_df = pd.read_csv(filepath_or_buffer='vine_table.csv')
# Preview the first five rows.
vine_df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R2MTG1GCZLR2DK,5.0,0.0,0.0,N,Y
1,R2HBOEM8LE9928,5.0,0.0,0.0,N,Y
2,R1P4RW1R9FDPEE,5.0,1.0,1.0,N,Y
3,R1EBPM82ENI67M,1.0,0.0,0.0,N,Y
4,R372S58V6D11AT,5.0,1.0,1.0,N,Y


In [3]:
# Print a structural summary of the loaded table: row count, column names,
# dtypes and memory usage.
vine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3093869 entries, 0 to 3093868
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   review_id          object 
 1   star_rating        float64
 2   helpful_votes      float64
 3   total_votes        float64
 4   vine               object 
 5   verified_purchase  object 
dtypes: float64(3), object(3)
memory usage: 141.6+ MB


# Filter by total votes

Keep only reviews with at least 20 total votes, so that the helpfulness ratio computed below is based on a meaningful number of votes.

In [4]:
# Keep only the reviews that received at least 20 votes in total.
filtered_vine_df = vine_df.loc[vine_df['total_votes'].ge(20)]
# Preview the first five surviving rows.
filtered_vine_df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
41,R1FBO737KD9F2N,5.0,19.0,23.0,N,Y
145,R227GSNWI6BSZV,1.0,20.0,20.0,N,Y
304,R3SJTYZBYBG4EE,4.0,99.0,99.0,N,Y
419,R248FG65D76D5Y,1.0,42.0,53.0,N,Y
500,R3B6BXFKGW52SG,1.0,32.0,32.0,N,Y


# Find most helpful reviews

To retrieve all the rows where the number of `helpful_votes` divided by `total_votes` is equal to or greater than 50%

In [5]:
# Keep reviews where helpful_votes / total_votes is at least 0.5,
# i.e. at least half of the votes cast marked the review as helpful.
new_vine_df = filtered_vine_df[
    filtered_vine_df['helpful_votes']
    .div(filtered_vine_df['total_votes'])
    .ge(0.5)
]
new_vine_df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
41,R1FBO737KD9F2N,5.0,19.0,23.0,N,Y
145,R227GSNWI6BSZV,1.0,20.0,20.0,N,Y
304,R3SJTYZBYBG4EE,4.0,99.0,99.0,N,Y
419,R248FG65D76D5Y,1.0,42.0,53.0,N,Y
500,R3B6BXFKGW52SG,1.0,32.0,32.0,N,Y


# Reviews written as part of the vine program (paid)

In [6]:
# Helpful reviews whose `vine` flag is 'Y'.
vine_reviews = new_vine_df.loc[new_vine_df['vine'].eq('Y')]
vine_reviews.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
1932,R184FOUNZZ7KO8,5.0,15.0,20.0,Y,N
4661,R82QWN2X2OCHB,5.0,176.0,208.0,Y,N
11745,R1UYHBYE6790BU,5.0,44.0,53.0,Y,N
17013,R2J3YLX1L4EH2B,5.0,299.0,321.0,Y,N
28097,R3QDI539WTXKE2,5.0,26.0,32.0,Y,N


# Reviews not written as part of the vine program (unpaid)

In [7]:
# Helpful reviews whose `vine` flag is 'N'.
not_vine_reviews = new_vine_df.loc[new_vine_df['vine'].eq('N')]
not_vine_reviews.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
41,R1FBO737KD9F2N,5.0,19.0,23.0,N,Y
145,R227GSNWI6BSZV,1.0,20.0,20.0,N,Y
304,R3SJTYZBYBG4EE,4.0,99.0,99.0,N,Y
419,R248FG65D76D5Y,1.0,42.0,53.0,N,Y
500,R3B6BXFKGW52SG,1.0,32.0,32.0,N,Y


# 5-Star Review Analysis: 

Comparing the vine and non-vine program

In [8]:
# Total number of rows in the Vine subset.
count_vine = vine_reviews.shape[0]
count_vine

1080

In [9]:
# Total number of rows in the non-Vine subset.
count_nonvine = not_vine_reviews.shape[0]
count_nonvine

49673

Among the filtered reviews, there were **1,080** paid (Vine) reviews and **49,673** unpaid reviews.

In [10]:
# Number of Vine reviews with a star_rating of exactly 5: sum the boolean
# mask and cast to a plain Python int.
max_rating_vine = int(vine_reviews['star_rating'].eq(5).sum())
max_rating_vine

454

In [11]:
# Number of non-Vine reviews with a star_rating of exactly 5: sum the boolean
# mask and cast to a plain Python int.
max_rating_nonvine = int(not_vine_reviews['star_rating'].eq(5).sum())
max_rating_nonvine

23043

Of those reviews, **454** of the Vine (paid) reviews were 5-star, and **23,043** of the unpaid reviews were 5-star.

### 5-Star Review percentage by program

*5-star Vine Reviews:*

In [12]:
# Share of Vine reviews that are 5-star, expressed as a fraction between
# 0 and 1 (multiply by 100 for a percentage).
max_rating_vine/count_vine

0.4203703703703704

In [13]:
# Share of non-Vine reviews that are 5-star, expressed as a fraction between
# 0 and 1 (multiply by 100 for a percentage).
max_rating_nonvine/count_nonvine

0.463893865882874

## Results: 

- Vine (paid) Reviews
    - 1,080 total reviews
    - 454 were 5 star reviews
    - ***42.0%*** of vine (paid) reviews were 5 star

- Unpaid Reviews
    - 49,673 total reviews
    - 23,043 were 5 star reviews
    - ***46.4%*** of unpaid reviews were 5 star




## Further Analysis


Average star rating of reviews per program (paid and unpaid) 

- Paid Vine Program

In [14]:
# Mean star rating of the Vine reviews, rounded to two decimal places.
round(vine_reviews.star_rating.mean(), ndigits=2)

4.09

- Unpaid Program

In [15]:
# Mean star rating of the non-Vine reviews, rounded to two decimal places.
round(not_vine_reviews.star_rating.mean(), ndigits=2)

3.65