# Phone Reviews and Prices

In [None]:
import numpy as np 
import pandas as pd 
import plotly.express as px

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used by the loading cell are visible.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Load the two CSV files of the dataset: the items table and the reviews table.
items = pd.read_csv(
    filepath_or_buffer='../input/amazon-cell-phones-reviews/20190928-items.csv'
)
reviews = pd.read_csv(
    filepath_or_buffer='../input/amazon-cell-phones-reviews/20190928-reviews.csv'
)

In [None]:
# Peek at the first two rows of the items table.
items.head(n=2)

In [None]:
# Peek at the first two rows of the reviews table.
reviews.head(n=2)

### What brands have the most reviews?

In [None]:
# Add up each row's `totalReviews` within a brand and rank the brands so the one
# with the most reviews comes first. (`count()` would only report how many rows
# each brand has, not how many reviews.)
items.groupby('brand')['totalReviews'].sum().sort_values(ascending=False)

### How many rated phones per brand?

In [None]:
# Count the non-null `rating` values per brand and order the brands from the
# largest count to the smallest before plotting.
rating_counts = (
    items.groupby('brand')['rating']
         .count()
         .reset_index()
         .sort_values(by='rating', ascending=False)
)
px.bar(rating_counts, x='brand', y='rating', title='Rating Count')

In this dataset we can see that we have far more **Samsung** ratings than the rest. We will still go ahead and check out the ratings to see how they fare against each other.

### Average Rating

In [None]:
# Mean `rating` per brand, ordered from the highest average to the lowest.
average_ratings = (
    items.groupby('brand')['rating']
         .mean()
         .reset_index()
         .sort_values(by='rating', ascending=False)
)
px.bar(average_ratings, x='brand', y='rating', title='Average Rating')

Surprisingly, on average, **Xiaomi** has the highest ratings of them all, followed by **Huawei**.


In [None]:
# `prices` is stored as text (the later cleaning cells split it on ',' and strip a
# leading character before casting to float), so plotting it directly yields one
# categorical bar per distinct string instead of a numeric distribution.
# Keep the first listed price, drop the leading '$' and convert it to a number.
first_price = pd.to_numeric(
    items['prices'].astype(str).str.split(',').str[0].str.lstrip('$'),
    errors='coerce',  # missing or unparseable prices become NaN
)
px.histogram(items.assign(prices=first_price), x='prices', title='Phone Prices')

Here are the prices of phones in the dataset. As we can see, the majority of phones are around the **$200** range.

> ## Bivariate Plots

### Do higher prices get better ratings?

In [None]:
# `prices` is stored as text (the later cleaning cells split it on ',' and strip a
# leading character before casting to float), so using it directly puts the points
# on a categorical x axis and hides any price/rating relationship.
# Keep the first listed price, drop the leading '$' and convert it to a number.
first_price = pd.to_numeric(
    items['prices'].astype(str).str.split(',').str[0].str.lstrip('$'),
    errors='coerce',  # missing or unparseable prices become NaN
)
px.scatter(items.assign(prices=first_price), x='prices', y='rating', title='Ratings vs Prices')

Not necessarily: there is no clear correlation here. For example, a phone priced at \$874 got a really low rating, while a phone priced at \$759 got a really high rating (5).

In [None]:
# Summarise the reviews table (column dtypes and non-null counts) before the
# `date` column is converted in the next cell.
reviews.info()

In [None]:
# Convert the `date` column to datetimes so the `.dt` accessor can be used on it.
reviews['date'] = pd.to_datetime(reviews['date'])

In [None]:
# Derive calendar year and month columns from the parsed review date for the
# time-based groupings further down.
reviews['year'] = reviews['date'].dt.year
reviews['month'] = reviews['date'].dt.month

### Merge Dataframes

In [None]:
# Cast the join key to string in both frames so it has the same type on each side
# of the merge.
items['asin'] = items['asin'].astype(str)
reviews['asin'] = reviews['asin'].astype(str)

In [None]:
# Join the items and reviews tables on `asin`; columns present in both frames get
# the `_items` / `_reviews` suffix to tell them apart.
r_items = pd.merge(items, reviews, on='asin', suffixes=('_items', '_reviews'))

In [None]:
# Peek at the first two rows of the merged table.
r_items.head(n=2)

### Which months have the best reviews?

In [None]:
# Count the review rows per year. `totalReviews` comes from the items table and is
# repeated on every review row by the merge, so summing it here would add each
# item's total once per review and inflate the figures.
r_items.groupby('year').size()

In [None]:
# Rank the months by the average rating given in individual reviews, and show how
# many reviews each average is based on. Summing `totalReviews` would not answer
# the question: it is an item-level total that the merge repeats on every review row.
(
    r_items.groupby('month')['rating_reviews']
           .agg(['mean', 'count'])
           .reset_index()
           .sort_values(by='mean', ascending=False)
)

In [None]:
# Keep only the rows for Apple, Samsung and HUAWEI whose review year is after 2012.
top_phones = r_items.loc[
    r_items['brand'].isin(['Apple', 'Samsung', 'HUAWEI'])
    & r_items['year'].gt(2012)
]

In [None]:
# Plot the number of reviews written per brand and year (years after 2012).
# The count is taken from the review rows themselves: `totalReviews` is an
# item-level column repeated on every review row by the merge, so summing it
# would add each item's total once per review. The count column keeps the name
# `totalReviews` so the axis label is unchanged.
reviews_by_brand_year = (
    r_items.loc[r_items['year'] > 2012]
           .groupby(['brand', 'year'])
           .size()
           .reset_index(name='totalReviews')
)
px.line(reviews_by_brand_year,
        color='brand', x='year', y='totalReviews', title='Total Reviews By Year')

As we can see, **OnePlus** shot up from 2016. **Samsung** had a steady incline, while **Google** had an incline from 2017. 

In [None]:
# Average the review-level rating per brand and year (years after 2012) and draw
# one line per brand.
yearly_means = (
    r_items.loc[r_items['year'] > 2012]
           .groupby(['brand', 'year'])[['rating_reviews', 'totalReviews']]
           .mean()
           .reset_index()
)
px.line(yearly_means,
        color='brand', x='year', y='rating_reviews', title='Average Ratings by Year')

- **Nokia** gradually declined
- **OnePlus** declined, then tapered off

In [None]:
# An entry can hold several comma-separated prices; break each entry into a list
# so that the first price can be picked out in the following cells.
r_items['prices'] = r_items['prices'].str.split(',')

In [None]:
# Scratch check: a list literal is recognised as a list (evaluates to True).
isinstance([9], list)

In [None]:
def return_element(x):
    """Return the first element of ``x`` if it is a list, otherwise ``x`` itself.

    Used on the split ``prices`` values: entries that were split into a list are
    reduced to their first item, while anything else (for example a missing
    value) is passed through untouched.

    Parameters
    ----------
    x : object
        A list (or list subclass) or any other value.

    Returns
    -------
    object
        ``x[0]`` when ``x`` is a list, else ``x`` unchanged.

    Raises
    ------
    IndexError
        If ``x`` is an empty list.
    """
    # isinstance (rather than an exact type comparison) also accepts list subclasses.
    if isinstance(x, list):
        return x[0]
    return x

# Reduce every split price list to its first entry; non-list values pass through.
r_items['prices'] = r_items['prices'].apply(return_element)

In [None]:
# Drop the first character of each price string (presumably the currency symbol)
# and convert what is left to a float.
r_items['prices'] = r_items['prices'].str.slice(start=1).astype(float)

In [None]:
# Preview the mean review rating and mean price per brand and year (years after 2012).
(
    r_items.loc[r_items['year'] > 2012]
           .groupby(['brand', 'year'])[['rating_reviews', 'prices']]
           .mean()
           .head()
)

### Phone Prices over Years

In [None]:
# Average price per brand and year (years after 2012), one line per brand.
# The title now describes the plotted quantity; it previously carried the
# 'Total Reviews By Year' title copied from an earlier chart.
average_prices = (
    r_items.loc[r_items['year'] > 2012]
           .groupby(['brand', 'year'])[['prices']]
           .mean()
           .reset_index()
)
px.line(average_prices,
        color='brand', x='year', y='prices', title='Average Phone Price By Year')