In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading the data: load the Play Store CSV (path is relative to the working directory)
# into a DataFrame that the rest of the notebook treats as the raw, untouched input
goog_filepath = '../input/google-playstore-apps/Google-Playstore-32K.csv'
goog_data = pd.read_csv(goog_filepath)

In [None]:
# Initial look at the data: the bare expression is rendered as this cell's output
goog_data

In [None]:
# Size of the data as a (number of rows, number of columns) tuple
goog_data.shape

In [None]:
# Columns: the column labels of the raw frame
goog_data.columns

**First look at the features**

Let's focus on the features **Rating, Category, Reviews, Installs, and Price** to see :
1. Which categories are rated best?
2. Do the categories affect the ratings?
3. Does price affect the ratings, reviews, or number of installs of an app?

In [None]:
# Types of each feature: the dtype pandas inferred for every column when reading the CSV
goog_data.dtypes

**Data Cleaning**

Before doing any EDA, it's good to check for any errors in the data, whether it be NaN entries or unexpected entries.

In [None]:
# Check for duplicates: count rows that exactly repeat an earlier row
# (keep='first' leaves the first occurrence unflagged)
n_duplicates = goog_data.duplicated(keep='first').sum()
print('The number of duplicated apps are {:n}'.format(n_duplicates))

In [None]:
# What is the duplicate: names of the apps whose rows exactly repeat an earlier row
dup_app = goog_data.loc[goog_data.duplicated(keep='first'), 'App Name']
if dup_app.empty:
    # Without this guard, .iloc[0] raises IndexError on a duplicate-free dataset
    print('There are no duplicate apps')
elif len(dup_app) == 1:
    print('The duplicate app is {}'.format(dup_app.iloc[0]))
else:
    # Report every duplicate instead of silently showing only the first one
    print('The duplicate apps are {}'.format(', '.join(map(str, dup_app))))

In [None]:
# Remove duplicates from the data, keeping the first occurrence of each row.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index directly,
# instead of materialising it as an 'index' column and dropping that column afterwards.
g_data = goog_data.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
# Check: number of duplicated rows left in the de-duplicated frame
g_data.duplicated().sum()

Okay, there are no longer any duplicate apps in the dataset. Now let's check the categories of apps in the dataset

In [None]:
# Categories: number of apps per category value
g_data['Category'].value_counts()

It looks like there are **3 categories** that seem odd: **Channel 2 News, ), and Podcasts**.

Let's analyze the 3 data points corresponding to the 3 odd categories

In [None]:
# Find the 3 odd data points
# (the leading spaces in two of the labels are part of the matched values)
odd_cat = [' Channel 2 News', ')',' Podcasts']
test = g_data['Category'].isin(odd_cat)
# Positional row numbers of the matching rows. They are consumed by .iloc below and
# in the correction cell, so positions (not index labels) are what is needed here.
odd_cat_ind = np.flatnonzero(test.to_numpy()).tolist()
g_data.iloc[odd_cat_ind]

It seems that the inputs are shifted by a number of columns

For the first app (index : 6941), the *Installs* column should be in the *Category* column, the *Size* column should've been in the *Rating* column and so forth. 

For the second and third app, the *Rating* column should be in the *Category* column, the *Reviews* column should be in the *Rating*, so on and so forth.

So, let's correct that, and make the remaining columns NaN

In [None]:
# Correcting the 3 false data points.
# Each odd row has its values displaced to the right: the first row by 3 columns and
# the other two by 1. For every row, the displaced values are copied back starting at
# column position 1 and the now-vacated trailing columns are filled with NaN.
# NOTE(review): values are only read back up to column position 9, so anything held
# in position 10 (if the frame has such a column) is not recovered — confirm intended.
source_end = 10
for slot, offset in enumerate((3, 1, 1)):
    row = odd_cat_ind[slot]
    # Read the displaced values first, before any cell of this row is overwritten
    realigned = list(g_data.iloc[row, 1 + offset:source_end])
    g_data.iloc[row, 1:source_end - offset] = realigned
    g_data.iloc[row, source_end - offset:] = np.nan
g_data.iloc[odd_cat_ind]

Now let's take a look at the *price* feature

In [None]:
# Price values and the fraction of apps at each one (normalize=True gives proportions, not counts)
g_data['Price'].value_counts(normalize=True)

Since free apps (price of USD 0) make up 93.59% of the apps in the dataset, a reasonable way of looking at the effect of price is to divide the apps into 2 groups, **Free** and **Paid**.

In [None]:
# Append a new feature (Free/Paid) to the dataset
# The dtype guard keeps this cell re-runnable: once Price is numeric the parsing is skipped.
if g_data['Price'].dtype == 'object' :
    # Strip the '$' sign only from string entries; non-string values (e.g. NaN) are passed
    # through unchanged instead of raising AttributeError on .strip
    g_data['Price'] = (
        g_data['Price']
        .map(lambda x : x.strip('$') if isinstance(x, str) else x)
        .astype(float)
    )
# A price of exactly 0 is 'Free'; anything else is 'Paid'.
# NOTE(review): a missing (NaN) price is therefore labelled 'Paid', as before — confirm intended.
free_paid = np.where(g_data['Price'] == 0, 'Free', 'Paid')
# Build the Series on g_data's own index so the assignment cannot misalign rows
# even if the frame's index is not a clean 0..n-1 range
free_paid_ser = pd.Series(free_paid, index=g_data.index, name='Free/Paid')
g_data['Free/Paid'] = free_paid_ser
g_data

As we are not going to focus on the features **Last Updated, Minimum Version, and Latest Version**, we'll drop them from our dataset.

In [None]:
# Remove the three columns this analysis does not use
unused_columns = ['Last Updated', 'Minimum Version', 'Latest Version']
g_data = g_data.drop(columns=unused_columns)

In [None]:
# Cast 'Rating' to float and 'Reviews' to int so they can be used numerically
for column, target_type in (('Rating', float), ('Reviews', int)):
    g_data[column] = g_data[column].astype(target_type)

Let's take a look at the **Ratings** distribution

In [None]:
# Plot of the ratings: histogram + KDE of the ratings, overlaid with a fitted normal curve
plt.figure(figsize=(8,8))
plt.title('Ratings distribution')
# Attach the legend labels to the artists themselves. Passing a bare list of labels to
# plt.legend assigns them by artist order, which does not match the order in which
# distplot draws (histogram, then KDE, then the normal fit) and mislabels the curves.
sns.distplot(
    g_data['Rating'],
    kde=True,
    color='orange',
    fit=stats.norm,
    kde_kws={'label': 'Ratings'},
    fit_kws={'label': 'Normal Distribution'},
)
plt.legend()

In [None]:
# Normality check for the ratings distribution
ratings = g_data['Rating']
# Standardise (subtract the mean, divide by the standard deviation) so the sample
# can be compared against the standard normal distribution
norm_rating = (ratings - ratings.mean()) / ratings.std()
ks_result = stats.kstest(norm_rating, 'norm', N=len(norm_rating))
print('The p-value for Kolmogorov-Smirnov Test is {}'.format(ks_result.pvalue))

**Conclusion** : Ratings are not normally distributed, this may be attributed to the negative skew and high kurtosis

It can also be seen from the data that there is a significant number of low ratings, dragging the mean below the median.

In [None]:
# Check if the data fits a lognormal distribution: if it did, the histogram of
# log(ratings) would follow the fitted normal curve
log_ratings = np.log(ratings)
sns.distplot(log_ratings, kde=False, fit=stats.norm)

Our data doesn't seem to follow a lognormal distribution either. This limits us, as we cannot use parametric methods for statistical inference on the data. We can still look at some trends in the data, though.

In [None]:
# Drop apps with < 1000 reviews to see whether that affects the normality of the ratings
has_enough_reviews = g_data['Reviews'] >= 1000
# Positional row numbers of the kept apps (kept for any later cell that refers to them)
more_1000_reviews_ind = np.flatnonzero(has_enough_reviews.to_numpy()).tolist()
# Select with the boolean mask rather than feeding positions to label-based .loc,
# which is only correct while the index happens to be 0..n-1
data = g_data.loc[has_enough_reviews].reset_index(drop=True)
# The filter keeps apps with exactly 1000 reviews too, so the title says '>=', not '>'
plt.title('Ratings Distribution (>= 1000 Reviews)')
sns.distplot(data['Rating'],fit=stats.norm,color='orange')

That looks much better compared to the distribution before, though it still doesn't approximately follow the normal distribution.

We will focus on this dataset, as it filters out apps with a low number of reviews (we assume that the number of reviews is the number of people who rated the app).

Now let's take a look at the **different categories and their ratings**

In [None]:
# Categories: number of apps per category in the review-filtered dataset
data['Category'].value_counts()

There are a lot of **game categories** that are divided into several subgroups. To simplify, we'll group all the game categories into one **GAME** category

In [None]:
# Change all game categories into 'GAME'
# na=False makes a missing Category count as "not a game"; otherwise str.contains returns
# NaN for it, and NaN is truthy, so the row would be wrongly relabelled as 'GAME'.
# regex=False matches the plain substring.
is_game = data['Category'].str.contains('GAME', regex=False, na=False)
# Positional row numbers of the game apps (kept for any later cell that refers to them)
game_ind = np.flatnonzero(is_game.to_numpy()).tolist()
# Work on a copy so `data` keeps the original, fine-grained categories
data_2 = data.copy()
data_2.loc[is_game,'Category'] = 'GAME'

In [None]:
# Boxplot of the ratings for each category (showmeans=True adds a marker at each mean)
plt.figure(figsize=(13,8));
plt.title('Boxplot of the Ratings of each Category');
# Take both axes from data_2 (the frame with the merged GAME category) rather than
# pairing data_2's categories with ratings pulled from a different frame
sns.boxplot(x='Category',y='Rating',data=data_2,showmeans=True)
plt.xticks(rotation=90);

It can be seen from the boxplots that, for almost all the categories, the rating distribution is left-skewed. Each category has a lot of outliers, namely apps rated much lower than the median rating, which results in the left-skewed distribution.

Seeing from the boxplots, it seems that some categories rate higher than others (on average). Although we can't say it with some level of certainty (because the rating distribution is not normal), we can look at some tendencies from the categories.

In [None]:
# Average rating per category, ordered from the highest mean to the lowest
group_cat = data_2.groupby('Category')
mean_rating_per_cat = group_cat['Rating'].mean()
sorted_rating_by_cat = mean_rating_per_cat.sort_values(ascending=False)
sorted_rating_by_cat

In [None]:
# First five (highest mean rating) and last five (lowest mean rating) category labels
top_5_cat = sorted_rating_by_cat.index[:5]
bot_5_cat = sorted_rating_by_cat.index[-5:]
top_message = 'The top 5 rated categories are {},{},{},{},and {}'
bottom_message = 'The bottom 5 rated categories are {},{},{},{}, and {}'
print(top_message.format(*top_5_cat))
print(bottom_message.format(*bot_5_cat))

In [None]:
# Making a dataset consisting of only apps from the top 5 and bottom 5 categories
in_top_5 = data_2['Category'].isin(top_5_cat)
in_bot_5 = data_2['Category'].isin(bot_5_cat)
# Positional row numbers of the selected apps (kept for any later cell that refers to them)
top_5_cat_ind = np.flatnonzero(in_top_5.to_numpy()).tolist()
bot_5_cat_ind = np.flatnonzero(in_bot_5.to_numpy()).tolist()
top_5_cat_data = data_2.loc[in_top_5].reset_index(drop=True)
bot_5_cat_data = data_2.loc[in_bot_5].reset_index(drop=True)
# ignore_index=True renumbers the stacked rows 0..n-1; without it both halves keep their
# own 0-based labels and the combined frame ends up with duplicate index labels
top_bot_cat_data = pd.concat([top_5_cat_data,bot_5_cat_data],axis=0,ignore_index=True)
top_bot_cat_data

In [None]:
# Rating boxplots restricted to the top 5 and bottom 5 categories (mean markers shown)
plt.figure(figsize=(13, 8))
sns.boxplot(data=top_bot_cat_data, x='Category', y='Rating', showmeans=True)
plt.title("Boxplots of Top 5 and Bottom 5 Categories' Ratings")
# Rotate the category labels so they do not overlap; trailing ';' hides the text repr
plt.xticks(rotation=90);

Even though there appear to be some differences between the ratings of the top 5 and bottom 5 categories, they don't seem to differ by much.

Now let's take a look at the different ratings the apps get depending on the price of the app (Are priced apps rated differently from free apps?)

In [None]:
# Split the apps into Free / Paid groups and report each group's mean rating
group_price = data_2.groupby('Free/Paid')
mean_rating_by_price = group_price['Rating'].mean()
print('The average rating for the Free apps are {}.'.format(mean_rating_by_price.loc['Free']))
print('The average rating for the Paid For apps are {}'.format(mean_rating_by_price.loc['Paid']))

In [None]:
# Side-by-side rating boxplots for Free and Paid apps (mean markers shown)
plt.figure(figsize=(8, 5))
plt.gca().set_title('Boxplot of Ratings of Free and Paid for Apps')
sns.boxplot(data=data_2, y='Rating', x='Free/Paid', showmeans=True)

It seems that the **Paid For apps have a higher average rating than the Free apps**.

This could be attributed to the larger number of people who rated the Free apps as opposed to the Paid ones. The larger number of raters might account for the bigger spread of the ratings, as more people could have given the apps a low rating.