In [None]:
#Preparing the data for this topic
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Load the Play Store games dataset; the first CSV column becomes the row index.
df = pd.read_csv('../input/top-play-store-games/android-games.csv', index_col=0)

# Two-column view (title + rating count) indexed by game title, so bar charts
# built from it are labelled with game names. `drop=False` keeps the 'Title'
# column alongside the index.
test = (
    df[['title', 'total ratings']]
    .rename(columns={'title': 'Title'})
    .set_index('Title', drop=False)
)

# Preview only; there is no need to render the whole frame.
test.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so each column of `df` becomes one row
df.describe().transpose()

In [None]:
# Number of missing values per column
df.isna().sum()

1. Top 10 games with the most ratings

In [None]:
# `df` is already loaded at the top of the notebook, so the CSV is not read again.
# Rank by rating count explicitly rather than relying on the row order of the
# file, so the table shows the ten games with the most ratings.
df.nlargest(10, 'total ratings')

In [None]:
# dtypes of every column whose dtype is not `object`, keyed by column name
numerical_columns = df.dtypes.loc[lambda dtypes: dtypes != 'object']

In [None]:
# dtypes of every `object`-typed column, keyed by column name
categorical_columns = df.dtypes.loc[lambda dtypes: dtypes == 'object']

In [None]:
# Select the ten largest rating counts explicitly; slicing the first ten rows
# is only correct if the file happens to be sorted by total ratings.
test['total ratings'].nlargest(10).plot(kind='bar')
plt.ylabel('Total Ratings in 10 Million')
plt.xlabel('Games')
plt.title('Top 10 Games with the Most Ratings')
plt.show()

2. The 10 games with the fewest ratings

In [None]:
# `df` is already loaded at the top of the notebook, so the CSV is not read again.
# Rank by rating count explicitly rather than taking the last ten rows of the
# file, so the table shows the ten games with the fewest ratings.
df.nsmallest(10, 'total ratings')

In [None]:
# Select the ten smallest rating counts explicitly; slicing the last ten rows
# is only correct if the file happens to be sorted by total ratings.
test['total ratings'].nsmallest(10).plot(kind='bar')
plt.ylabel('Total Ratings')
plt.xlabel('Games')
plt.title('Top 10 Games with the Least Ratings')
plt.show()

3. How many games are free?

In [None]:
# Number of games for each distinct value of the `paid` column
df['paid'].value_counts()

In [None]:
# Horizontal bar chart of how many games fall under each value of `paid`.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure()
CountStatus = df['paid'].value_counts().head(10)
CountStatus.plot.barh()
# The y axis shows the values of the `paid` column, not prices.
plt.ylabel('Paid')
plt.xlabel('Number of games')
plt.title('Free games vs. paid games')
plt.show()

In [None]:
# Horizontal bar chart of the ten most common price points.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure()
CountStatus = df['price'].value_counts().head(10)
CountStatus.plot.barh()
plt.ylabel('Price')
plt.xlabel('Number of games')
plt.title('Total Number of Games by Price')
plt.show()

In [None]:
# Rows whose price is anything other than 0
df.loc[df['price'].ne(0)]

4. Top 10 games by percentage of good ratings (games with over 1 million ratings)

In [None]:
# Bucket the star counts: 4 and 5 stars are "good"; 1, 2 and 3 stars are "bad".
df['good ratings'] = df['4 star ratings'].add(df['5 star ratings'])
df['bad ratings'] = (
    df['1 star ratings']
    .add(df['2 star ratings'])
    .add(df['3 star ratings'])
)

# Share of each bucket among all star ratings. Despite the column names these
# are fractions (good / (good + bad)), not values scaled to 0-100.
rated_total = df['good ratings'].add(df['bad ratings'])
df['good ratings percent'] = df['good ratings'].div(rated_total)
df['bad ratings percent'] = df['bad ratings'].div(rated_total)

df.head(2)

In [None]:
# Keep only games with more than one million ratings.
df_1m = df[df['total ratings'] > 1000000]

# Ten games with the highest 'good ratings percent', computed once and reused
# by the plotting cell through `x`.
x = (
    df_1m
    .sort_values('good ratings percent', ascending=False)
    .head(10)[['title', 'good ratings percent', 'total ratings']]
)

x

In [None]:
# Bar chart of the good-ratings share for the ten games selected in `x`.
x.plot(kind='bar', x='title', y='good ratings percent')
plt.ylabel('Good ratings percent')
plt.xlabel('Games')
plt.title('Top 10 games with the most good ratings percent with over 1 million ratings')
# Render the figure explicitly, as the other plotting cells do, so the cell
# does not echo the Text object returned by plt.title().
plt.show()

5. The 10 games with the lowest percentage of good ratings (games with over 1 million ratings)

In [None]:
# Keep only games with more than one million ratings.
df_1m = df[df['total ratings'] > 1000000]

# Ten games with the lowest 'good ratings percent', computed once and reused
# by the plotting cell through `t`.
t = (
    df_1m
    .sort_values('good ratings percent', ascending=True)
    .head(10)[['title', 'good ratings percent', 'total ratings']]
)

t

In [None]:
# Bar chart of the good-ratings share for the ten games selected in `t`.
t.plot(kind='bar', x='title', y='good ratings percent')
plt.ylabel('Good ratings percent')
plt.xlabel('Games')
plt.title('Top 10 games with the least good ratings percent with over 1 million ratings')
# Render the figure explicitly, as the other plotting cells do, so the cell
# does not echo the Text object returned by plt.title().
plt.show()

6. Top 17 game categories by total number of games

In [None]:
# Number of games in each category, most common first
df['category'].value_counts()

In [None]:
plt.figure()
CountStatus = pd.value_counts(df['category'].values, sort=True).head(10)
CountStatus.plot.barh()
plt.ylabel('Category')
plt.xlabel('Number of games')
plt.title('Total Number of Games')
plt.show()

In [None]:
# Pie chart of each category's share of the games.
category_counts = df['category'].value_counts()
n_categories = len(category_counts)

# The three largest slices are pulled out a little further than the rest.
# The offsets are built from the actual number of categories, because plt.pie
# requires `explode` to have exactly one entry per slice.
explode = ([0.15, 0.12, 0.1] + [0.08] * n_categories)[:n_categories]

plt.figure(figsize=(14, 7))
labels = category_counts.index
plt.pie(category_counts.values, labels=labels,
        explode=explode,
        autopct='%1.1f%%', startangle=90)
plt.title('Category Pie Chart', fontsize=20, pad=40)
plt.axis('equal')
plt.show()

7. How many installs does each game have?

In [None]:
# Preview the first rows, including the `installs` column used below
df.head()

In [None]:
# Lookup table from the textual install buckets to plain integers.
_THOUSAND = 1_000
_MILLION = 1_000_000

installs_range = {
    '100.0 k': 100 * _THOUSAND,
    '500.0 k': 500 * _THOUSAND,
    '1.0 M': 1 * _MILLION,
    '5.0 M': 5 * _MILLION,
    '10.0 M': 10 * _MILLION,
    '50.0 M': 50 * _MILLION,
    '100.0 M': 100 * _MILLION,
    '500.0 M': 500 * _MILLION,
    '1000.0 M': 1000 * _MILLION,
}

In [None]:
# Translate each textual install bucket into its numeric value via `installs_range`.
# NOTE(review): Series.map yields NaN for any label missing from the dict, so this
# assumes the nine keys cover every value in `installs`. TODO confirm against
# df['installs'].value_counts().
df['installs_num'] = df['installs'].map(installs_range)

In [None]:
# Number of games in each install bucket
df['installs'].value_counts()

In [None]:
plt.figure(figsize=(10, 10))

# One bar per install bucket, ordered from the largest bucket to the smallest.
ax = sns.countplot(x='installs', data=df,
                   order=['1000.0 M', '500.0 M', '100.0 M', '50.0 M', '10.0 M',
                          '5.0 M', '1.0 M', '500.0 k', '100.0 k'],
                   palette='GnBu_r')
plt.title('Installs', fontsize=20)

# Annotate every bar with its count and its share of all games.
total_games = len(df)
for patch in ax.patches:
    height = patch.get_height()
    # A bucket with no rows may be drawn with a NaN height (depends on the
    # seaborn version); there is nothing sensible to label in that case.
    if np.isnan(height):
        continue
    percent = height / total_games * 100
    # Bar heights come back as floats, so format the count without decimals
    # ("100", not "100.0"). The label sits centred, 5 units above the bar.
    ax.text(patch.get_x() + patch.get_width() / 2, height + 5,
            '{:.0f}\n({:.1f}%)'.format(height, percent), ha='center')

plt.show()

8. Average growth of each category over 30 and 60 days

In [None]:
#for growth in 1 month
growth_subset = df.groupby('category')[['growth (30 days)']].mean()\
.sort_values(by='growth (30 days)', ascending=False)

growth_subset

In [None]:
#for growth in 1 month
growth_subset.plot(kind='bar')
plt.ylabel('Growth in 30 days')
plt.xlabel('Categories')
plt.title('Growth of each category in 1 month')

In [None]:
#for growth in 2 months
growth_subset2 = df.groupby('category')[['growth (60 days)']].mean()\
.sort_values(by='growth (60 days)', ascending=False)

growth_subset2

In [None]:
#for growth in 2 months
growth_subset2.plot(kind='bar')
plt.ylabel('Growth in 60 days')
plt.xlabel('Categories')
plt.title('Growth of each category in 2 months')

9. Total number of installs for each game category

In [None]:
# Preview the frame, which now includes the numeric `installs_num` column
df.head()

In [None]:
# Sum of the numeric install buckets per category (a Series indexed by category)
category_groups = df.groupby('category')
temp_df = category_groups['installs_num'].sum()

In [None]:
# Move `category` out of the index into a regular column.
# NOTE: this rebinds `temp_df` from itself, so re-running only this cell applies
# reset_index a second time; re-run the groupby cell above first.
temp_df = temp_df.reset_index()

In [None]:
# Order the categories from most to fewest total installs
temp_df = temp_df.sort_values('installs_num', ascending=False)

In [None]:
# First 17 rows of the per-category install totals
temp_df.head(17)

In [None]:
# Horizontal bars: one per category, bar length = summed installs
sns.barplot(data=temp_df, x='installs_num', y='category')

In [None]:
# The 1000 rows with the highest installs, ties broken by higher average rating.
# NOTE: `temp_df` is reused here for a different table than the per-category
# totals above.
temp_df = df.sort_values(by = ['installs_num','average rating'], ascending = False).head(1000)

In [None]:
# Count how many of the selected rows belong to each category. After this step
# `installs_num` holds a row count per category, not an install figure.
temp_df = (
    temp_df
    .groupby('category')['installs_num']
    .count()
    .reset_index()
    .sort_values('installs_num', ascending=False)
)

In [None]:
# `installs_num` holds the number of games per category among the 1000 rows
# selected above. Categories have no natural order, so one bar per category is
# used instead of a line, which would connect them as if they were a series.
sns.barplot(x='installs_num', y='category', data=temp_df)
plt.xlabel('Number of games among the 1000 most-installed')
plt.ylabel('Category')
plt.title('Categories of the 1000 most-installed games')
plt.show()

10. Top 17 categories by total ratings

In [None]:
# Total number of ratings per category, sorted from fewest to most
temp_df = (
    df.groupby('category')['total ratings']
    .sum()
    .reset_index()
    .sort_values('total ratings')
)

In [None]:
# `temp_df` is sorted ascending, so `head(17)` would pick the 17 categories with
# the FEWEST ratings. Select the 17 largest totals instead (returned highest first).
temp_df.nlargest(17, 'total ratings')

In [None]:
# Draw the bars from most to fewest total ratings, so the chart reads in the
# same order as the table above and the installs-per-category chart.
sns.barplot(y='category', x='total ratings',
            data=temp_df.sort_values('total ratings', ascending=False))