In [None]:
# Imports and global plotting configuration for the notebook.
# This Python 3 environment comes with many helpful analytics libraries installed


import numpy as np 
import pandas as pd
import os

from matplotlib import pyplot as plt
# Use matplotlib's 'ggplot' style sheet for figures.
plt.style.use('ggplot')

import seaborn as sns # for making plots with seaborn
# NOTE(review): `color` is not referenced in the visible cells - confirm before removing.
color = sns.color_palette()
# Set the default figure size (width, height) through seaborn's rc settings.
sns.set(rc={'figure.figsize':(25,15)})

import plotly
import plotly.express as px
# connected=True means it will download the latest version of plotly javascript library.
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# NOTE(review): `ff` and `cf` are not referenced in the visible cells - confirm before removing.
import plotly.figure_factory as ff
import cufflinks as cf


import warnings
# Suppress all warnings; be aware this also hides numeric warnings such as log10(0).
warnings.filterwarnings('ignore')

In [None]:
# Reading the data
# Walk the Kaggle input directory, print every file found, and remember the
# CSV files so the one that gets loaded is chosen explicitly.
csv_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        if filename.lower().endswith('.csv'):
            csv_paths.append(path)

# Fail with a clear message instead of a NameError when nothing usable exists.
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# As before, the last file yielded by os.walk is loaded, but only CSV files
# are considered now.
# NOTE(review): when several CSVs are present the choice depends on walk
# order - consider pinning the exact file name.
fname = csv_paths[-1]
data = pd.read_csv(fname)


In [None]:
# Display the loaded DataFrame as this cell's output.
data

In [None]:
# Columns in data
# Only the last expression of a cell is displayed, so print the labels here.
print(data.columns.tolist())

# Duplicate data
# duplicated() marks every repeat of an earlier 'App' value; summing the
# boolean mask gives the number of duplicate rows. (The previous
# `.any().sum()` reduced the mask to a single True/False first, so it could
# only ever report 0 or 1.)
print(data.duplicated(subset='App').sum())
data = data.drop_duplicates(subset='App')
# After de-duplication this must be 0; shown as the cell output.
data.duplicated(subset='App').sum()


In [None]:
# DataFrame information
# info() prints its summary of the frame directly to the cell output.
data.info()

In [None]:
# Number of distinct values in every column, printed as "<column> <count>"

for column_name, unique_count in data.nunique().items():
    print(column_name, unique_count)
    


In [None]:
# Missing values

# Fill missing 'Type' entries with the most frequent value.
# `mode` is a method and must be called; it returns a Series (there can be
# ties), so the first entry is used. The previous code passed the bound
# method itself to fillna.
type_mode = data['Type'].mode()
if not type_mode.empty:
    data['Type'] = data['Type'].fillna(type_mode.iloc[0])

# Should display False once the gaps are filled.
data['Type'].isna().any()

In [None]:
# Data cleaning
# Every step goes through .astype(str) and the vectorized .str accessor so
# that missing values pass through as NaN (instead of crashing on
# float.strip) and the cell can be re-run on already-cleaned columns.

# Installs: remove the '+' sign and ',' separators, then convert to numeric.
installs_text = data['Installs'].astype(str).str.strip('+').str.replace(',', '', regex=False)
data['Installs'] = pd.to_numeric(installs_text)

# Price: remove the '$' sign, then convert to numeric.
data['Price'] = pd.to_numeric(data['Price'].astype(str).str.strip('$'))

# Size: convert everything to megabytes.
# Commas are stripped up front; the previous code only removed them when the
# value still contained 'M', which was never true because 'M' had already
# been removed one line earlier.
size_text = data['Size'].astype(str).str.replace(',', '', regex=False).str.strip()
# 'Varies with device' means the size is unknown: keep it as NaN rather than
# 0 MB so it does not distort size statistics and histograms.
size_text = size_text.mask(size_text == 'Varies with device')
# Remember which rows were given in kilobytes before dropping the unit suffix.
size_in_kb = size_text.str.endswith('k', na=False)
size_value = pd.to_numeric(size_text.str.rstrip('Mk'))
# Kilobyte values are divided by 1000 (same factor as before); others are kept.
data['Size'] = size_value.where(~size_in_kb, size_value / 1000)


# Convert to date for time series analysis
data['Last Updated'] = pd.to_datetime(data['Last Updated'])


In [None]:
# Show the DataFrame's column labels as this cell's output.
data.columns

In [None]:
# Market segmentation: how many apps fall into each category

mark_seg = data.Category.value_counts()

# One pie slice per category, sized by its number of apps.
fig = px.pie(
    names=mark_seg.index,
    values=mark_seg,
    title='Market Segmentaion of Categories',
)
fig.show()

Here we can conclude that most applications are made for the FAMILY category and the fewest for BEAUTY.

In [None]:
# Which 'Type' values occur among the 100 apps with the most installs
top1 = data.sort_values(by='Installs', ascending=False).head(100)
top1['Type'].unique()



In [None]:
# Category breakdown of the 100 apps with the most installs
most_installed = data.sort_values(by='Installs', ascending=False).iloc[:100]
top = most_installed['Category'].value_counts()


# One pie slice per category that appears in the top 100.
fig = px.pie(
    names=top.index,
    values=top,
    title='Category of Top Apps',
)
fig.show()


Here we can say that COMMUNICATION is the best-performing category, although the largest number of apps comes from FAMILY.
We can also see that no app from the SPORTS category is among the most downloaded.

One-way ANOVA: App Ratings Across Categories

In [None]:
# One-way ANOVA of app ratings across a fixed set of categories

import scipy.stats as stats

# Categories whose rating distributions are compared, in the order they are
# passed to the test.
anova_categories = [
    'BUSINESS', 'FAMILY', 'GAME', 'PERSONALIZATION', 'LIFESTYLE',
    'FINANCE', 'EDUCATION', 'MEDICAL', 'TOOLS', 'PRODUCTIVITY',
]
# One sample of non-missing ratings per category.
rating_samples = [
    data.loc[data.Category == category_name]['Rating'].dropna()
    for category_name in anova_categories
]
f = stats.f_oneway(*rating_samples)

print(f)

# Rating histograms, one panel per category that has more than 286 rows.
groups = data.groupby('Category').filter(lambda category_rows: len(category_rows) > 286).reset_index()
array = groups['Rating'].hist(by=groups['Category'], sharex=True, figsize=(20,20))


In [None]:
# One-way ANOVA of app size across a fixed set of categories

import scipy.stats as stats

# Categories whose size distributions are compared, in the order they are
# passed to the test.
anova_categories = [
    'BUSINESS', 'FAMILY', 'GAME', 'PERSONALIZATION', 'LIFESTYLE',
    'FINANCE', 'EDUCATION', 'MEDICAL', 'TOOLS', 'PRODUCTIVITY',
]
# One sample of non-missing sizes per category.
size_samples = [
    data.loc[data.Category == category_name]['Size'].dropna()
    for category_name in anova_categories
]
f = stats.f_oneway(*size_samples)

print(f)

# Size histograms, one panel per category that has more than 286 rows.
groups = data.groupby('Category').filter(lambda category_rows: len(category_rows) > 286).reset_index()
array = groups['Size'].hist(by=groups['Category'], sharex=True, figsize=(20,20))


Here we can say that a large number of apps are between 0 and 20 MB in size.

In [None]:
# App pricing trend: price of every app, one row of points per category
priced_categories = [
    'GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL',
    'TOOLS', 'FINANCE', 'LIFESTYLE', 'BUSINESS',
]
subset = data[data['Category'].isin(priced_categories)]

sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
# Drawn on the axes created above (they are the current axes at this point).
p = sns.stripplot(data=subset, x="Price", y="Category", jitter=True, linewidth=1)
title = ax.set_title('App pricing trend across categories')


Some apps are priced between \$80 and \$400.

Medical and Lifestyle apps are the most expensive. Some medical apps cost as much as \$80.
All other apps are priced under \$50.
Surprisingly, all game apps are priced below \$20.

> **Are paid apps downloaded as much as free apps?**

In [None]:
# Box plots comparing the number of installs (log10 scale) of paid and free apps.

# log10(0) is -inf, which cannot be placed on the axis; warnings are
# suppressed in this notebook, so keep only apps with at least one install
# before taking the logarithm.
installed = data[data['Installs'] > 0]

trace0 = go.Box(
    y=np.log10(installed.loc[installed['Type'] == 'Paid', 'Installs']),
    name = 'Paid',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    )

)
trace1 = go.Box(
    y=np.log10(installed.loc[installed['Type'] == 'Free', 'Installs']),
    name = 'Free',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    )
)
layout = go.Layout(
    title = "Number of downloads of paid apps Vs free apps",
    yaxis= {'title': 'Number of downloads (log-scaled)'}
)
graph = [trace0, trace1]
plotly.offline.iplot({'data': graph, 'layout': layout})

In [None]:
# Rating versus size for paid apps
is_paid = data['Type'] == 'Paid'
paid_df = data[is_paid]
fig = px.scatter(
    paid_df,
    x="Rating",
    y="Size",
    title="Paid app Ratings vs Size",
)
fig.show()

* Here we can see that most paid apps have good ratings.
* This means that most paid apps are designed and developed to cater to specific functionalities and hence are not bulky.

* Here we can see that most apps have a size between 1 and 60 MB.
* Users prefer to pay for apps that are light-weighted. A paid app that is bulky may not perform well in the market.

In [None]:
# Free app rating vs size
free_df = data[data['Type'] == 'Free']
# Plot the free apps selected above. The previous code passed `paid_df`, so
# this chart repeated the paid-app scatter under a "Free app" title.
fig = px.scatter(free_df, x="Rating", y="Size", title="Free app Ratings vs Size")
fig.show()


In [None]:
# Android version
# Group by 'Android Ver'; after count() the 'App' column holds the number of
# non-missing app names in each group.
Android_version = data.groupby(by='Android Ver').count()

fig = px.bar(
    x=Android_version.index,
    y=Android_version['App'],
    title="Most Application Running on Android Version",
)
fig.show()

Here, many applications are running on Android version 4.1.

In [None]:
# Last updated
# Group by the 'Last Updated' date; after count() the 'App' column holds the
# number of non-missing app names for each date.

Updated_data = data.groupby(by='Last Updated').count()

fig = px.line(
    x=Updated_data.index,
    y=Updated_data['App'],
    title="When the App updated Last",
)
fig.show()

There are many apps that have not been updated in the last 4 to 5 years.

In [None]:
# Content rating
# Group by 'Content Rating'; after count() the 'App' column holds the number
# of non-missing app names per rating.
Updated_data = data.groupby(by='Content Rating').count()
content_funnel = go.Funnelarea(
    text=Updated_data.index,
    values=Updated_data['App'],
)
fig = go.Figure(content_funnel)
fig.show()


There are 7903 apps that can be used by everyone,
while 393 apps are only for people aged 17 and over.

In [None]:
# Number of paid apps in each category
paid_apps = data[data['Type'] == "Paid"]
# After count() the 'App' column holds the number of paid apps per category.
# This is a count of apps, not of installs.
paid_data = paid_apps.groupby("Category").count()

# The title and axis labels state what is plotted: a count of paid apps.
# The previous title spoke of downloads, which this chart does not measure.
fig = px.bar(
    x=paid_data['App'],
    y=paid_data.index,
    title="Number of paid apps per Category",
    labels={'x': 'Number of paid apps', 'y': 'Category'},
)
fig.show()

The FAMILY category has the largest number of paid apps (the chart counts apps per category, not installs).