In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.figure_factory as ff
# Use cufflinks in offline mode
import cufflinks
cufflinks.go_offline()

# Configure the global cufflinks theme
cufflinks.set_config_file(world_readable=True, theme='pearl', offline=True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import data

In [None]:
# Load the Windows Store apps CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/windows-store/msft.csv')

### Looking at the data

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

### Getting some useful information about dataset

In [None]:
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
data.info()
# Summary statistics of the numeric columns.
print(data.describe())

## DATA PREPARATION

### Checking missing values

In [None]:
# Number of missing values per column (isna is the same check as isnull).
data.isna().sum()

### We have only 1 row with missing values, so we can drop it without any doubt

In [None]:
# Drop the rows with missing values. The explicit copy makes `data` an independent
# frame, so the column assignments in the following cells cannot trigger
# SettingWithCopyWarning.
data = data.dropna().copy()

### Making Date column as datetime format

In [None]:
# Convert the Date column to pandas datetimes so the .dt accessor can be used later.
data = data.assign(Date=pd.to_datetime(data['Date']))

### Taking a look at the data

In [None]:
# Print the first and the last seven rows of the frame.
for preview in (data.head(7), data.tail(7)):
    print(preview)

### The Price column contains "Free" values and prices in Indian rupees, so I decided to convert the prices to dollars, but first I did some preparation on this column

In [None]:
# Fixed INR -> USD exchange rate applied to every paid price in this notebook.
INR_TO_USD = 0.0133681


def _price_to_usd(raw):
    """Convert one raw Price cell to US dollars.

    Parameters
    ----------
    raw : object
        A price string in rupees (for example '₹ 1,234.00'), the literal
        'Free', or a value that has already been converted.

    Returns
    -------
    object
        'Free' for free apps, otherwise the price as a float in USD.
        Non-string values are returned unchanged, so re-running the cell
        does not convert the column a second time.
    """
    if not isinstance(raw, str):
        return raw  # already converted on a previous run of this cell
    text = raw.replace('₹', '').strip()  # delete the money sign and padding
    if text == 'Free':
        return text
    # Strip thousands separators, parse as float, then convert INR to USD.
    return float(text.replace(',', '')) * INR_TO_USD


# The column deliberately stays mixed ('Free' strings and float prices):
# later cells compare it against 'Free'.
data['Price'] = data['Price'].apply(_price_to_usd)

### The data is ready for analysis

In [None]:
# Display the frame after the Price conversion above.
data

## EDA

### Taking 'year' and 'month' column to see distributions

In [None]:
# Derive calendar year and month from the Date column as two new columns.
data = data.assign(
    year=data['Date'].dt.year,
    month=data['Date'].dt.month,
)

### See how amount of apps was changing through the years

In [None]:
# Histogram of apps per year.
px.histogram(
    data_frame=data,
    x='year',
    title='Amount of apps through the years',
)

### See how amount of people rated was changing through the years

In [None]:
# Total number of people who rated, summed per year.
ratings_per_year = data.groupby('year')['No of people Rated'].sum().reset_index()
fig = px.line(
    ratings_per_year,
    x='year',
    y='No of people Rated',
    title='Amount of people rated throught the years',
)
fig.show()

In [None]:
# Date-indexed version of the frame; used below for the per-year category counts.
data1 = data.set_index('Date')

### See how many apps we have in each category

In [None]:
# Count apps per category. The label and count columns are named explicitly because
# the column names produced by value_counts().reset_index() differ between pandas
# 1.x ('index' / 'Category') and pandas 2.x ('Category' / 'count').
category_counts = (
    data.Category.value_counts()
    .rename_axis('Category')
    .reset_index(name='amount')
)
px.pie(category_counts, values='amount', names='Category',
      title = 'Amount of apps by Category')

### See how amount of apps by every category was changing through the years. Peak activity was in 2016, then decrease is started.

In [None]:
# Select the needed columns by label instead of by position, so the cell keeps
# working if the column order of the frame changes.
category = data1[['Category', 'year', 'month']]
# Number of apps for every (year, Category) pair, stored in a single 'amount' column.
categ = category.groupby(['year', 'Category']).size().to_frame('amount')

In [None]:
# Line chart of apps per category over the years. reset_index() turns the
# (year, Category) index into ordinary columns that plotly can address by name.
fig = px.line(categ.reset_index(), x="year", y="amount", color='Category',
             title='Amount of Category apps through the years')
fig.show()

### See Rating Distribution. It is not normal

In [None]:
# Distribution plot (histogram + density curve) of the Rating column.
fig = ff.create_distplot([data.Rating.values], ['Rating distribution'])
fig.show()


### See amount of people rated distribution

In [None]:
# Distribution plot of how many people rated each app.
fig = ff.create_distplot(
    [data['No of people Rated'].values],
    ['No of people rated distribution'],
)
fig.show()


### See amount of each rate

In [None]:
# Number of apps for each distinct rating value, as columns 'rate' and 'amount'.
rate = (
    data.groupby('Rating')
    .size()
    .reset_index(name='amount')
    .rename(columns={'Rating': 'rate'})
)

px.pie(rate, values='amount', names='rate', title='Amount of each rate')

### See mean rating for each category. The best is "Government and Politics" and the worst is "Multimedia Design"

In [None]:
# Mean rating per category, coloured by the rating value itself.
mean_rating_by_category = data.groupby('Category').Rating.mean().reset_index()
px.bar(
    mean_rating_by_category,
    x='Category',
    y='Rating',
    color='Rating',
    title='Mean rating by category',
)

In [None]:
# Label every app as 'Free' or 'Paid' depending on its Price value.
data['Price2'] = data.Price.eq('Free').map({True: 'Free', False: 'Paid'})

### Amount of free and paid apps.

In [None]:
# Count free vs. paid apps. Columns are named explicitly because the output of
# value_counts().reset_index() is named differently in pandas 1.x and 2.x.
price_type_counts = (
    data.Price2.value_counts()
    .rename_axis('Price2')
    .reset_index(name='amount')
)
px.pie(price_type_counts, values='amount', names='Price2', title='Amount of free and paid apps')

### Amount of free apps by category. The majority is Music and the minority is Government and Politics.

In [None]:
# Count free apps per category. Columns are named explicitly because the output of
# value_counts().reset_index() is named differently in pandas 1.x and 2.x.
free_category_counts = (
    data[data.Price == 'Free'].Category.value_counts()
    .rename_axis('Category')
    .reset_index(name='amount')
)
px.pie(free_category_counts, values='amount', names='Category',
      title = 'Amount of free apps by Category')

### Amount of paid apps by category. Here we have only 3: Books, Business and Developer Tools.

In [None]:
# Count paid apps per category. Columns are named explicitly because the output of
# value_counts().reset_index() is named differently in pandas 1.x and 2.x.
paid_category_counts = (
    data[data.Price2 == 'Paid'].Category.value_counts()
    .rename_axis('Category')
    .reset_index(name='amount')
)
px.pie(paid_category_counts, values='amount', names='Category',
      title = 'Amount of paid apps by Category')

### How amount of paid apps was changing through the years

In [None]:
# Number of paid apps per year. The count column gets a real name so that the
# y axis is labelled 'amount' instead of the integer column label 0.
paid_per_year = data[data.Price2 == 'Paid'].groupby('year').size().reset_index(name='amount')
px.line(paid_per_year, x='year', y='amount', title = "Amount of paid apps thtough the years")

### Mean rating for each category in paid apps.

In [None]:
# Mean rating per category, restricted to paid apps.
paid_mean_rating = (
    data.loc[data.Price2 == 'Paid']
    .groupby('Category')
    .Rating.mean()
    .reset_index()
)
px.bar(
    paid_mean_rating,
    x='Category',
    y='Rating',
    title='Mean rating for each category in paid apps',
)

In [None]:
# Independent copy of the paid apps. Without .copy() the column assignment below
# would write to a slice of `data` and raise SettingWithCopyWarning.
paid = data[data.Price != 'Free'].copy()
# Only numeric prices remain after the filter, so the column can become float.
paid['Price'] = paid['Price'].astype('float')

### What is mean price for paid apps?

In [None]:
# Use the numeric `paid` frame built in the previous cell instead of re-filtering
# the mixed 'Free'/float Price column of `data`.
print('Mean price of paid apps is {} $'.format(paid.Price.mean()))

### Mean price for each category in paid apps

In [None]:
# Mean price per category among paid apps: printed as a table, then drawn as bars.
mean_price_by_category = paid.groupby('Category').Price.mean()
print(mean_price_by_category)
px.bar(
    mean_price_by_category.reset_index(),
    x='Category',
    y='Price',
    title='Mean price for each category',
)

### See TOP 20 FREE APPS. Criteria: the number of ratings is more than 1.5 times the mean number of ratings, and the rating is higher than 4

In [None]:
# Free apps with more than 1.5x the mean number of ratings and a rating above 4.
rated_count = data['No of people Rated']
top_free = data[
    (data.Price == 'Free')
    & (rated_count > rated_count.mean() * 1.5)
    & (data.Rating > 4)
]

In [None]:
# Keep the 20 best candidates: highest rating first, ties broken by number of ratings.
top_free = (
    top_free
    .sort_values(by=['Rating', 'No of people Rated'], ascending=False)
    .head(20)
)
top_free

In [None]:
# Share of ratings among the top 20 free apps. names='Name' labels each sector with
# the app name; without it the sectors carry no app label.
px.pie(top_free, values='No of people Rated', names='Name', hover_data=['Category'],
       title='TOP 20 FREE APPS')

### Category distribution for TOP 20 FREE APPS

In [None]:
# How the top 20 free apps are spread across categories.
px.histogram(
    data_frame=top_free,
    x='Category',
    title='Category distribution among top 20 free apps',
)

### SEE TOP 20 PAID APPS. Criteria: the number of ratings is more than 0.7 times the mean number of ratings, and the rating is higher than 4.

In [None]:
# Paid apps with more than 0.7x the mean number of ratings and a rating above 4.
rated_count = data['No of people Rated']
top_paid = data[
    (data.Price != 'Free')
    & (rated_count > rated_count.mean() * 0.7)
    & (data.Rating > 4)
]

In [None]:
# Keep the 20 best candidates: highest rating first, ties broken by number of ratings.
top_paid = (
    top_paid
    .sort_values(by=['Rating', 'No of people Rated'], ascending=False)
    .head(20)
)
top_paid

In [None]:
# Share of ratings among the top 20 paid apps. names='Name' labels each sector with
# the app name; without it the sectors carry no app label.
px.pie(top_paid, values='No of people Rated', names='Name', hover_data=['Category'],
       title='TOP 20 PAID APPS')

### Category distribution for TOP 20 PAID APPS

In [None]:
# How the top 20 paid apps are spread across categories.
px.histogram(
    data_frame=top_paid,
    x='Category',
    title='Category distribution among top 20 paid apps',
)

### Mean price for each category in top 20 paid apps

In [None]:
# Build a new frame with a numeric Price column instead of assigning onto the
# head() slice, which would raise SettingWithCopyWarning.
top_paid = top_paid.assign(Price=pd.to_numeric(top_paid['Price']))
# Mean price per category among the top 20 paid apps (titled like the other charts).
px.bar(top_paid.groupby('Category').Price.mean().reset_index(), x='Category', y='Price',
       title='Mean price for each category in top 20 paid apps')

In [None]:
# Report the average price over the top 20 paid apps.
print(f'Mean price for TOP 20 PAID APPS is {top_paid.Price.mean()}')

## SUMMARY

### *Peak development for the Windows Store was in 2016; after that we see a decrease.*
### *The majority of apps have a rating between 3.5 and 5.*
### *There are 13 app categories. The best by mean rating is Government and Politics; the worst is Multimedia Design.*
### *The majority of the top 20 free apps are in the Developer Tools and Health and Lifestyle categories.*
### *Paid apps are in only 3 categories: Books, Developer Tools and Business. The best by rating is Developer Tools and the worst is Business.*
### *The mean price for paid apps is 4.95 dollars and the mean price for the top 20 paid apps is 3 dollars.*