In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing visualization libraries**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

**Importing the dataset**

In [None]:
# Read the store listing CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/windows-store/msft.csv')

**First, we are taking a quick look in the data, in order to have an idea of what the columns titles are and what these data represent.**

In [None]:
# Preview the first rows to see the column names and some sample values.
df.head()

In [None]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

*The first thing we can notice is that there are null values in this dataset: more specifically, one missing element in each of the columns 'Name', 'Rating', 'Date' and 'Price'. Therefore, we will have to find those elements.*

In [None]:
# Show only the rows that contain at least one missing value. A full boolean
# table of the whole frame makes the reader hunt for the few True cells;
# filtering on the row-wise null mask points directly at the affected rows.
df[df.isnull().any(axis=1)]

*From the above table, we can deduce that the null elements can all be found in the last row of the dataset. Therefore, since we do not have much information regarding that item and since it is only one out of 5322 total rows, we can drop this row entirely.*

In [None]:
# Drop the incomplete row by its null-ness rather than by a hard-coded index
# label: `df.drop([5321])` raises KeyError when the cell is re-run (the label
# is already gone) and silently targets the wrong row if the CSV changes.
# `dropna` is idempotent and removes any row missing one of these fields.
df = df.dropna(subset=['Rating', 'Date', 'Price'])

In [None]:
# Check the last rows to confirm the incomplete entry has been removed.
df.tail()

*Another thing that can be noticed is that some apps can be downloaded and used for free, while others cannot. Therefore, we can create two separate datasets, one containing the free apps and one containing the rest.*

In [None]:
# Split the listings on the 'Price' label. `.copy()` makes each subset an
# independent frame, so the column assignments performed later in the notebook
# neither raise SettingWithCopyWarning nor depend on whether pandas returned a
# view or a copy of `df`.
is_free = df['Price'] == 'Free'
df_free_apps = df[is_free].copy()
df_no_free_apps = df[~is_free].copy()

In [None]:
# Number of rows in the free-apps subset.
df_free_apps.shape[0]

In [None]:
# Number of rows in the non-free-apps subset.
df_no_free_apps.shape[0]

*It can be observed that there are many more free apps than non-free apps. Let's have a brief look at these datasets.*

In [None]:
# Display the free-apps subset (pandas truncates the output for long frames).
df_free_apps

In [None]:
# Display the non-free subset; the index column shows where these rows sat in `df`.
df_no_free_apps

*It can be noticed that those apps that come at a certain cost were stacked at the bottom 158 rows of the initial dataset. Therefore, we can reset the index of that new dataset, in order to facilitate our later work.*

In [None]:
# Renumber the non-free rows from zero. The previous row labels are kept in a
# new 'index' column (drop=False is the default, spelled out here).
df_no_free_apps = df_no_free_apps.reset_index(drop=False)
df_no_free_apps

*With this issue fixed, another issue is the fact that the 'Price' column consists of elements of type object (strings). Therefore, this does not allow us at the moment to perform visualizations. First, we need to make sure that all of the values can be converted to floats, and thus we need to remove the thousands separators (commas) and the leading currency prefix.*

In [None]:
# Remove thousands separators from the price strings in one vectorised pass.
# The original row-by-row `df['Price'][i] = ...` is chained-indexing
# assignment: it raises SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, does not modify the frame at all.
df_no_free_apps['Price'] = df_no_free_apps['Price'].str.replace(',', '', regex=False)

In [None]:
# Drop the two-character prefix in front of the amount and keep everything
# after it. The original slice `[2:6]` kept only four characters, which
# silently truncates longer amounts (for example a five-digit price loses its
# last digit, and a three-digit price loses its decimals). The vectorised
# `.str` accessor also avoids chained-indexing assignment inside a Python loop.
# NOTE(review): assumes, like the original, that every price starts with a
# two-character currency prefix - confirm against the raw data.
df_no_free_apps['Price'] = df_no_free_apps['Price'].str.slice(start=2)

In [None]:
# Cast the cleaned price strings to floating-point numbers.
df_no_free_apps = df_no_free_apps.astype({'Price': float})

In [None]:
# Verify that 'Price' is now a float column.
df_no_free_apps.info()

*Thanks to the process above, we successfully converted the elements of 'Price' column to float objects. The next step is to convert the 'Date' elements to 'datetime' objects. The same applies for the dataset with the free apps data.*

In [None]:
# Remove the helper 'index' column created by reset_index().
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_no_free_apps = df_no_free_apps.drop(columns=['index'], errors='ignore')
df_no_free_apps

In [None]:
# Parse the 'Date' strings into datetime64 values for both subsets.
# `assign` returns a new frame instead of writing a column into
# `df_free_apps`, which was created as a filtered slice of `df`; this avoids
# SettingWithCopyWarning and makes the result independent of view/copy rules.
# NOTE(review): pd.to_datetime infers the format. If the CSV stores dates
# day-first (DD-MM-YYYY), pass dayfirst=True or an explicit format=, otherwise
# day and month can be swapped - confirm against the raw file.
df_no_free_apps = df_no_free_apps.assign(Date=pd.to_datetime(df_no_free_apps['Date']))
df_free_apps = df_free_apps.assign(Date=pd.to_datetime(df_free_apps['Date']))

In [None]:
# Confirm the dtype of 'Date' in the non-free subset after conversion.
df_no_free_apps.info()

In [None]:
# Confirm the dtype of 'Date' in the free subset after conversion.
df_free_apps.info()

*From the 'Date' column, we can now extract the month and the year elements, which can then be used in our EDA.*

In [None]:
# Derive the calendar month (1-12) from the parsed 'Date' column.
df_no_free_apps = df_no_free_apps.assign(Month=df_no_free_apps['Date'].dt.month)

In [None]:
# Derive the calendar month (1-12) from the parsed 'Date' column.
# `assign` builds a new frame rather than inserting a column into
# `df_free_apps`, which originates from a filtered slice of `df`, so no
# SettingWithCopyWarning is raised.
df_free_apps = df_free_apps.assign(Month=df_free_apps['Date'].dt.month)

In [None]:
# Derive the calendar year from the parsed 'Date' column for both subsets.
# `assign` returns new frames, which avoids SettingWithCopyWarning on
# `df_free_apps` (created as a filtered slice of `df`).
df_no_free_apps = df_no_free_apps.assign(Year=df_no_free_apps['Date'].dt.year)
df_free_apps = df_free_apps.assign(Year=df_free_apps['Date'].dt.year)

In [None]:
# Check that the 'Month' and 'Year' columns were added to the free subset.
df_free_apps.info()

In [None]:
# Check that the 'Month' and 'Year' columns were added to the non-free subset.
df_no_free_apps.info()

# Exploratory Data Analysis #

**In the beginning, we can visualize the level of ratings for free and non-free apps and check if we can conclude anything from it.**

In [None]:
# Side-by-side count plots of the rating values: free apps on the left,
# non-free apps on the right.
fig, (ax_free, ax_paid) = plt.subplots(1, 2, figsize=(14, 6))

sns.countplot(x='Rating', data=df_free_apps, ax=ax_free)
ax_free.set_title('Rating of free apps')

sns.countplot(x='Rating', data=df_no_free_apps, ax=ax_paid)
ax_paid.set_title('Rating of non-free apps')

**We can also investigate the distribution of prices for non-free apps.**

In [None]:
# Histogram of non-free app prices in 50 bins.
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `sns.histplot` is the documented replacement for distplot(kde=False) and
# draws the same count histogram.
plt.figure(figsize=(12, 6))
sns.histplot(df_no_free_apps['Price'], bins=50)
plt.title('Distribution of prices for non-free apps')

**The next step is to display how many apps fall into each category, separately for free apps and for non-free apps.**

In [None]:
# Count of apps per category, one figure per subset.
# The original opened a second figure before drawing the second subplot, so
# each figure used a 1x2 grid with only one cell filled and the other half
# left blank. Each chart now gets its own full-width axes; the widths equal
# the half-figure area the charts previously occupied (60/2 and 20/2).
fig_free, ax_free = plt.subplots(figsize=(30, 6))
sns.countplot(x='Category', data=df_free_apps, ax=ax_free)
ax_free.set_title('Categories of free apps')

fig_paid, ax_paid = plt.subplots(figsize=(10, 6))
sns.countplot(x='Category', data=df_no_free_apps, ax=ax_paid)
ax_paid.set_title('Categories of non-free apps')

**We can also create two more dataframes, where all the columns are grouped by 'Category'.**

In [None]:
# Per-category averages of the numeric columns, without the Month/Year helpers.
# numeric_only=True is required on pandas >= 2.0, where groupby().mean() no
# longer silently skips non-numeric columns (the app name, the 'Free' price
# label) and raises TypeError instead.
category_no_free_apps = (
    df_no_free_apps
    .groupby(['Category'])
    .mean(numeric_only=True)
    .drop(columns=['Month', 'Year'])
)
category_free_apps = (
    df_free_apps
    .groupby(['Category'])
    .mean(numeric_only=True)
    .drop(columns=['Month', 'Year'])
)

In [None]:
# Per-category averages for free apps (unrounded).
category_free_apps

In [None]:
# Per-category averages for non-free apps (unrounded).
category_no_free_apps

**In order to improve our graphs, we can round the results in each column.**

In [None]:
# Round the per-category averages for display: ratings to two decimals,
# rating counts and prices to whole numbers.
category_free_apps = category_free_apps.round(
    {'Rating': 2, 'No of people Rated': 0}
)
category_no_free_apps = category_no_free_apps.round(
    {'Rating': 2, 'No of people Rated': 0, 'Price': 0}
)

In [None]:
# Per-category averages for free apps after rounding.
category_free_apps

In [None]:
# Per-category averages for non-free apps after rounding.
category_no_free_apps

In [None]:
# Three bar charts in one row: average rating, average number of raters and
# average price of non-free apps, per category.
fig, (ax_rating, ax_raters, ax_price) = plt.subplots(1, 3, figsize=(20, 6))

category_no_free_apps['Rating'].plot.bar(ax=ax_rating)
ax_rating.set_title('Rating of non-free apps per category')

category_no_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people rated non-free apps per category')

category_no_free_apps['Price'].plot.bar(ax=ax_price)
ax_price.set_title('Price of non-free apps per category')

In [None]:
# Average rating and average number of raters of free apps, per category.
# The original created a second figure before drawing the second subplot, so
# both figures were 1x2 grids with one blank half. Each chart now has its own
# full-width axes, sized to the half-figure area it previously occupied (40/2).
fig_rating, ax_rating = plt.subplots(figsize=(20, 6))
category_free_apps['Rating'].plot(kind='bar', ax=ax_rating)
ax_rating.set_title('Rating of free apps per category')

fig_raters, ax_raters = plt.subplots(figsize=(20, 6))
category_free_apps['No of people Rated'].plot(kind='bar', ax=ax_raters)
ax_raters.set_title('Number of people rated free apps per category')

**Another interesting aspect we could investigate is that of the number of apps coming out per month and per year.**

In [None]:
# Number of non-free apps released per month (left) and per year (right).
# The column is passed as x= with data=, matching the other countplot cells:
# passing a Series positionally is deprecated in seaborn, and in current
# releases the first positional argument is `data`, not `x`.
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
sns.countplot(x='Month', data=df_no_free_apps)
plt.title('Number of non-free apps released per month')

plt.subplot(1, 2, 2)
sns.countplot(x='Year', data=df_no_free_apps)
plt.title('Number of non-free apps released per year')

*In the above graphs, it can be seen that the majority of non-free apps were released in 2020, and that May was the most common release month.*

In [None]:
# Number of free apps released per month (left) and per year (right).
# The column is passed as x= with data=, matching the other countplot cells:
# passing a Series positionally is deprecated in seaborn, and in current
# releases the first positional argument is `data`, not `x`.
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
sns.countplot(x='Month', data=df_free_apps)
plt.title('Number of free apps released per month')

plt.subplot(1, 2, 2)
sns.countplot(x='Year', data=df_free_apps)
plt.title('Number of free apps released per year')

*In the above graphs, it can be seen that most free apps were released in 2016, followed by 2017 and 2018.*

**We could also group the two dataframes by the 'Year' and 'Month' columns, in order to potentially extract further conclusions.**

In [None]:
# Averages of the numeric columns grouped by release month and by release
# year, for each subset; the other time helper column is dropped each time.
# numeric_only=True is required on pandas >= 2.0, where groupby().mean()
# raises TypeError on string columns instead of silently skipping them.
month_free_apps = df_free_apps.groupby(['Month']).mean(numeric_only=True).drop(columns=['Year'])
year_free_apps = df_free_apps.groupby(['Year']).mean(numeric_only=True).drop(columns=['Month'])

month_no_free_apps = df_no_free_apps.groupby(['Month']).mean(numeric_only=True).drop(columns=['Year'])
year_no_free_apps = df_no_free_apps.groupby(['Year']).mean(numeric_only=True).drop(columns=['Month'])

In [None]:
# Round the monthly and yearly averages for display: ratings to two decimals,
# rating counts (and prices, for non-free apps) to whole numbers.
free_decimals = {'Rating': 2, 'No of people Rated': 0}
paid_decimals = {'Rating': 2, 'No of people Rated': 0, 'Price': 0}

month_free_apps = month_free_apps.round(free_decimals)
year_free_apps = year_free_apps.round(free_decimals)

month_no_free_apps = month_no_free_apps.round(paid_decimals)
year_no_free_apps = year_no_free_apps.round(paid_decimals)

In [None]:
# Average rating and average number of raters of free apps by release month.
fig, (ax_rating, ax_raters) = plt.subplots(1, 2, figsize=(16, 6))

month_free_apps['Rating'].plot.bar(ax=ax_rating)
ax_rating.set_title('Rating of free apps released per month')

month_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people that rated free apps released per month')

In [None]:
# Average rating and average number of raters of free apps by release year.
fig, (ax_rating, ax_raters) = plt.subplots(1, 2, figsize=(16, 6))

year_free_apps['Rating'].plot.bar(ax=ax_rating)
ax_rating.set_title('Rating of free apps released per year')

year_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people that rated free apps released per year')

In [None]:
# Average rating, number of raters and price of non-free apps by release month.
fig, (ax_rating, ax_raters, ax_price) = plt.subplots(1, 3, figsize=(22, 6))

month_no_free_apps['Rating'].plot.bar(ax=ax_rating)
ax_rating.set_title('Rating of non-free apps released per month')

month_no_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people that rated non-free apps released per month')

month_no_free_apps['Price'].plot.bar(ax=ax_price)
ax_price.set_title('Price of non-free apps released per month')

In [None]:
# Average rating, number of raters and price of non-free apps by release year.
fig, (ax_rating, ax_raters, ax_price) = plt.subplots(1, 3, figsize=(22, 6))

year_no_free_apps['Rating'].plot.bar(ax=ax_rating)
ax_rating.set_title('Rating of non-free apps released per year')

year_no_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people that rated non-free apps released per year')

year_no_free_apps['Price'].plot.bar(ax=ax_price)
ax_price.set_title('Price of non-free apps released per year')

**Another interesting aspect to check is whether there is correlation between the variables for the dataset with the free apps and for the dataset with the non-free apps respectively.**

In [None]:
# Lower-triangle heatmap of pairwise correlations between the numeric columns
# of the free-apps subset.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises instead.
plt.figure(figsize=(16, 10))
corr = df_free_apps.corr(numeric_only=True)
# Boolean mask hiding the upper triangle and the diagonal (redundant cells).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, cmap='viridis')
plt.title('Heatmap of correlation between variables for free apps')

In [None]:
# Lower-triangle heatmap of pairwise correlations between the numeric columns
# of the non-free-apps subset.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises instead.
plt.figure(figsize=(16, 10))
corr = df_no_free_apps.corr(numeric_only=True)
# Boolean mask hiding the upper triangle and the diagonal (redundant cells).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, cmap='viridis')
plt.title('Heatmap of correlation between variables for non-free apps')

In [None]:
# Box plot of the number of raters at each rating value, non-free apps.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Rating', y='No of people Rated', data=df_no_free_apps, ax=ax)
ax.set_title('Distribution of number of people that rated apps and their ratings')

In [None]:
# Box plot of price at each rating value, non-free apps.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Rating', y='Price', data=df_no_free_apps, ax=ax)
ax.set_title('Distribution of prices of  apps and their ratings')

In [None]:
# Box plot of the number of raters at each rating value, free apps.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Rating', y='No of people Rated', data=df_free_apps, ax=ax)
ax.set_title('Distribution of number of people that rated apps and their ratings (free apps)')

*We can also group the dataframe with the non-free apps based on rating and then proceed to extract conclusions.*

In [None]:
# Average number of raters and average price of non-free apps at each rating
# value, rounded to whole numbers.
# numeric_only=True is required on pandas >= 2.0, where groupby().mean()
# raises TypeError on string columns instead of silently skipping them.
rating_no_free_apps = (
    df_no_free_apps
    .groupby(['Rating'])
    .mean(numeric_only=True)
    .drop(columns=['Month', 'Year'])
)
rating_no_free_apps['No of people Rated'] = rating_no_free_apps['No of people Rated'].round()
rating_no_free_apps['Price'] = rating_no_free_apps['Price'].round()
rating_no_free_apps

In [None]:
# Average number of raters (left) and average price (right) of non-free apps
# at each rating value.
fig, (ax_raters, ax_price) = plt.subplots(1, 2, figsize=(14, 6))

rating_no_free_apps['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people rated for non-free apps per rating level')

rating_no_free_apps['Price'].plot.bar(ax=ax_price)
ax_price.set_title('Price non-free apps per rating level')

*It can be noticed that the apps with rating 3.0 and 4.0 appear to cost a lot more compared to the rest. This could be attributed to outliers. For example, if we have a look at the price distribution of the non-free apps, the vast majority of the apps cost up to 1000 units. Therefore, we could exclude outliers above this threshold and display the results again.*

In [None]:
# Keep only the non-free apps priced at or below the 1000 outlier threshold.
within_price_cap = df_no_free_apps['Price'] <= 1000.0
new_df_no_free_apps = df_no_free_apps.loc[within_price_cap]

In [None]:
# Number of non-free apps remaining after the price filter.
new_df_no_free_apps.shape[0]

In [None]:
# Average number of raters and average price per rating value, computed on the
# price-filtered non-free apps and rounded to whole numbers.
# numeric_only=True is required on pandas >= 2.0, where groupby().mean()
# raises TypeError on string columns instead of silently skipping them.
rating_new_df = (
    new_df_no_free_apps
    .groupby(['Rating'])
    .mean(numeric_only=True)
    .drop(columns=['Month', 'Year'])
)
rating_new_df['No of people Rated'] = rating_new_df['No of people Rated'].round()
rating_new_df['Price'] = rating_new_df['Price'].round()

In [None]:
# Average number of raters (left) and average price (right) per rating value
# for the price-filtered non-free apps.
fig, (ax_raters, ax_price) = plt.subplots(1, 2, figsize=(26, 6))

rating_new_df['No of people Rated'].plot.bar(ax=ax_raters)
ax_raters.set_title('Number of people rated for non-free apps per rating level - Removed outliers with price over 1000')

rating_new_df['Price'].plot.bar(ax=ax_price)
ax_price.set_title('Price non-free apps per rating level - Removed outliers with price over 1000')