In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Kaggle starter boilerplate: print the full path of every file found under the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read msft.csv from the Kaggle input directory into the working DataFrame `df`.
df = pd.read_csv("/kaggle/input/windows-store/msft.csv")

In [None]:
# Display the freshly loaded frame for a first look at its columns and values.
df

Things to see in this notebook
1. Data cleaning 
2. Study of the fields in dataset
3. Free v/s paid apps analysis

Data cleaning
1. Remove null values
2. Extract month and year from date column
3. Edit price column 
4. Changing the data types of a few columns

1. Remove null values

In [None]:
# Count the missing values in each column.
df.isnull().sum()

As there are very few null values, the rows that contain them can be removed without any problem.

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

2. Extract month and year from date column

In [None]:
# Slice the date text positionally: the last four characters become Year and
# the characters at positions 3-4 (zero-based) become Month.
# NOTE(review): this assumes Date is a string laid out like DD-MM-YYYY — confirm against the data.
df['Year'] = df['Date'].str[-4:]
df['Month'] = df['Date'].str[3:5]
df

3. Editing the price column:
1. Converting string 'Free' into 0
2. Dropping the string '₹'

In [None]:
# Remove the rupee sign so only the numeric text (or the word 'Free') remains.
# The symbol is matched as a literal string, not as a regular expression.
df['Price'] = df['Price'].str.replace('₹', '', regex=False)
df

In [None]:

# Map the 'Free' label to the string '0' rather than the integer 0 so the column
# stays all-text. A later cell strips thousands separators through the .str
# accessor, which yields NaN for any non-string element; keeping every value a
# string means free apps become a genuine 0 once the column is cast to numbers.
df['Price'] = df['Price'].replace(['Free'], '0')
df

Checking the data types

In [None]:
# Print each column's dtype and non-null count to see which columns still need conversion.
df.info()

Changing data type of columns
1. Year
2. Month
3. Price

In [None]:
# Cast the date parts extracted earlier from text to numbers.
df['Year'] = pd.to_numeric(df['Year'])
df['Month'] = pd.to_numeric(df['Month'])

# Price can hold a mix of text ("1,024.00") and plain numbers (the 0 that replaced
# 'Free'). Casting to str first lets the thousands separator be removed from every
# row; calling .str.replace directly would turn non-string entries into NaN.
price_text = df['Price'].astype(str).str.replace(',', '', regex=False)

# Assign the filled result back to the column. Calling
# `df["Price"].fillna(0, inplace=True)` acts on a temporary column selection
# (chained assignment): pandas deprecates it, and with Copy-on-Write enabled it
# never updates `df`, which would leave NaN prices that are then flagged as paid.
df['Price'] = pd.to_numeric(price_text).fillna(0)
df

Adding a free or paid column with binary values indicating:
1. 0 : Free app
2. 1 : Paid app 

In [None]:
def row(l):
    """Return the paid flag for a price: 0 when the price equals 0, otherwise 1."""
    return 0 if l == 0 else 1
# Flag every app by passing its price through row(): 0 for a zero price, 1 otherwise.
df['Paid'] = df['Price'].apply(row)   
df

In [None]:
# Number of apps listed under each category.
df['Category'].value_counts()

Studying the fields of the dataset 
1. Count of Free v/s Paid apps
2. Count of total apps in the categories
3. Avg. rating of categories
4. Month for app uploads
5. Distribution for app uploads based on year

Count of Free v/s Paid apps

In [None]:
# Bar chart of the number of apps per Paid flag (0 = free, 1 = paid).
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Paid', ax=ax)


Count of categories

In [None]:
# Replace the four longest category names with short labels
# (presumably to keep the x-axis tick labels compact — confirm intent).
short_labels = {
    'Health and Fitness': 'Fitness',
    'Kids and Family': 'Family',
    'Navigation and Maps': 'Maps',
    'Government and Politics': 'Politics',
}
for long_label, short_label in short_labels.items():
    df['Category'] = df['Category'].str.replace(long_label, short_label)

# Bar chart of the number of apps in each category.
fig, ax = plt.subplots(figsize=(16, 10))
sns.countplot(data=df, x='Category', ax=ax)

Total number of people who rated apps, by category
1. Music
2. Books
3. Business

In [None]:
# Add up the "No of people Rated" column within each category.
df1 = df.groupby('Category')['No of people Rated'].sum()
df1

Average of ratings based on categories
1. Politics
2. Family
3. Developer tools
 

In [None]:
# Mean of the Rating column within each category.
df1 = df.groupby('Category')['Rating'].mean()
df1

Month for app uploads

In [None]:
# Bar chart of the number of apps per Month value, across all apps.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, x='Month', ax=ax)

Year uploads: It can be seen that after 2016 the number of app uploads reduced dramatically

In [None]:
# Bar chart of the number of apps per Year value, across all apps.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Year', ax=ax)

> Free v/s Paid apps

Studying free apps

In [None]:
# Subset of rows whose Paid flag is 0, i.e. the free apps.
df_free = df.loc[df['Paid'] == 0]

Rating of the apps: 4 is the mode

In [None]:
# Bar chart of how many free apps received each Rating value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_free, x='Rating', ax=ax)

Month: Almost uniform distribution of uploads

In [None]:
# Bar chart of the number of free apps per Month value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_free, x='Month', ax=ax)

Year: App uploads decline after the peak in 2016

In [None]:
# Bar chart of the number of free apps per Year value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_free, x='Year', ax=ax)

Studying paid apps

In [None]:
# Subset of rows whose Paid flag is 1, i.e. the paid apps.
df_paid = df.loc[df['Paid'] == 1]

Rating: 1 is the mode, suggesting customer dissatisfaction

In [None]:
# Bar chart of how many paid apps received each Rating value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_paid, x='Rating', ax=ax)

Categories in the paid apps

In [None]:
# Bar chart of the number of paid apps in each category.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_paid, x='Category', ax=ax)

Month: Most of the uploads have taken place in May 

In [None]:
# Bar chart of the number of paid apps per Month value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_paid, x='Month', ax=ax)

Year: The paid apps uploads are increasing every year

In [None]:
# Bar chart of the number of paid apps per Year value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_paid, x='Year', ax=ax)