In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

In [None]:
# Path of the input CSV, relative to the notebook's working directory.
DATA_PATH = '../input/windows-store/msft.csv'
df = pd.read_csv(DATA_PATH)

## Information about Dataset

In [None]:
# Print each column's name, dtype and non-null count for the loaded frame.
df.info()

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

# Preprocessing

## Null values in dataset

In [None]:
def findNull(dataFrame):
    """Print the number of null values in each column of ``dataFrame``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        One line per column is written to stdout in the form
        ``"<column>": <n> null values``.
    """
    # Count on the argument itself (not on a notebook-level global) so the
    # helper reports on whichever frame it is handed.
    # isna().sum() yields one null count per column, in column order.
    for col, null_sum in dataFrame.isna().sum().items():
        print(f'"{col}": {null_sum} null values')

In [None]:
# Report the per-column null counts of the loaded frame.
findNull(df)

In [None]:
# Boolean row selector: True for every row holding at least one null value.
mask = df.isna().any(axis='columns')

# Display only the affected rows.
df.loc[mask]

In [None]:
# Remove every row that contains a null value.
# Selecting by content rather than by a hard-coded index label keeps the cell
# re-runnable (dropping an already-removed label raises KeyError) and still
# works if the input file gains or loses rows.
# NOTE(review): assumes the rows listed by the null-row check above are
# exactly the ones meant to be discarded - confirm against that output.
df = df.dropna(how='any')

In [None]:
# Re-check the per-column null counts now that the incomplete row is gone.
findNull(df)

## Price data preprocessing

* delete ','
* replace 'Free' with 0
* delete '₹ '
* convert string to float

In [None]:
# Inspect the distinct raw values in 'Price' before cleaning them.
df['Price'].unique()

In [None]:
# Normalise the raw 'Price' text into floats in one pass.
# Starting from astype(str) keeps the cell re-runnable: on a second run the
# column is already float, which the .str accessor would otherwise reject.
price_text = (
    df['Price']
    .astype(str)
    .str.replace(',', '', regex=False)   # delete ','
    .str.replace('₹', '', regex=False)   # delete the currency symbol ...
    .str.strip()                         # ... and the space that follows it
    .replace({'Free': '0'})              # replace 'Free' with 0
)

# convert string to float
df['Price'] = price_text.astype(float)

# result
df['Price'].unique()

## Date data preprocessing

In [None]:
# Slice the date text by fixed character positions.
# NOTE(review): positions 3:5 / 6:10 presumably correspond to a DD-MM-YYYY
# layout - confirm against the raw 'Date' values.
date_text = df['Date'].str
df['year'] = date_text[6:10]
df['month'] = date_text[3:5]

# 'YYYY-MM' key built from the two parts extracted above.
df['year_month'] = df['year'] + '-' + df['month']

In [None]:
# Show the distinct 'Price' values once more; the date columns added above do not modify them.
df['Price'].unique()

# Data visualization

* Apps category ratio
* Price distribution
* Rating distribution
* Relationship between Rating and Price
* Relationship among year_month, Rating and No of people Rated
* Year distribution
* Average Rating by year
* Average Rating by month
* Average Rating by year_month

### Apps category ratio
* "Music" is the most common category
* The top 4 categories ("Music", "Books", "Business" and "Health and Fitness") account for half of all apps

In [None]:
# One row per category holding its number of apps, largest first.
# reset_index() moves the row labels into an 'index' column; counting that
# column per category therefore yields the number of rows in each category.
Ccount = (
    df['Category']
    .reset_index()
    .groupby('Category')
    .count()
    .sort_values('index', ascending=False)
)

color = ("#55efc4","#81ecec","#74b9ff","#a29bfe","#dfe6e9",
         "#ffeaa7","#e17055","#d63031","#e84393","#2d3436",
         "#00b894","#0984e3","#ffeaa7","#fab1a0","#fd79a8")

label = Ccount.index

plt.figure(figsize=(15,10))
plt.title("App category ratio",fontsize=20)

# plt.pie expects a 1-D sequence of wedge sizes, so hand it the count column
# instead of the whole one-column frame (which is 2-D and gets rejected).
plt.pie(Ccount['index'], labels=label, colors=color, counterclock=False,
        startangle=90, autopct="%1.1f%%", pctdistance=0.7)
plt.show()

### Price distribution
* More than 99% of apps are free

In [None]:
# Split the listings into free (price == 0) and paid (any other price).
# The counts are taken per app, directly from the cleaned 'Price' column;
# aggregating an already-aggregated table would count distinct price points
# instead of apps.
prices = df['Price'].dropna()
sum_free = int((prices == 0).sum())
sum_not_free = int((prices != 0).sum())

# Two-row summary table feeding the pie chart.
Pcount = pd.DataFrame({
    'Price': ['Free Apps', 'Not Free Apps'],
    'count': [sum_free, sum_not_free],
})

label = Pcount['Price']
color = ("#4bcffa","#ff5e57")

plt.figure(figsize=(15,10))
plt.title("Free apps ratio",fontsize=20)

plt.pie(Pcount['count'], labels=label, colors=color, counterclock=False,
        startangle=90, autopct="%1.1f%%", pctdistance=0.7)
plt.show()

### Rating distribution

In [None]:
# Histogram of the 'Rating' column, drawn on an explicit Axes.
rating_fig, rating_ax = plt.subplots(figsize=(12, 6))
rating_ax.hist(df['Rating'], color="#ff6b6b")

rating_ax.set_title("Rating distribution", fontsize=20)
rating_ax.set_xlabel("Rating", fontsize=15)
rating_ax.set_ylabel("Count", fontsize=15)
plt.show()

### Relationship between Rating and Price

In [None]:
# Interactive scatter of Rating against Price; update_layout returns the same
# figure, so the title can be applied in the same expression.
fig = px.scatter(
    df,
    x='Price',
    y='Rating',
).update_layout(title_text="Relationship between Rating and Price")
fig.show()

### Relationship among release year_month, Rating and No of people Rated

In [None]:
# Interactive scatter of Rating per year_month, coloured by the
# 'No of people Rated' column.
fig = px.scatter(
    df,
    x='year_month',
    y='Rating',
    color='No of people Rated',
).update_layout(
    title_text="Relationship among year_month, Rating and No of people Rated"
)
fig.show()

### Released Year distribution

In [None]:
# Number of rows per 'year' value, as a two-column frame (year, count).
Ycount = df.groupby('year').size().reset_index(name='count')

year_fig, year_ax = plt.subplots(figsize=(10, 5))
year_ax.bar(Ycount['year'], Ycount['count'], color="#808e9b")
year_ax.set_title('Release Year distribution', fontsize=20)
year_ax.set_xlabel('Year', fontsize=15)
year_ax.set_ylabel('Count', fontsize=15)
plt.show()

### Average Rating by release year

* Highest average rating was recorded in 2019
* On the other hand, the lowest average rating was recorded in 2020 (Jan 2020 ~ Jul 2020)

In [None]:
# Mean 'Rating' per 'year', kept as a flat two-column frame.
sample1 = df.groupby('year', as_index=False)['Rating'].mean()

yearly_fig, yearly_ax = plt.subplots(figsize=(10, 5))
yearly_ax.plot(sample1['year'], sample1['Rating'], color="#3c40c6")
yearly_ax.set_title('Avg. rating by release year', fontsize=20)
yearly_ax.set_xlabel('Year', fontsize=15)
yearly_ax.set_ylabel('Avg. rating', fontsize=15)
plt.show()

### Average rating by release month

* Highest average rating was recorded in September
* Lowest average rating was recorded in May

In [None]:
# Mean 'Rating' per 'month' value, as a flat two-column frame.
sample2 = (
    df[['month', 'Rating']]
    .groupby('month')['Rating']
    .mean()
    .reset_index()
)

plt.subplots(figsize=(10,5))
plt.title('Avg. rating by release month',fontsize=20)
plt.xlabel('Month',fontsize=15)
plt.ylabel('Avg. rating',fontsize=15)

# The stray bare `plt.title` expression that used to follow the plot call was
# a no-op attribute access and has been removed.
plt.plot(sample2['month'], sample2['Rating'], color="#05c46b")
plt.show()

### Average rating by release year_month

In [None]:
# Mean 'Rating' per 'year_month' key, as a flat two-column frame.
sample3 = df.groupby('year_month', as_index=False)['Rating'].mean()

ym_fig, ym_ax = plt.subplots(figsize=(20, 10))
ym_ax.plot(sample3['year_month'], sample3['Rating'], color="#ffa801")
ym_ax.set_title('Avg. rating by release year_month', fontsize=20)
ym_ax.set_xlabel('Year_Month', fontsize=15)
ym_ax.set_ylabel('Avg. rating', fontsize=15)

# Turn the x tick labels vertical so they stay readable.
plt.xticks(rotation=90)

ym_fig.tight_layout()
plt.show()

## Number of Apps by Category

In [None]:
# Bar chart of the number of rows per 'Category', most frequent first
# (value_counts sorts in descending order).
cat_fig, cat_ax = plt.subplots(figsize=(10, 5))

# Bind the pandas plot to the Axes created above instead of relying on
# whichever Axes happens to be current.
df['Category'].value_counts().plot(kind="bar", color="#ff6d69", ax=cat_ax)

cat_ax.set_title('Number of Apps by category', fontsize=20)
cat_ax.set_xlabel('Category', fontsize=15)
cat_ax.set_ylabel('count', fontsize=15)

# Render the figure explicitly, like the other chart cells, so the cell does
# not end on the Axes repr.
plt.show()

## WordCloud

In [None]:
stopwords = set(STOPWORDS)

# Join every app name into one text blob for the word cloud.
# str(df['Name']) must not be used here: it is the *printed* form of the
# Series, which is truncated to a few head/tail rows for long Series and also
# carries the index numbers and the "Name: ..., dtype: object" footer.
name_text = ' '.join(df['Name'].dropna().astype(str))

wordcloud = WordCloud(
    background_color='#c8d6e5',
    stopwords=stopwords,
    max_words=200,
    max_font_size=50,
    random_state=42,   # fixed seed so the layout is reproducible
).generate(name_text)

plt.subplots(figsize=(12,6))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()