In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import nltk
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob


* [Introduction](#1)
* [First Look at the Data](#2)
* [Distribution of Ratings](#3)
* [Distribution of Price](#4)
* [What is the relation between Ratings and Price?](#5)
* [Distribution of Category](#6)
* [Are People Afraid to Rate Government and Politics Apps?](#7)
* [The Relation between Years and Total App Numbers, and the Distribution of Months per Year](#8)
* [The Relation between Categories and Months-Years](#9)
* [NLP](#10)
* [WordCloud Plot for Music Category](#11)


<a id="1"></a>
## Introduction

![](https://static.shiftdelete.net/wp-content/uploads/2018/01/android-microsoft-store-windows-sdn-02.jpg)

> Microsoft Store (formerly known as Windows Store) is a digital distribution platform owned by Microsoft. It started as an app store for Windows 8 and Windows Server 2012 as the primary means of distributing Universal Windows Platform apps. With Windows 10, Microsoft merged its other distribution platforms (Windows Marketplace, Windows Phone Store, Xbox Music, Xbox Video, Xbox Store, and a web storefront also known as "Microsoft Store") into Microsoft Store, making it a unified distribution point for apps, console games, and digital videos. Digital music was included until the end of 2017, and E-books were included until 2019.[1] Some content is available free of charge from the store.

From **Wikipedia**

#### Importing Data

In [None]:
# Read the Windows Store CSV from the Kaggle input directory.
msft = pd.read_csv("/kaggle/input/windows-store/msft.csv")
# Work on a copy so the frame as loaded (`msft`) stays unmodified.
df = msft.copy()

<a id="2"></a>
#### First Look at the Data

In [None]:
# Preview the first few rows to see the columns and typical values.
df.head()

Description of Columns:

* **Name:** Defines Application Name<br>
* **Rating:** Defines Application Rating<br>
* **No of people Rated:** The number of people who rated this application<br>
* **Category:** Defines Application Category<br>
* **Date:** The date the application was uploaded to the store<br>
* **Price:** Defines Application Fee<br>

In [None]:
# Print column dtypes and non-null counts; used here to spot missing values.
df.info()

* Name, Rating, Category, Date and Price each contain a null value. Let's look at them more closely: are they all in a single row, or spread across several rows?

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

#### Null Values

In [None]:
# Show every row that has a missing value in at least one column.
df[df.isna().any(axis=1)]

* They are all in one row. We can remove it; it will not affect our dataset.

In [None]:
# Drop, in place, every row that contains a missing value.
df.dropna(inplace = True)

## Analysis and Visualization with Plotly

<a id="3"></a>
### Distribution of Ratings

In [None]:
# How many apps received each distinct rating; the index holds the rating
# values and the values hold their counts (both are fed to the pie chart).
rating_series = df["Rating"].value_counts()
labels, sizes = rating_series.index, rating_series.values


In [None]:
# Two panels side by side: a cartesian ("xy") cell for the histogram and a
# "domain" cell, which is the subplot type pie traces require.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "xy"}, {"type": "domain"}]],
    subplot_titles=["Histogram", "Pie"],
)

rating_histogram = go.Histogram(x=df.Rating)
rating_pie = go.Pie(values=sizes, labels=labels, textinfo="label+percent")

# Left panel: histogram of the ratings; right panel: share of each rating.
fig.add_trace(rating_histogram, row=1, col=1)
fig.add_trace(rating_pie, row=1, col=2)

fig.update_layout(title_text="Rating ~ Frequency")
fig.show()

<a id="4"></a>
### Distribution of Price

In [None]:
# Collapse Price into two classes: exactly "Free" stays "Free", every other
# price value becomes "Paid".
free_or_not = np.where(df["Price"] == "Free", "Free", "Paid").tolist()
df["FreeOrPaid"] = free_or_not

# Number of apps (non-null names) in each of the two classes.
price_series = df.groupby("FreeOrPaid")["Name"].count()


In [None]:
# Pie chart of the Free vs. Paid app counts computed in `price_series`.
price_pie = go.Pie(
    values=price_series.values,
    labels=price_series.index,
    textinfo="label+percent",
)
fig = go.Figure(data=[price_pie])
fig.show()

<a id="5"></a>
### What is the relation between Ratings and Price?

In [None]:
# Average only the two numeric columns. Selecting them explicitly is required
# on pandas >= 2.0, where GroupBy.mean() raises on text columns (Name, Date,
# Price, ...) instead of silently dropping them.
numeric_cols = ["Rating", "No of people Rated"]
is_free = df["Price"] == "Free"

# Per-category means over all apps, best-rated category first.
rating_by_general = (
    df.groupby("Category")[numeric_cols]
    .mean()
    .sort_values("Rating", ascending=False)
)

# Same means restricted to free and to paid apps. add_suffix names the columns
# "Rating_Free", "No of people Rated_Free", etc. by name rather than by
# position, so a change in column order cannot mislabel them.
rating_by_free = df[is_free].groupby("Category")[numeric_cols].mean().add_suffix("_Free")
rating_by_paid = df[~is_free].groupby("Category")[numeric_cols].mean().add_suffix("_Paid")

# Side-by-side table aligned on Category (displayed as the cell output).
pd.concat([rating_by_general, rating_by_free, rating_by_paid], axis=1)


In [None]:
# Categories compared in the grouped bar chart.
selected_categories = ["Developer Tools", "Books", "Business"]
# Explicit numeric columns: GroupBy.mean() on the whole frame raises on text
# columns with pandas >= 2.0.
numeric_cols = ["Rating", "No of people Rated"]
is_free = df["Price"] == "Free"

# Per-category means for all apps, free apps and paid apps, restricted to the
# selected categories (.loc raises KeyError if a category has no apps in a subset).
rating_by_general2 = df.groupby("Category")[numeric_cols].mean().loc[selected_categories]
rating_by_free2 = df[is_free].groupby("Category")[numeric_cols].mean().loc[selected_categories]
rating_by_paid2 = df[~is_free].groupby("Category")[numeric_cols].mean().loc[selected_categories]

# One class label per stacked row, in the same order as the concat below; the
# repeat count follows the category list instead of a hard-coded 3.
class_list = [
    label
    for label in ["General", "Free", "Paid"]
    for _ in selected_categories
]

df3 = pd.concat([rating_by_general2, rating_by_free2, rating_by_paid2], axis=0)
df3["Class"] = class_list

fig = px.bar(df3, x=df3.index.values, y="Rating", color="Class", barmode="group")
# x is passed as a bare array, so give the axis a readable title.
fig.update_layout(xaxis_title="Category")
fig.show()

Based on the Microsoft Store dataset on Kaggle, there is a visible difference in ratings between free and paid apps.

<a id="6"></a>
### Distribution of Category

In [None]:
# Number of apps (non-null names) in each category.
df2 = df.groupby("Category")["Name"].count()

# One brown bar per category; update_traces returns the figure, so it chains.
fig = px.bar(x=df2.index, y=df2.values).update_traces(marker_color="brown")
fig.show()

<a id="7"></a>
### Are People Afraid to Rate Government and Politics Apps?

In [None]:
# Category x Rating table whose cells hold the mean "No of people Rated"
# of the apps in that category with that rating.
pd.pivot_table(df, index = "Category", columns = "Rating", values = "No of people Rated", aggfunc="mean")

Let's compare, with graphs, the average number of raters per category for apps rated 2.0 and below versus apps rated 4.0 and above.

#### Below 2.0

In [None]:
# Mean "No of people Rated" per (Category, Rating) pair, flattened back into
# ordinary columns so plotly express can consume it.

# Apps rated 2.0 or lower.
low_rated = df.loc[df["Rating"] <= 2.0]
df_new2 = (
    low_rated.groupby(["Category", "Rating"])["No of people Rated"]
    .mean()
    .reset_index()
)

# Apps rated 4.0 or higher.
high_rated = df.loc[df["Rating"] >= 4.0]
df_new3 = (
    high_rated.groupby(["Category", "Rating"])["No of people Rated"]
    .mean()
    .reset_index()
)


In [None]:
# Mean rater count per category for the low-rated subset (df_new2),
# with bars colored by rating value.
fig = px.bar(df_new2, x="Category",y="No of people Rated", color="Rating")
fig.show()

As we can see clearly, people don't tend to under-vote for "Government and Politics" apps.<br>
The difference is especially large for a rating of 1.0.

#### Over 4.0

In [None]:
# Mean rater count per category for the high-rated subset (df_new3),
# with bars colored by rating value.
fig = px.bar(df_new3, x="Category",y="No of people Rated", color="Rating")
fig.show()


On the other hand, for ratings of 4.0 and above there is no huge difference between the categories.

### Total Application Numbers by Years, Categories and Months

<a id="8"></a>
#### The Relation between Years and Total App Numbers, and the Distribution of Months per Year

In [None]:
# Date is a "-"-separated string; the code takes field 1 as the month number
# and field 2 as the year.
date_parts = df["Date"].str.split("-")
df["Year"] = date_parts.str[2]

# Two-digit month number -> month name. Values that are not in the mapping are
# left unchanged by replace().
month_names = {
    "01": "January", "02": "February", "03": "March", "04": "April",
    "05": "May", "06": "June", "07": "July", "08": "August",
    "09": "September", "10": "October", "11": "November", "12": "December",
}
# Assign the result back instead of calling replace(inplace=True) on
# df["Month"]: that chained in-place form emits a FutureWarning on pandas 2.2
# and does not modify df at all once Copy-on-Write is enabled.
df["Month"] = date_parts.str[1].replace(month_names)


In [None]:
# Number of apps per (Year, Month, Category), with the group keys turned back
# into regular columns; the count lands in the "Name" column.
a = df.groupby(["Year", "Month", "Category"])["Name"].count().reset_index()

# App counts per year, with bar segments colored by month.
fig = px.bar(a, x="Year", y="Name", color="Month")
fig.show()


<a id="9"></a>
#### The Relation between Categories and Months-Years


In [None]:
# App counts (the "Name" column of `a`) on the x-axis against Month on the
# y-axis, with bar segments colored by category.
fig = px.bar(a, y="Month",x="Name", color="Category")
fig.show()


In [None]:
# Mean rating and mean rater count per (Year, Category). The columns must be
# selected with a list: tuple-style selection (["Rating", "No of people Rated"]
# without the inner brackets) was removed from GroupBy in pandas 2.0.
b = (
    df.groupby(["Year", "Category"])[["Rating", "No of people Rated"]]
    .mean()
    .reset_index()
)

# One marker per (Year, Category): height = mean rating, color = category,
# marker size = mean number of raters.
fig = px.scatter(b, x="Year", y="Rating", color="Category", size="No of people Rated")
fig.show()

<a id="10"></a>
## NLP

In [None]:
# Separate copy for the text-processing steps so `df` itself is not altered.
df_nlp = df.copy()
df_nlp.head()

#### Convert all app names to lower case

In [None]:
# Lower-case every app name and collapse any run of whitespace into a single
# space (str() guards against non-string values).
df_nlp_series = df_nlp["Name"].apply(
    lambda name: " ".join(str(name).lower().split())
)

#### To remove punctuations.

In [None]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the punctuation untouched.
# The raw string avoids the invalid "\w" / "\s" escape sequences.
df_nlp_series = df_nlp_series.str.replace(r"[^\w\s]", "", regex=True)


#### To remove numbers

In [None]:
# Remove all digits. regex=True is required: since pandas 2.0 str.replace
# treats the pattern as a literal string by default, so "\d" would never match.
# The raw string avoids the invalid "\d" escape sequence.
df_nlp_series = df_nlp_series.str.replace(r"\d", "", regex=True)

#### To remove stopwords

In [None]:
# English stop-word list from the NLTK corpus (the corpus must be available
# locally; stopwords.words raises LookupError otherwise).
stop_words = stopwords.words("english")
# Set copy for O(1) membership tests; the list would be scanned for every token.
stop_word_set = set(stop_words)

# Keep only the tokens of each name that are not stop words.
df_nlp_series = df_nlp_series.apply(
    lambda name: " ".join(token for token in name.split() if token not in stop_word_set)
)

#### Lemmatization

In [None]:
def _lemmatize_name(name):
    """Return `name` with each whitespace-separated token replaced by its TextBlob lemma."""
    return " ".join(Word(token).lemmatize() for token in name.split())


df_nlp_series = df_nlp_series.apply(_lemmatize_name)

In [None]:
# Replace the original names with the cleaned text and preview the result.
df_nlp["Name"] = df_nlp_series
df_nlp.head()

The final state of the dataset

<a id="11"></a>
### WordCloud Plot for Music Category

In [None]:
# Concatenate the cleaned names of all Music apps into one space-separated
# string (reused by the word cloud), then count how often each word occurs.
music_names = df_nlp.loc[df_nlp["Category"] == "Music", "Name"]
a = " ".join(music_names)
pd.Series(a.split()).value_counts()

In [None]:
# Build the word cloud from the joined Music app names held in `a`.
word_cloud = WordCloud(background_color="white", max_font_size=50).generate(a)

# Render the cloud as an image on a 9x7 inch figure with the axes hidden.
_, cloud_ax = plt.subplots(figsize=(9, 7))
cloud_ax.imshow(word_cloud, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()

### Thank you for taking the time to review.