In [None]:

import numpy as np                 # linear algebra
import pandas as pd                # data processing
import matplotlib.pyplot as plt    # visualization tool
import seaborn as sns              # visualization tool

import plotly.offline              # visualization tool
import plotly.graph_objs as go


from pylab import rcParams         # figure size in inches


import warnings
# Suppress every warning category globally for the rest of the session.
warnings.filterwarnings('ignore')


In [None]:
# Load the raw Play Store listing into a DataFrame.
# NOTE(review): the relative "../input" path presumably matches a Kaggle-style layout — adjust when running elsewhere.
dataset = pd.read_csv("../input/googleplaystore.csv") # read .csv file
# Preview 10 randomly chosen rows (no seed is set, so the preview differs between runs).
dataset.sample(10) 

In [None]:
# Print each column's name, non-null count and dtype to spot missing values and mistyped columns.
dataset.info() # information about data set

Output Results:

The info() method summarizes the data set: each column's name, its number of non-null values and its dtype. According to the output (Figure 1.3), not every column (feature) has all 10841 values; some columns contain missing values (NaN), which need to be removed from the data set.
Although most of the columns hold numerical information, the values are stored in the data set as strings. For example, the Size, Reviews, Installs and Price columns are read as strings. These must be converted to integer or float values.
Removing columns that are not needed for the analysis makes the job easier. Therefore, the Last Updated, Current Ver and Android Ver columns must be removed.



In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns pandas currently treats as numeric.
dataset.describe() # statistical values for numerical columns

Output Results:

As shown in the output, describe() returns only the Rating column. The other columns that are numerical in nature but stored as strings in the data set must be converted to integer or float first.
The output also reveals a data problem: the values of the Rating column should lie between 1 and 5, but the reported maximum is 19, which is an incorrect value. Incorrect values must be removed.


In [None]:
# List the column labels (features) of the frame.
dataset.columns  #show features

Output Results:

Unnecessary columns ('Last Updated', 'Current Ver', 'Android Ver') should be removed.

In [None]:
# Report the number of rows, then preview 7 randomly chosen rows.
print('Number of apps in the dataset : ' , len(dataset))
dataset.sample(7)  # show 7 random sample rows

Output Results:

NaN values should be removed.

**DATA CLEANING**
* Missing data 
* Drop duplicates except for the first occurrence in App
* Convert all app sizes to MB
* Remove '+' from 'Number of Installs' to make it numeric
* Convert all review text to English language using Google Translator library

In [None]:
# Baseline before cleaning: (rows, columns) followed by per-column non-null counts and dtypes.
print(dataset.shape)
dataset.info()

In [None]:
# Missing-data report: per column, the number of NaN cells and the share of rows
# they represent, each sorted from most to least affected.
total = dataset.isnull().sum().sort_values(ascending=False)
percentage = dataset.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percentage': percentage}, axis=1)
missing_data

In [None]:
# Remove missing data: drop every row that has a NaN in any column (mutates `dataset` in place).
dataset.dropna(how ='any', inplace = True)

In [None]:
# Recompute the missing-data report to confirm that no NaN cells remain after the drop.
total = dataset.isnull().sum().sort_values(ascending=False)
percentage = dataset.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat({"Total": total, "Percentage": percentage}, axis=1)
missing_data

In [None]:
# (rows, columns) left after removing rows with missing values.
print(dataset.shape)

In [None]:
# Count the distinct app names; a value lower than the row count means some apps are listed more than once.
app=dataset.App.unique()
len(app)

In [None]:
# If the same app appears several times, remove the repeated rows.
dataset.drop_duplicates(subset="App",keep="first",inplace=True) 
# subset selects the column compared; keep="first" (the default) retains the first occurrence; inplace=True applies the change to `dataset` itself.
print(dataset.shape)

Finding:

* After removing missing data and
* dropping duplicates,

our dataset contains 8190 records with 13 features.

In [None]:
# Tidy the Installs column: remove "+" and "," and convert to int.
dataset.Installs.unique()
# The commas between digits and the trailing plus sign seen here are what the next cell strips.

In [None]:
# Strip the thousands separators first, then the trailing plus sign; values that do
# not contain the symbol are passed through untouched, so already-clean values survive.
for symbol in (",", "+"):
    dataset["Installs"] = dataset["Installs"].map(
        lambda raw, symbol=symbol: raw.replace(symbol, "") if symbol in str(raw) else raw
    )

# Every entry is now a plain digit string (or already a number): cast to int.
dataset["Installs"] = dataset["Installs"].map(int)
dataset.Installs.unique()


In [None]:
# Spot-check the converted Installs values for the rows labelled 1 through 10.
dataset.loc[1:10,["App","Installs"]]

In [None]:
# Next, fix the Size column.
dataset.Size.unique()
# All of these values need to be converted to a float expressed in MB.

In [None]:
# Apps whose size depends on the device have no numeric size: replace that label
# with the text 'NaN' so that a later float conversion can parse it as a missing value.
dataset['Size'] = dataset['Size'].apply(lambda x: str(x).replace('Varies with device', 'NaN') if 'Varies with device' in str(x) else x)

# Remove commas from every entry.
# The previous guard `'M' or "m" in str(x)` was always truthy (it parses as
# `'M' or ("m" in str(x))`), so its else-branch never ran; the replacement is now
# written as the unconditional operation it always was.
dataset['Size'] = dataset["Size"].apply(lambda x: str(x).replace(",", ""))

def mb(x):
    """Strip the megabyte suffix from a size value.

    Both the upper-case 'M' and the lower-case 'm' are removed. (The previous
    `"M" or "m"` expression evaluated to just "M", so lower-case suffixes were
    silently left in place.)

    Parameters
    ----------
    x : str or any
        Raw size entry such as "19M". Non-string values are returned unchanged
        instead of raising.

    Returns
    -------
    The input with every 'M'/'m' character removed when it is a string,
    otherwise the input itself.
    """
    if not isinstance(x, str):
        return x
    return x.replace("M", "").replace("m", "")
# Run the suffix-stripping helper over every Size entry.
dataset["Size"] = dataset["Size"].apply(mb)

def convert_mb(x):
    """Convert a kilobyte-suffixed size (e.g. "512k") to megabytes.

    Values whose text contains no "k" are handed back untouched; otherwise the
    "k" is removed, the remainder is parsed as a float and divided by 1024.
    """
    if "k" not in str(x):
        return x
    kilobytes = float(x.replace("k", ""))
    return kilobytes / 1024
# Convert the kilobyte entries to megabytes.
dataset['Size'] = dataset["Size"].apply(convert_mb)

dataset["Size"]=dataset["Size"].astype("float") # cast the whole column to float

In [None]:
# Spot-check the converted Size values for the rows labelled 1 through 10.
dataset.loc[1:10,["App","Size"]]

In [None]:
# As the counts show, the conversion above introduced NaN values in Size.
dataset["Size"].value_counts(dropna=False)

In [None]:
# Fill the NaN sizes with the mean of the known sizes.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# `dataset["Size"]` operates on a selected column object, which pandas warns about
# and which does not update `dataset` once copy-on-write is active.
dataset["Size"] = dataset["Size"].fillna(dataset["Size"].mean())

# Keep two decimals, then show the resulting distribution (NaN included, to confirm none remain).
dataset["Size"] = dataset["Size"].apply(lambda x: round(x,2))
dataset["Size"].value_counts(dropna=False)

In [None]:
# Next, fix the Price column.
print(dataset.dtypes[["Price"]]) # print the dtype currently stored for Price
dataset.Price.unique()

In [None]:
def _drop_dollar(raw):
    """Return `raw` without '$' characters; values containing none pass through as-is."""
    if "$" in str(raw):
        return raw.replace("$", "")
    return raw


dataset["Price"] = dataset["Price"].map(_drop_dollar)  # remove the currency sign
dataset["Price"] = dataset["Price"].astype(float)      # then cast to float

dataset.loc[1:5, ["App", "Price"]]

In [None]:
# Reviews appears to be stored as strings; it is converted to int in the next cell.
dataset.Reviews.unique()

In [None]:
# Parse every review count as an integer, then print the resulting dtype.
dataset['Reviews'] = dataset['Reviews'].map(int)
print(dataset.dtypes[["Reviews"]])

I dropped these columns because they are unnecessary for our machine learning algorithm.

In [None]:
# Drop the three columns that are not used in the analysis (mutates `dataset` in place).
dataset.drop(labels=['Last Updated','Current Ver','Android Ver'],axis=1,inplace=True)  

For the following steps, in order to process the data with machine learning algorithms, we first need to convert it from text to numbers since, from what I understand, most algorithms run better that way.

For the categorical column, I converted each category into an individual number. In the later sections, when we apply machine learning, two methods will be used: integer encoding (which we are doing now) and one-hot encoding, a.k.a. dummy variables.

As I understand it, the main reason for trying both is that integer encoding relies on there being a relationship between the categories (e.g. think age ranges vs. types of animals). In this case, however, it is hard to determine such a relationship, hence dummy/one-hot encoding might provide better predictive accuracy.

In [None]:
## Encode each Category label as an integer
category_string = dataset["Category"]
category_val = category_string.unique()
category_valcount = len(category_val)

# Number the labels in order of first appearance: first label -> 0, second -> 1, ...
category_dict = {label: code for code, label in enumerate(category_val)}
print(category_dict)

dataset["category_int"] = dataset["Category"].map(category_dict).astype(int)
dataset.head()

In [None]:
# Binary encoding of the app type
def type_binary(x):
    """Return 0 when `x` equals 'Free' and 1 for every other value."""
    return 0 if x == 'Free' else 1

# Overwrite Type with its 0/1 encoding.
# NOTE(review): re-running this cell maps the already-encoded 0/1 values through
# type_binary again, which turns every row into 1.
dataset['Type'] = dataset['Type'].map(type_binary)
dataset.head()

**RATING**

In [None]:
# Summary statistics of the Rating column after cleaning.
dataset['Rating'].describe()

In [None]:
# 2. Distribution of ratings
rcParams['figure.figsize'] = (12, 8)

# Shaded kernel-density estimate of Rating, drawn in red.
g = sns.kdeplot(dataset.Rating, color="Red", shade=True)
g.set(xlabel="Rating", ylabel="Frequency")
plt.title('Distribution of Rating', size=20)
plt.show()

print('Average app rating = ', dataset['Rating'].mean())

The average rating of applications in the store is around 4, which is very high.
Generally, most apps do well, with an average rating of 4.17.

In [None]:
# Reminder: the index still needs to be fixed — see the labels of the last rows.
dataset.tail(10)

In [None]:
# (rows, columns) of the cleaned frame.
dataset.shape

In [None]:
# Note kept as a bare string literal: it is not executed as R, the cell merely
# evaluates to this text. It holds an R snippet for converting a saved CSV file to ARFF.
"""
R codeları csv yi arff ye cevirme!! Belgelere kaydet csv yi

library("foreign")
data=read.csv("example.csv",header=TRUE)
write.arff(x=data ,file= "file.arff")
"""


In [None]:
# Re-label the rows 1..N. N is taken from the frame itself: the previous
# hard-coded range(1, 8191) raised a length-mismatch error whenever the cleaned
# data did not contain exactly 8190 rows.
index = list(range(1, len(dataset) + 1))
dataset.index = index
# dataset.reset_index() could be used as well.
dataset.tail()
# Alternatively, set_index can turn another column into the index.


Pricing Strategy - Free Vs Paid?

How do app prices impact app rating?

In [None]:
# Keep only the apps that cost money, then compare their price with their rating.
paid_apps = dataset[dataset.Price>0]
# x / y / data are passed by keyword: current seaborn releases reject the
# positional form, while the keyword form is accepted by old and new versions alike.
figure = sns.jointplot(x="Price", y="Rating", data=paid_apps)


Finding

Most top-rated apps are priced between ~\$1 and ~\$30. There are only very few apps priced above \$20.
Shocking... there are apps priced above \$250! Let's quickly examine what these junk apps are.

In [None]:
# Category and name of every app priced above 200 (the "junk apps" mentioned in the notes).
dataset.loc[dataset.Price > 200, ['Category', 'App']]

1.Android market breakdown


Which category has the highest share of (active) apps in the market?

In [None]:
#1
# Count the apps in each Category and order the counts from largest to smallest.
number_of_apps_in_category = dataset["Category"].value_counts().sort_values(ascending=False)

# Pie chart of those counts; hovering over a slice shows its label and value.
pie_kwargs = dict(
    labels=number_of_apps_in_category.index,
    values=number_of_apps_in_category.values,
    hoverinfo='label+value',
)
data = [go.Pie(**pie_kwargs)]

# NOTE(review): `filename` presumably names the figure when it is exported — confirm against the plotly docs.
plotly.offline.iplot(data, filename="Actvie_Category")


The Game and Family categories contain the most applications in the store. As seen on the right, the categories are sorted from largest to smallest.

2.Current pricing trend - How to price your app?


In [None]:
# Build a sub-frame holding only the apps of the eight categories of interest.
subset_df = dataset[
    dataset.Category.isin([
        'GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL',
        'TOOLS', 'FINANCE', 'LIFESTYLE', 'BUSINESS',
    ])
]

sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(15, 8))
title = ax.set_title('App pricing trend across categories')
# jitter=True spreads overlapping points so individual apps remain visible.
p = sns.stripplot(x="Price", y="Category", data=subset_df, jitter=True, linewidth=1, ax=ax)

In [None]:
# Same strip plot, restricted to apps priced below 100.
subset_df_price = subset_df[subset_df.Price < 100]

fig, ax = plt.subplots(figsize=(15, 8))
p = sns.stripplot(x="Price", y="Category", data=subset_df_price, jitter=True, linewidth=1, ax=ax)
title = ax.set_title('App pricing trend across categories - after filtering for junk apps')


Clearly, Medical and Family apps are the most expensive. Some medical apps cost up to 80\$.

All other apps are priced under 30\$.

Surprisingly, all game apps are reasonably priced below 20\$.

In [None]:
# Info
# Group the main frame by Category and Type and count the apps in each pair;
# the count is stored in a column named App.
new_dataset = (
    dataset
    .groupby(['Category', 'Type'])['App']
    .count()
    .reset_index()
)
#print(new_dataset)




Rating vs Reviews

In [None]:
rcParams['figure.figsize'] = (11.7, 8.27)

# Shaded kernel-density estimate of the review counts, drawn in green.
g = sns.kdeplot(dataset.Reviews, color="Green", shade=True)
g.set(xlabel="Reviews", ylabel="Frequency")
plt.title('Distribution of Reveiw', size=20)

Most applications in this store have fewer than 1M reviews.

Obviously, well-known applications have a lot of reviews.

In [None]:
# Joint distribution of review count and rating.
# jointplot builds its own figure, so the former plt.figure(...) call only produced
# an extra empty figure and has been removed; the figure size is set through
# `height`, the current name of the deprecated `size` argument.
g = sns.jointplot(x="Reviews", y="Rating", color='orange', data=dataset, height=8);

In [None]:
# Scatter plot with a fitted regression line, limited to apps with fewer than 1,000,000 reviews.
plt.figure(figsize = (10,10))
sns.regplot(x="Reviews", y="Rating", color = 'darkorange',data=dataset[dataset['Reviews']<1000000]); # regression line
plt.title('Rating VS Reveiws',size = 20)

It seems that well-known applications get a good rating.

# Rating vs Installs

In [None]:
# Scatter plot with a fitted regression line of Rating against Installs for the whole frame.
plt.figure(figsize = (10,10))
sns.regplot(x="Installs", y="Rating", color = 'teal',data=dataset);
plt.title('Rating VS Installs',size = 20)

It seems that the number of installs affects the rating.


Correlation

In [None]:
# f is the figure and ax its single axes; figsize sets the size of the rendered image (18 x 18 inches).
f, ax = plt.subplots(figsize=(18, 18))

# Correlate the numeric columns only: the frame still holds text columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of skipping them.
numeric_corr = dataset.select_dtypes(include="number").corr()

# annot=True prints the coefficient in each cell, fmt=".1f" keeps one decimal,
# linewidths sets the width of the lines separating the cells.
sns.heatmap(numeric_corr, annot=True, linewidths=.8, fmt=".1f", ax=ax)
plt.show()