## EXPLORATORY DATA ANALYSIS OF GOOGLE APPS

## STEPS:
### IMPORT THE MODULES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### READ DATA

In [None]:
gdata = pd.read_csv('../input/googleplaystore.csv') 
# Use forward slashes in the path: a backslash inside a normal string literal
# can be parsed as an escape sequence and raise a unicode error.

In [None]:
gdata.head() # first rows of the frame; head() shows 5 rows by default

In [None]:
gdata.shape # (number of rows, number of columns)

In [None]:
gdata.describe() # summary statistics of the numeric columns

In [None]:
gdata.boxplot() # box plot of the numeric columns, used to spot outliers

In [None]:
gdata.hist() # histogram of the numeric columns

In [None]:
gdata.info() # column names, dtypes and non-null counts

### DATA CLEANING

In [None]:
gdata.isnull() # boolean frame: True wherever a value is missing

#### Count the missing values

In [None]:
gdata.isnull().sum() # number of missing values in each column

In [None]:
# The boxplot showed one outlier; cross-check how many rows have a Rating above 5
gdata[gdata["Rating"] > 5]

In [None]:
# Drop every row whose Rating is above 5. Selecting the rows by condition
# (instead of a hard-coded row label) keeps this cell correct if the file is
# re-exported with different row numbers or contains more than one such row.
gdata.drop(gdata[gdata["Rating"] > 5].index, inplace=True)

In [None]:
gdata[10470:10474] # positional slice around the dropped row, to check that it is gone

In [None]:
gdata.boxplot() # now we can see there are no outliers, all ratings are at most 5

In [None]:
gdata.hist() # the original note reads this histogram as right-skewed
# NOTE(review): if the ratings pile up near the top of the scale with a tail
# towards low values, the distribution is left (negatively) skewed -- confirm
# against the plot.

#### Drop the columns that are more than 90% empty

In [None]:
threshold = len(gdata) * 0.1 
threshold
#this is 10% of 10840 (total no. of rows), so using this value we will check which col has 10% values(i.e 90% empty)

In [None]:
gdata.dropna(thresh = threshold, axis = 1, inplace = True) #axis is 1 bcz we are doing for col

In [None]:
print(gdata.isnull().sum())

In [None]:
gdata.shape

#### There is no change observed

### Fill the null values using aggregate functions, i.e. mean, median or mode

In [None]:
def compute_median(series):
    """Return a copy of *series* with missing values replaced by its median.

    The input series is not modified.
    """
    median_value = series.median()
    filled = series.fillna(value=median_value)
    return filled

In [None]:
gdata.Rating = gdata["Rating"].transform(compute_median)

In [None]:
#check for null values again
gdata.isnull().sum()

In [None]:
# Look at the most frequent value(s) of Type, Current Ver and Android Ver
# before using them to fill the missing entries
for column in ("Type", "Current Ver", "Android Ver"):
    print(gdata[column].mode())

In [None]:
# fill the missing values
gdata["Type"].fillna(str(gdata["Type"].mode().values[0]), inplace=True)
gdata["Current Ver"].fillna(str(gdata["Type"].mode().values[0]), inplace=True)
gdata["Android Ver"].fillna(str(gdata["Type"].mode().values[0]), inplace=True)

In [None]:
gdata.isnull().sum()

In [None]:
# convert the attributes like: Price (which is in $), Reviews and Installs to numeric (without symbols also) 
gdata["Price"] = gdata["Price"].apply(lambda x: str(x).replace("$", '') if "$" in str(x) else str(x))
gdata["Price"] = gdata["Price"].apply(lambda x: float(x))
gdata["Reviews"] = pd.to_numeric(gdata["Reviews"], errors= "coerce")
gdata["Installs"] = gdata["Installs"].apply(lambda x: str(x).replace("+", '') if "+" in str(x) else str(x))
gdata["Installs"] = gdata["Installs"].apply(lambda x: str(x).replace(",", '') if "," in str(x) else str(x))
gdata["Installs"] = gdata["Installs"].apply(lambda x: float(x))

In [None]:
gdata.head(10)

In [None]:
gdata.describe() #previously we had only Rating, now we have 4 columns

### DATA VISUALIZATION

In [None]:
# Per-category aggregates used by the plots that follow.
# The native groupby reductions are used instead of agg(np.mean) / agg(np.sum):
# passing NumPy callables to agg is deprecated in recent pandas and emits a
# FutureWarning, while the results are the same.
grp = gdata.groupby("Category")
x = grp["Rating"].mean()    # average rating per category
y = grp["Price"].sum()      # total price per category
z = grp["Reviews"].mean()   # average number of reviews per category
w = grp["Installs"].mean()  # average number of installs per category

print(x, "\n")
print(y, "\n")
print(z, "\n")
print(w)

In [None]:
# Average rating per category, drawn as unconnected red circle markers
plt.figure(figsize=(15, 5))
plt.plot(x, marker="o", linestyle="None", color="r")
plt.title("Category wise Rating")
plt.xlabel("Categories-->")
plt.ylabel("Rating-->")
plt.xticks(rotation=90)  # rotate the category tick labels by 90 degrees
plt.show()

### We can see that Events apps have the highest average rating.

In [None]:
# Total price per category as a red dashed line.
# The colour is given only through the `color` keyword: specifying it in the
# format string as well ("r--") makes matplotlib warn that the colour is
# redundantly defined.
plt.figure(figsize=(15,5))
plt.plot(y, "--", color="r")
plt.xticks(rotation = 90)
plt.title("Category wise Price")
plt.ylabel("Price-->")
plt.xlabel("Categories-->")
plt.show()

### We can see that Finance apps have the highest total price (the plot shows the sum of prices per category).

In [None]:
# Average number of reviews per category as red triangle markers.
# The format string used to say green ("g^") while the keyword said red; the
# keyword takes precedence, so the markers were always red. The conflicting
# colour is removed from the format string to silence matplotlib's warning
# without changing the rendered plot.
plt.figure(figsize=(15,5))
plt.plot(z, "^", color="r")
plt.xticks(rotation = 90)
plt.title("Category wise Reviews")
plt.ylabel("Reviews-->")
plt.xlabel("Categories-->")
plt.show()

### We can see that the three categories with the most reviews per app on average are Communication, Game and Social apps.

In [None]:
# Average number of installs per category as red square markers.
# The format string used to say blue ("bs") while the keyword said red; the
# keyword takes precedence, so the markers were always red. The conflicting
# colour is removed from the format string to silence matplotlib's warning
# without changing the rendered plot.
plt.figure(figsize=(15,5))
plt.plot(w, "s", color="r")
plt.xticks(rotation = 90)
plt.title("Category wise Installs")
plt.ylabel("Installs-->")
plt.xlabel("Categories-->")
plt.show()

### We can see that Communication apps have the highest average number of installs per app.