## Google App Rating Data

In [None]:
import pandas as pd
import matplotlib.pyplot as map
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the Google Play Store CSV export into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the notebook on another machine.
df = pd.read_csv("/Users/vaibhavk/Projects/Python Project/googleplaystore.csv")
# Preview the first rows.
df.head()

### Removing Nulls from the dataset

In [None]:
# Number of missing Size entries before filling.
df["Size"].isnull().sum()
# Forward-fill missing sizes from the previous row. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the
# selected Series, so the change is guaranteed to land in df itself, and
# ffill() replaces the deprecated fillna(method='pad') spelling.
df["Size"] = df["Size"].ffill()
# Number of missing Size entries left after the fill.
df["Size"].isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Drop every row that still contains a missing value. The explicit copy gives
# nonNulldf its own data, so the column assignments made later in the notebook
# act on an independent frame rather than on something derived from df (the
# same pattern the outlier-filtering cells use with `.copy()`).
nonNulldf = df.dropna().copy()
# (rows, columns) remaining after the drop.
nonNulldf.shape

In [None]:
# Inspect the Size column before it is converted to numbers.
nonNulldf.Size

### Convert the Size column to numeric KB (MB values are multiplied by 1000)

In [None]:
def update(size):
    """Convert a size label to kilobytes.

    Args:
        size: Size label ending in ``M`` (megabytes, e.g. ``"19M"``) or
            ``k`` (kilobytes, e.g. ``"201k"``).

    Returns:
        The size in KB as a float (megabyte values are multiplied by 1000),
        or ``None`` when the value is not a string, carries no recognised
        unit suffix, or has a numeric part that cannot be parsed.
    """
    # Missing values (None/NaN) are not strings; report them as unknown
    # instead of raising TypeError on the suffix check.
    if not isinstance(size, str):
        return None

    label = size.strip()
    # Look at the final character only, so free text that merely contains an
    # 'M' or a 'k' somewhere is not mistaken for a size.
    if label.endswith('M'):
        factor = 1000
    elif label.endswith('k'):
        factor = 1
    else:
        return None

    try:
        return float(label[:-1]) * factor
    except ValueError:
        # Suffix matched but the rest is not a number: same "unknown" result
        # as an unrecognised label.
        return None

# Replace every Size label with its numeric value as returned by update().
nonNulldf["Size"] = nonNulldf["Size"].map(update)

In [None]:
# Inspect the Size column after mapping it through update().
nonNulldf.Size

### Reformatting the columns and converting them to float

In [None]:
# Cast the review counts to float.
nonNulldf['Reviews'] = nonNulldf['Reviews'].astype('float')

# Strip thousands separators and plus signs from Installs, then cast.
# regex=False keeps ',' and '+' literal characters rather than patterns.
nonNulldf['Installs'] = (
    nonNulldf['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype('float')
)

# Strip the dollar sign from Price, then cast.
nonNulldf['Price'] = (
    nonNulldf['Price'].str.replace('$', '', regex=False).astype('float')
)

In [None]:
# Confirm the column dtypes after the conversions.
nonNulldf.dtypes

### Sanity Checks

In [None]:
# Despite the name, True marks a rating ABOVE 5; the counts show how many
# such rows exist.
validRating = nonNulldf["Rating"].gt(5)
validRating.value_counts()

In [None]:
# True marks rows with more reviews than installs.
installs = nonNulldf["Reviews"].gt(nonNulldf["Installs"])
installs.value_counts()

In [None]:
# True marks rows typed 'Free' that nevertheless carry a positive price.
price = nonNulldf["Price"].gt(0) & nonNulldf["Type"].eq('Free')
price.value_counts()

### Box Plots

In [None]:
# Box plot of Price to expose extreme values.
boxplot = nonNulldf.boxplot(column=['Price'])

In [None]:
# Box plot of Reviews to expose extreme values.
boxplot = nonNulldf.boxplot(column=['Reviews'])

### Histograms

In [None]:
# Distribution of Rating.
histogram = nonNulldf.hist(column=['Rating'])

In [None]:
# Distribution of the numeric Size values.
histogram = nonNulldf.hist(column=['Size'])

### Remove suspiciously high values (outlier treatment)

In [None]:
# Keep only rows priced below 200; copy() detaches the result from the
# frame it was filtered from.
nonNulldf = nonNulldf.loc[nonNulldf["Price"].lt(200)].copy()
print(nonNulldf.shape)

In [None]:
# True marks rows with more than 2,000,000 reviews.
highReviews = nonNulldf["Reviews"].gt(2000000)
highReviews.value_counts()

In [None]:
# Keep rows with at most 2,000,000 reviews, detach the result, then drop any
# row that has a missing value in any column.
nonNulldf = nonNulldf.loc[nonNulldf["Reviews"].le(2000000)].copy().dropna()
print(nonNulldf.shape)

In [None]:
# Numeric columns used for the percentile report.
percentiles = nonNulldf.loc[:, ['Rating', 'Reviews', 'Size', 'Installs', 'Price']]

In [None]:
# Percentile levels to report.
levels = (10, 25, 50, 70, 90, 95, 99)
# axis=0 computes each percentile per column. Without it NumPy flattens the
# whole frame and pools every column's values into a single distribution,
# which yields one number per level that describes no actual column.
per_column = pd.DataFrame(
    np.percentile(percentiles, levels, axis=0),
    index=["{}th percentile".format(q) for q in levels],
    columns=percentiles.columns,
)
print(per_column)

### Bivariate Analysis

In [None]:
plt.figure(figsize=(25,8))
# Rating against Price, coloured by Rating. x and y are passed by keyword:
# recent seaborn releases accept them only as keyword arguments, and the
# keyword form also works on older releases.
sns.scatterplot(x=nonNulldf.Price, y=nonNulldf.Rating, hue=nonNulldf.Rating)
plt.show()

In [None]:
plt.figure(figsize=(25,8))
# Rating against Size, coloured by Rating. x and y are passed by keyword:
# recent seaborn releases accept them only as keyword arguments, and the
# keyword form also works on older releases.
sns.scatterplot(x=nonNulldf.Size, y=nonNulldf.Rating, hue=nonNulldf.Rating)
plt.show()

In [None]:
plt.figure(figsize=(25,8))
# Rating against Reviews, coloured by Rating. x and y are passed by keyword:
# recent seaborn releases accept them only as keyword arguments, and the
# keyword form also works on older releases.
sns.scatterplot(x=nonNulldf.Reviews, y=nonNulldf.Rating, hue=nonNulldf.Rating)
plt.show()

In [None]:
plt.figure(figsize=(25,8))
# Rating distribution per Content Rating group. x and y are passed by
# keyword: recent seaborn releases accept them only as keyword arguments,
# and the keyword form also works on older releases.
sns.boxplot(x=nonNulldf["Content Rating"], y=nonNulldf["Rating"])
plt.show()

In [None]:
plt.figure(figsize=(25,8))
# Rating distribution per Category. x and y are passed by keyword: recent
# seaborn releases accept them only as keyword arguments, and the keyword
# form also works on older releases.
sns.boxplot(x=nonNulldf.Category, y=nonNulldf.Rating)
# Draw the category labels vertically.
plt.xticks(fontsize=18,rotation='vertical')
plt.show()

### Data Preprocessing

In [None]:
# Work on a copy so the preprocessing below leaves nonNulldf untouched.
inp1=nonNulldf.copy()

In [None]:
# Compress the Installs and Reviews columns with log(1 + x), which is
# defined at zero.
inp1["Installs"] = np.log1p(inp1["Installs"])
inp1["Reviews"] = np.log1p(inp1["Reviews"])

# Histograms of the two transformed columns.
inp1.hist(column=['Installs', 'Reviews'])

In [None]:
# Remove the columns that are not used as model inputs.
inp1 = inp1.drop(columns=["App", "Last Updated", "Current Ver", "Android Ver"])
inp1.head(5)

In [None]:
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each to avoid redundant indicator columns. dtype=float keeps the whole
# design matrix numeric: newer pandas versions otherwise emit boolean
# indicators, and a mixed bool/float frame is cast to object dtype when it is
# handed to statsmodels.
inp1=pd.get_dummies(inp1,drop_first=True,dtype=float)
# inp2 is another name for the same frame, not a copy.
inp2=inp1
inp2.head(5)

In [None]:
# Select the target by name rather than by position, so the split stays
# correct even if the column order of inp2 changes.
# y is kept as a one-column DataFrame; x is every other column.
x=inp2.drop(columns=["Rating"])
y=inp2[["Rating"]]

### Split Data Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.30, random_state=1)

### Linear Regression Model Training

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings.
regressor=LinearRegression()
# Fit on the training split; fit() returns the estimator itself, so model and
# regressor refer to the same fitted object.
model=regressor.fit(x_train, y_train)

# Predictions for the held-out test split.
y_pred=regressor.predict(x_test)

In [None]:
from statsmodels.api import OLS
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as ms

### OLS Model Summary

In [None]:
# statsmodels' OLS does not add an intercept on its own. Adding an explicit
# constant column makes this summary describe the same kind of model as the
# LinearRegression fit, which estimates an intercept by default.
from statsmodels.api import add_constant

summ=OLS(y_train, add_constant(x_train)).fit()
summ.summary()