### **Predict Ratings Using Regression Models**

Tasks:   
1. Handle Null data

2. Format columns and values

3. Drop unwanted rows

4. Perform univariate analysis

5. Find percentiles   

6. Perform bivariate analysis   

7. Observe predictors and target variable

8. Regressions


In [None]:
# Using the Python 3 environment with analytics libraries 
#as defined by the kaggle/python docker 

import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
#other libraries
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

import sys
import warnings

# Silence all warnings for the rest of the notebook, but only when
# sys.warnoptions is empty, so explicitly requested warning options still win.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn import preprocessing

import statsmodels.api as sm 


### 1- Null values

In [None]:
# Load the Google Play Store CSV into df1 and display it.
df1=pd.read_csv('../input/playstore-analysis/googleplaystore.csv')
df1

In [None]:
#are there NaNs?
df1.isnull().values.any()

In [None]:
#how many NaNs?
df1.isnull().sum().sum()

In [None]:
#which columns have NaNs?
df1.isna().any()

In [None]:
# Discard every row that contains at least one missing value
df2 = df1.dropna(axis=0, how='any')

# Check how many rows and columns remain
df2.shape

### 2-Variable format

In [None]:
# Normalise the Size column to a single numeric unit.

# 'Varies with device' carries no number; map it to '0k' so it parses as 0.
size_text = df2['Size'].replace(['Varies with device'], '0k')
df2['Size'] = size_text

# Keep the trailing unit letter in a helper column.
df2['Measure'] = size_text.str[-1:]

# Strip the unit letter and convert what is left to float.
size_number = size_text.replace({'M':'', 'k':'', 'K':''}, regex=True).astype(float)

# Values given in 'M' are multiplied by 1000 so every row uses the 'k' scale.
is_mega = df2['Measure'] == 'M'
df2['Size'] = np.where(is_mega, size_number * 1000, size_number)


In [None]:
# The helper unit column is no longer needed once Size is numeric
df3 = df2.drop(columns=['Measure'])

In [None]:
# Reviews: convert to a numeric dtype
df3['Reviews'] = df3['Reviews'].astype(float)

# Installs: strip the thousands separators and the trailing '+'.
# regex=True is passed explicitly because the default for Series.str.replace
# changed to literal matching in pandas 2.0; without it the character class
# would be searched for verbatim, nothing would be removed, and the later
# astype(int) would fail.
df3['Installs'] = df3['Installs'].str.replace(r'[,+]', '', regex=True)

In [None]:
#change datatype for Installs column
df3['Installs']=df3['Installs'].astype(int)

In [None]:
# Price: remove the currency sign, then convert to float.
# The '$' is matched literally (regex=False) so the result does not depend on
# the pandas version's default for the `regex` argument.
df3['Price'] = df3['Price'].str.replace('$', '', regex=False)
df3['Price'] = df3['Price'].astype(float)

In [None]:
#verify
df3.head()

### 3-Remove Unwanted Rows 

In [None]:
# Delete all rows with Ratings outside the 1-5 range.
# A value cannot be below the range AND above it at the same time, so the two
# conditions must be combined with OR; the lower bound is 1 to match the range.
indexOut = df3[(df3['Rating'] < 1) | (df3['Rating'] > 5)].index
df3.drop(indexOut, inplace=True)

In [None]:
#verify
df3.shape

In [None]:
# Keep only rows whose review count does not exceed the install count
reviews_within_installs = df3['Reviews'].le(df3['Installs'])
df3 = df3[reviews_within_installs]

In [None]:
#verify
df3.shape

In [None]:
# Get the indexes of rows marked as 'Free' that nevertheless have a price over 0.
# The comparison is "> 0" so that any positive price is caught, not only
# prices of 0.1 and above.
indexOut2 = df3[(df3['Price'] > 0) & (df3['Type'] == 'Free')].index

# Drop these rows
df3.drop(indexOut2, inplace=True)
# Verify
df3.shape

### 4-Univariate analysis

In [None]:
# Look for possible outliers: box plots of the Price and Reviews columns
outlier_columns = ['Price', 'Reviews']
boxplot = df3.boxplot(rot=45, column=outlier_columns)

In [None]:
#verify individually
sns.boxplot(y=df3["Price"])

In [None]:
#Price Histogram
df3.Price.plot.hist(color='salmon', figsize=(8,8))

In [None]:
df3["Price"].unique()

In [None]:
sns.boxplot(y=df3["Reviews"])

In [None]:
df3["Reviews"].describe()

In [None]:
df3["Reviews"].value_counts().head(10)

In [None]:
# Histogram of Rating (36 bins) with a kernel density curve drawn on top
sns.distplot(
    df3['Rating'],
    bins=36,
    hist=True,
    kde=True,
    color='darkgreen',
    kde_kws={'linewidth': 4},
    hist_kws={'edgecolor': 'black'},
)

In [None]:
# Histogram of Size (20 bins) with a kernel density curve drawn on top
sns.distplot(
    df3['Size'],
    bins=20,
    hist=True,
    kde=True,
    color='darkorange',
    kde_kws={'linewidth': 4},
    hist_kws={'edgecolor': 'black'},
)

**Outliers**   
1- Price   
Apps priced over $100 seem to be outliers.   
2- Reviews   
The mean of Reviews is 514760.6, with a standard deviation of 3146169.   
The standard deviation is several times larger than the mean, which points to a very wide spread of values and possibly several outliers.   
3- Rating   
Most apps lean towards HIGH ratings.   
4- Size   
Most apps' sizes are below 20,000 (in "k" units, after the M-to-k conversion above).   


In [None]:
# Keep only apps priced below 200
price_below_limit = df3['Price'].lt(200)
df4 = df3[price_below_limit]
# Check the remaining shape
df4.shape

In [None]:
# Keep only apps with at most 2 million reviews
reviews_within_limit = df4['Reviews'].le(2_000_000)
df4 = df4[reviews_within_limit]
# Check the remaining shape
df4.shape

In [None]:
# Drop rows with 100,000,000 and more Installs.
# The comparison is strict so that rows with exactly 100,000,000 installs are
# removed as well, as the description says.
df4 = df4[df4['Installs'] < 100000000]
# Verify
df4.shape

### 5- Percentiles

In [None]:
# Numeric columns used for the percentile and box-plot cells below.
# NOTE(review): this looks like a separate frame from df4, so rows dropped from
# df4 in place afterwards would presumably not disappear from `percentiles` — confirm.
percentiles=df4[['Rating','Reviews','Size','Installs','Price']]

In [None]:
# 10, 25, 50, 70, 90, 95, 99 percentiles, computed separately for each column.
# Calling np.percentile on the whole frame flattens all five columns into one
# array, which mixes differently scaled variables (Rating, Reviews, Size,
# Installs, Price) into a single meaningless number per percentile.
percentile_labels = [
    (10, "10th percentile : "),
    (25, "25th percentile : "),
    (50, "50th percentile : "),
    (70, "70th percentile : "),
    (90, "90th percentile : "),
    (95, "95th percentile : "),
    (99, "99th percentile  : "),
]

for q, label in percentile_labels:
    print(label)
    # DataFrame.quantile expects a fraction in [0, 1] and returns one value per column.
    print(percentiles.quantile(q / 100))

### 6- Bivariate analysis

In [None]:
#find other outliers with boxplots 
percentiles.boxplot(rot=50)

In [None]:
percentiles.boxplot(column=['Rating'])

In [None]:
# Treat ratings below 3.5 as outliers and remove those rows in place
indexOut3 = df4.index[df4['Rating'].lt(3.5)]
df4.drop(index=indexOut3, inplace=True)

In [None]:
df4.shape

In [None]:
percentiles.boxplot(column=['Size'])

In [None]:
percentiles.boxplot(column=['Price'], figsize=(7,7))

In [None]:
# Treat prices above 40 as outliers and remove those rows in place
indexOut4 = df4.index[df4['Price'].gt(40)]
df4.drop(index=indexOut4, inplace=True)
# Check the remaining shape
df4.shape

In [None]:
percentiles.boxplot(column=['Installs'])

In [None]:
# Treat 100,000,000 or more installs as outliers and remove those rows in place
indexOut5 = df4.index[df4['Installs'].ge(100000000)]
df4.drop(index=indexOut5, inplace=True)
# Check the remaining shape
df4.shape

### 7- Possible Predictors: 'App', 'Category', 'Reviews', 'Size', 'Installs', 'Type','Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver','Android Ver'      
Target variable: 'Rating'

In [None]:
# Correlation heatmap of the numeric columns.
# df4 still contains text columns (App, Category, Type, ...); DataFrame.corr()
# raises on those in pandas 2.x, so restrict the frame to numeric dtypes first.
corr1 = df4.select_dtypes(include='number').corr()
sns.heatmap(corr1, cmap="YlOrBr")

In [None]:
sns.catplot(x="Type", y="Rating", data=df4)

In [None]:
# 'Android Ver' and Rating
c=sns.catplot(x="Android Ver", y="Rating", data=df4, palette ='icefire', height=5, aspect=6)

In [None]:
#Scatter plot for Rating vs. Price
plt.figure(figsize=(10, 9))
sns.scatterplot(
    data=df4, x="Rating", y="Price", hue="Rating", size="Rating",
    sizes=(20, 200), hue_norm=(0, 5), legend="full")

In [None]:
#Scatter plot for Rating vs. Size
plt.figure(figsize=(10, 9))
sns.scatterplot(
    data=df4, x="Rating", y="Size", hue="Rating", size="Rating",
    sizes=(20, 200), hue_norm=(2, 7), legend="full")

In [None]:
# Scatter plot of Rating (x axis) against Reviews (y axis);
# marker colour and marker size both follow Rating.
plt.figure(figsize=(10, 9))
sns.scatterplot(
    x="Rating",
    y="Reviews",
    hue="Rating",
    size="Rating",
    data=df4,
    hue_norm=(4, 6),
    sizes=(20, 200),
    legend="full",
)

In [None]:
# 'Content Rating' and Rating
c=sns.catplot(x="Content Rating", y="Rating", data=df4, palette ='icefire', height=3.5, aspect=5)

In [None]:
plt.figure(figsize=(12, 5))
r=sns.boxplot(x ='Content Rating', y ='Rating', data = df4, palette ='cubehelix') 
plt.setp(r.get_xticklabels(), rotation=45)

In [None]:
# Plot category in relation to rating
plt.figure(figsize=(15, 6))
g=sns.boxplot(x ='Category', y ='Rating', data = df4, palette ='plasma') 
plt.setp(g.get_xticklabels(), rotation=70)


In [None]:
# Bar plot of Rating for each value of Genres, with the x labels rotated to stay readable
plt.figure(figsize=(16, 8))
g=sns.barplot(x ='Genres', y ='Rating', data = df4, palette ='Set2') 
plt.setp(g.get_xticklabels(), rotation=90)

### **Observations**:   
1. App, Last Updated, Current Ver, and Genres have too many unique entries to plot clearly.   
2. There may be relationships between Rating and Type, Rating and Content Rating, and Rating and Android Ver, but none is obvious.      
3. Price and Rating:         
-Most data is concentrated within Ratings of 4.4 to 5.0 and Prices between 0 and 10.   
-A higher price does not mean better ratings.   
4. Rating and Size:   
-Entries seem evenly distributed across sizes.   
-Heavier apps rate around 4.4 to 4.8.   
-Lighter apps rate better than heavier ones.   
5. Rating and Reviews:   
-A larger number of reviews does not always mean better ratings.    
6. Rating and Content Rating:  
-There does not seem to be a difference in Ratings between Content Ratings.   
7. Category and Rating:   
-No visible differences in Ratings between categories.  
8. Genres and Rating:   
-Comics/Creativity and Board Pretend Play rate highest.



### 8- Regression

In [None]:
df4.describe()

In [None]:
df4.plot(x='Reviews', y='Rating', style='+')  
plt.title('Reviews and Rating')  
plt.xlabel('Reviews')  
plt.ylabel('Rating')  
plt.show()

In [None]:
# Predictor (Reviews) and target (Rating) as column vectors of shape (n, 1)
X = df4['Reviews'].values[:, np.newaxis]
y = df4['Rating'].values[:, np.newaxis]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [None]:
reg1 = LinearRegression()  
reg1.fit(X_train, y_train) #training the algorithm

In [None]:
reg1.intercept_

In [None]:
reg1.coef_

In [None]:
y_pred = reg1.predict(X_test)

In [None]:
dfReg1 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
dfReg1

In [None]:
top10=dfReg1.head(10)
top10.plot(kind='bar',figsize=(8,10))
plt.grid(which='major', linestyle='-', linewidth='0.3', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
plt.scatter(X_test, y_test,  color='gray')
plt.plot(X_test, y_pred, color='purple', linewidth=2)
plt.show()

In [None]:
'Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)

In [None]:
'Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred)

In [None]:
'Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
#Price variable
df4.plot(x='Price', y='Rating', style='*', color='darkorange')  
plt.title('Price and Rating')  
plt.xlabel('Price')  
plt.ylabel('Rating')  
plt.show()

In [None]:
# Simple linear regression of Rating on Price.
# This section plots Price against Rating, so the predictor has to be the
# Price column; using Reviews here would just repeat the first regression
# with a different train/test split.
X = df4['Price'].values.reshape(-1,1)
y = df4['Rating'].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
reg3 = LinearRegression()
reg3.fit(X_train, y_train) #training the algorithm

In [None]:
reg3.intercept_

In [None]:
reg3.coef_

In [None]:
y_pred = reg3.predict(X_test)

In [None]:
dfReg3 = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
dfReg3

In [None]:
top10=dfReg3.head(10)
top10.plot(kind='bar',figsize=(8,10))
plt.grid(which='major', linestyle='-', linewidth='0.3', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
plt.scatter(X_test, y_test,  color='navy')
plt.plot(X_test, y_pred, color='salmon', linewidth=2)
plt.show()

In [None]:
'Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)

In [None]:
'Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred)

In [None]:
'Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
#ordinary least squares (OLS)
#is a linear least squares method to estimate the unknown parameters in a linear regression model

X = df4[['Reviews' , 'Installs']]
y = df4['Rating']

# sm.OLS does not include an intercept unless one is supplied. Without it the
# fit is forced through the origin (Rating = 0 when Reviews = Installs = 0),
# which distorts the coefficients and the reported R-squared.
X_const = sm.add_constant(X)

model = sm.OLS(y, X_const).fit()
predictions = model.predict(X_const)
model.summary()

In [None]:
dfReg2 = pd.DataFrame({'Actual': y, 'Predicted': predictions})
dfReg2

In [None]:
top10=dfReg2.head(10)
top10.plot(kind='bar',figsize=(8,10))
plt.grid(which='major', linestyle='-', linewidth='0.3', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

### KNN

In [None]:
#Encode label data
#create labelEncoder
# The same encoder object is re-fitted for every column below; only the
# returned integer arrays are kept.
le = preprocessing.LabelEncoder()

# Converting string labels into numbers.
Genres_enc =le.fit_transform(df4.Genres)
Cat_enc =le.fit_transform(df4.Category)

#verify
Genres_enc

In [None]:
Cat_enc

In [None]:
#convert other label columns
Type_enc =le.fit_transform(df4['Type'])

In [None]:
Type_enc

In [None]:
##Rating encoding
#Ratings are FLOAT and need to be non-continuous for KNN
# Each distinct Rating value becomes its own integer class label.

Rate_enc=le.fit_transform(df4['Rating'])

#categorize Installs
Ins_enc=le.fit_transform(df4['Installs'])


# Encode the 'Android Ver' strings as integer codes
AndrdV_enc=le.fit_transform(df4['Android Ver'])

In [None]:
AndrdV_enc

In [None]:
Rate_enc

In [None]:
Ins_enc

In [None]:
# Combine the encoded Genres, Category, Type, Installs and Android Ver values
# into one tuple per row
features=list(zip(Genres_enc, Cat_enc, Type_enc, Ins_enc, AndrdV_enc))

In [None]:
features

In [None]:
#scale the features
# NOTE(review): X at this point is still the Reviews/Installs frame assigned in
# the OLS cell, not the encoded `features` list built above — confirm which
# data this scaler was meant to be fitted on.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)


In [None]:
X = scaler.transform(X)

In [None]:
#model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Create feature and target arrays.
# The encoded features are turned into a float matrix so they can be scaled.
X = np.asarray(features, dtype=float)
y = Rate_enc

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(
             X, y, test_size = 0.4, random_state=10)

# KNN is distance based, so every feature is standardised first; otherwise
# the columns with the widest range of codes dominate the distance.
# The scaler is fitted on the training rows only so that no information from
# the test rows leaks into the model.
knn_scaler = StandardScaler().fit(X_train)
X_train = knn_scaler.transform(X_train)
X_test = knn_scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=6)

knn.fit(X_train, y_train)

# Predict on dataset which model has not seen before
print(knn.predict(X_test))


In [None]:
# Calculate the accuracy of the model 
print(knn.score(X_test, y_test)) 

In [None]:
# Generate plot 
plt.figure(figsize=(12, 6))
plt.plot(X_test) 
plt.plot(X_train)   

plt.show() 

There is no obvious relationship between the Ratings of apps and the variables offered here.