In [1]:
# Data analysis packages
import pandas as pd
import numpy as np
import re

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# NLP packages
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from textblob import Word, TextBlob

# Scaling
from sklearn.preprocessing import scale

In [None]:
#****************************************************
# Data Import
#****************************************************

# Load the app catalogue ('Last Updated' parsed as dates) and the user
# reviews. Rows with any missing value are discarded and the remaining rows
# are renumbered from 0.
data = (
    pd.read_csv('googleplaystore.csv', parse_dates=['Last Updated'])
    .dropna()
    .reset_index(drop=True)
)

data_reviews = (
    pd.read_csv('googleplaystore_user_reviews.csv')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
#****************************************************
# Data Cleaning
#****************************************************

# Cleaning installs column
# Raw values carry a trailing '+' and thousands separators; remove both and
# store a real integer dtype (the element-wise loop used previously left an
# object column of Python ints). Casting to str first keeps this cell
# re-runnable once the column has already been converted.
data['Installs'] = (
    data['Installs']
    .astype(str)
    .str.replace(r'[+,]', '', regex=True)
    .astype('int64')
)

# Cleaning last update column
# Keep only the year. Skipped when the column already holds integers (i.e.
# the cell was run before), because integers would otherwise be re-read as
# epoch offsets.
if not pd.api.types.is_integer_dtype(data['Last Updated']):
    data['Last Updated'] = pd.to_datetime(data['Last Updated']).dt.year

# Removing columns which are not required
# errors='ignore' makes a second run of this cell a no-op instead of a KeyError.
data = data.drop(columns=['Current Ver', 'Android Ver', 'Genres'], errors='ignore')

In [None]:
#**********************************************
# Exploratory Data Analysis
#**********************************************

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line.
data.info()

In [None]:
# Horizontal bar chart of the number of apps per category, largest first.
# The counts Series is used directly (values = counts, index = category
# names) instead of reset_index(): the column names reset_index() produces
# for a value_counts() result differ between pandas 1.x and 2.x, so looking
# them up by name ('index' / 'Category') is not portable.
category_counts = data['Category'].value_counts()

fig = plt.figure(figsize=(12,12))
ax1 = plt.axes()
sns.barplot(x=category_counts.to_numpy(),
            y=category_counts.index.to_numpy(),
            color='b', ax=ax1, orient='h')
plt.xlabel('Number of Apps')
plt.ylabel('')
plt.title('Category-wise number of Apps')
# Print each count just past the end of its bar.
for i, v in enumerate(category_counts.to_numpy()):
    ax1.text(v + 1, i + .25, str(v), color='black')
plt.show()

In [None]:
# Share of apps in each content-rating class.
content_rating_counts = data['Content Rating'].value_counts()

fig = plt.figure(figsize=(12, 8))
plt.pie(
    content_rating_counts,
    labels=content_rating_counts.index,
    shadow=True,
    autopct='%1.2f%%',
    radius=1.2,
)
plt.title('Apps with Content Rating')
plt.show()

In [None]:
# Share of apps per value of the 'Type' column.
type_counts = data['Type'].value_counts()

fig = plt.figure(figsize=(12, 8))
plt.pie(
    type_counts,
    labels=type_counts.index,
    shadow=True,
    autopct='%1.2f%%',
    radius=1,
)
plt.title('Apps by Type')
plt.show()

In [None]:
# Distribution of install counts for each app type.
x_vals = data['Type'].values
y_vals = data['Installs'].values

fig = plt.figure(figsize=(8,10))
sns.boxplot(x=x_vals, y=y_vals)
# The y axis is the Installs column, so label the figure accordingly
# (the previous title claimed it showed ratings).
plt.xlabel('Type')
plt.ylabel('Installs')
plt.title('Installs by Type')
plt.show()

In [None]:
import plotly.offline as off
import plotly.graph_objs as go
off.init_notebook_mode(connected=True)

# Total installs per category.
cat_eda = data.groupby('Category')[['Installs']].sum()

labels, values = cat_eda.index, cat_eda['Installs']

# Both traces share the same data and are drawn on top of each other: one
# puts the percentages outside the slices, the other the labels inside.
common_props = {'labels': labels, 'values': values}

trace1 = go.Pie(textinfo='percent', textposition='outside', **common_props)
trace2 = go.Pie(textinfo='label', textposition='inside', **common_props)

off.iplot([trace1, trace2], filename='basic_pie_chart')

In [None]:
#****************************************************
# Data Processing
#****************************************************

In [None]:
#**** Basic Sentiment analysis of App reviews ****#

# Work on an independent copy so adding columns never touches data_reviews.
a = data_reviews.loc[:, ['App', 'Translated_Review']].copy()

# lower case, collapsing every run of whitespace to a single space
a['Processed_review'] = (
    a['Translated_Review']
    .str.lower()
    .str.split()
    .str.join(' ')
)

# remove punctuations
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation
# in place. A raw string avoids the invalid '\w' / '\s' escape sequences.
a['Processed_review'] = a['Processed_review'].str.replace(r'[^\w\s]', '', regex=True)
a['Processed_review'].head()

def _drop_words(text, unwanted):
    """Return `text` with every whitespace-separated word found in `unwanted` removed."""
    return " ".join(word for word in text.split() if word not in unwanted)


def _word_counts(texts):
    """Return occurrence counts of every word across `texts`, most frequent first."""
    return pd.Series(' '.join(texts).split()).value_counts()


# remove stop words
# Membership is tested against a set so each lookup is O(1) rather than a
# scan of the whole stop-word list for every word of every review.
stop = stopwords.words('english')
stop_lookup = set(stop)
a['Processed_review'] = a['Processed_review'].apply(lambda text: _drop_words(text, stop_lookup))

# count freq words and remove the ten most frequent ones
freq = list(_word_counts(a['Processed_review']).head(10).index)
frequent_lookup = set(freq)
a['Processed_review'] = a['Processed_review'].apply(lambda text: _drop_words(text, frequent_lookup))

# count rare words and remove the last ten entries of the frequency table
freq = list(_word_counts(a['Processed_review']).tail(10).index)
rare_lookup = set(freq)
a['Processed_review'] = a['Processed_review'].apply(lambda text: _drop_words(text, rare_lookup))
a['Processed_review'].head()

def lem(tokens):
    """Lemmatize each token with TextBlob's Word and join them with single spaces."""
    return " ".join([Word(token).lemmatize() for token in tokens])


def polar(text):
    """Return the TextBlob polarity component of the sentiment of `text`."""
    return TextBlob(text).sentiment.polarity


# tokenization
a['tokenized_sentences'] = a['Processed_review'].map(word_tokenize)

#  Lemmatization
a['Lemmatization'] = a['tokenized_sentences'].map(lem)

# sentiment rating (polarity)
a['Polarity'] = a['Lemmatization'].map(polar)
a[['Lemmatization','Polarity']].head(10)


# One-hot sentiment flags derived from the polarity score:
# below zero -> Negative, exactly zero -> Neutral, anything else -> Positive.
x = a['Polarity'].to_numpy()

neg = (x < 0).astype('int64')
neu = (x == 0).astype('int64')
# Exactly one flag is set per row, so Positive is whatever is left over.
pos = 1 - neg - neu

a['Positive'] = pos
a['Negative'] = neg
a['Neutral'] = neu
a.head()

# Number of positive, negative and neutral reviews for every app.
reviews = a.groupby('App')[['Positive', 'Negative', 'Neutral']].sum()

In [None]:
#**********************************************
# Merging both datasets
#**********************************************

In [None]:
# Keep only apps present in both tables, drop incomplete rows, keep the
# first row for each app name and renumber the result from 0.
n_data = (
    pd.merge(data, reviews, how='inner', on='App')
    .dropna()
    .drop_duplicates(subset='App', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Persist the merged dataset as a comma-separated file, overwriting any
# previous copy. The DataFrame index is written as the first column.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory (the loading cell must be changed together with this one).
filename = '/home/rajat/Documents/Aegis/Python/Python Project/google-play-store-apps/Processed_data.csv'
n_data.to_csv(filename)

In [None]:
#**********************************************
# Loading Processed data
#**********************************************

In [None]:
# Reload the processed dataset and discard its first column.
# NOTE(review): the first column is presumably the index saved by to_csv — confirm.
processed_frame = pd.read_csv(
    '/home/rajat/Documents/Aegis/Python/Python Project/google-play-store-apps/Processed_data.csv'
)
n_data = processed_frame.iloc[:, 1:]

In [None]:
# Preview the first rows of the reloaded dataset.
n_data.head()

In [None]:
#**********************************************
# Data Analysis
#**********************************************

In [None]:
# Category-wise data: review counts and installs are summed, ratings are
# averaged; the mean rating is then rounded to one decimal and the install
# total to zero decimals.
cat_eda = (
    n_data.groupby('Category')
    .agg({
        'Positive': 'sum',
        'Neutral': 'sum',
        'Negative': 'sum',
        'Rating': 'mean',
        'Installs': 'sum',
    })
    .assign(
        Rating=lambda frame: frame.Rating.round(1),
        Installs=lambda frame: frame.Installs.round(),
    )
)

In [None]:
# Plotting percentage of reviews post sentiment analysis
barWidth = 0.85
names = cat_eda.index
col_names = ['Negative','Neutral','Positive']
r = np.arange(len(names))

# Per-category share (in percent) of each sentiment class.
totals = (cat_eda['Positive'] + cat_eda['Neutral'] + cat_eda['Negative']).to_numpy()
posi = cat_eda['Positive'].to_numpy() / totals * 100
neut = cat_eda['Neutral'].to_numpy() / totals * 100
negi = cat_eda['Negative'].to_numpy() / totals * 100

sns.set(style='whitegrid')
plt.figure(figsize=(15, 10))

# Stack the three segments left to right; `offset` is where the next segment
# starts. Drawing order matches col_names so the legend lines up.
offset = np.zeros(len(r))
for share, colour in zip((negi, neut, posi), ('tab:red', 'tab:gray', 'tab:blue')):
    plt.barh(r, share, left=offset, color=colour, edgecolor='white', height=barWidth, alpha=0.7)
    offset = offset + share

plt.yticks(r, names)
plt.xlabel('Reviews', labelpad=20)
plt.title('Categorywise Reviews', pad=30)
plt.legend(col_names, loc=4, frameon=True)
sns.despine(left=True, bottom=True)
plt.show()

In [None]:
# Preparing data for generating scores
columns = ['Rating','Installs','Reviews','Positive','Negative','Neutral']
rank = n_data.loc[:, columns]

# Standardise every column with sklearn's scale().
rank_scale = pd.DataFrame(scale(rank), columns=columns)

# Shift the whole table by the magnitude of its single smallest value so no
# entry is negative. The minimum is taken on the underlying array: calling
# np.min() on a DataFrame yields a per-column Series on pandas 1.x but a
# scalar on pandas 2.x, so the former min(np.min(...)) was version-dependent.
rank = rank_scale + abs(rank_scale.to_numpy().min())

# One array per feature, in the order of `columns`.
t_rating, t_installs, t_rev, t_posi, t_negi, t_neut = (
    rank[name].to_numpy() for name in columns
)

In [None]:
# Applying rule to find top 10
# Weighted combination of the shifted features: negative reviews count
# against an app, every other feature counts in its favour.
weighted_terms = (
    1.5 * t_rating,
    3 * t_installs,
    2 * t_posi,
    -2 * t_negi,
    t_neut,
    2 * t_rev,
)
y = sum(weighted_terms) / 6
n_data['Scores'] = y

In [None]:
# Sorting in descending order of score; Ranks numbers the rows 1..N in that order.
sorted_desc = (
    n_data.sort_values(by='Scores', ascending=False)
    .assign(Ranks=lambda frame: np.arange(1, len(frame) + 1))
)

In [None]:
# Preview the highest-scoring rows.
sorted_desc.head()

In [None]:
# Top 10 most popular: the first ten rows of the score ranking, renumbered from 0.
top_ten_by_score = sorted_desc.iloc[:10]
most_popular = top_ten_by_score.reset_index(drop=True)
most_popular

In [None]:
# Top 10 most downloaded: ten rows with the largest Installs value, renumbered from 0.
ordered_by_installs = n_data.sort_values(by='Installs', ascending=False)
most_down = ordered_by_installs.iloc[:10].reset_index(drop=True)
most_down

In [None]:
# Top 10 highly rated: ten rows with the largest Rating value, renumbered from 0.
ordered_by_rating = n_data.sort_values(by='Rating', ascending=False)
most_rated = ordered_by_rating.iloc[:10].reset_index(drop=True)
most_rated

In [None]:
#**********************************************
# For UI 
#**********************************************

In [None]:
# Creating dummy developers for demo purpose: each one is a fixed-seed random
# sample of ranked apps, renumbered from 0 and tagged with a developer id.
dev1 = (
    sorted_desc.sample(10, random_state=3)
    .reset_index(drop=True)
    .assign(DevID='dev1')
)
dev2 = (
    sorted_desc.sample(7, random_state=6)
    .reset_index(drop=True)
    .assign(DevID='dev2')
)
dev3 = (
    sorted_desc.sample(2, random_state=11)
    .reset_index(drop=True)
    .assign(DevID='dev3')
)

In [None]:
# Category-wise total scores, highest total first, with Category as a regular column.
cat_scr = (
    n_data.groupby('Category')[['Scores']].sum()
    .sort_values('Scores', ascending=False)
    .reset_index()
)

In [None]:
# Function for generating data for developer dashboard

def dev_dash(dev, static_dir='/home/rajat/Documents/Aegis/Python/Python Project/google-play-store-apps/static/'):
    """Build the dashboard summary for one developer and save their score chart.

    Parameters
    ----------
    dev : pandas.DataFrame
        The developer's apps. Must provide the columns Rating, Installs,
        Positive, Negative, Scores, Ranks, App and DevID, and contain at
        least one row. The DevID of the first row names the image file.
    static_dir : str, optional
        Directory the chart image is written to. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    dict
        'Rating' (mean rating rounded to 2 decimals), 'Downloads' (sum of
        Installs), 'Positive' and 'Negative' (summed review counts),
        'Suggestion' (first three categories of the global ``cat_scr``) and
        'Image' (file name of the saved chart, '<DevID>.png').

    Raises
    ------
    ValueError
        If ``dev`` has no rows.

    Notes
    -----
    Reads the notebook globals ``cat_scr`` and ``sorted_desc`` and resets the
    seaborn style to 'white'.
    """
    import os

    if len(dev) == 0:
        raise ValueError('dev must contain at least one app')

    # Positional access: works whatever the index labels of `dev` are
    # (label lookup with [0] required the frame to have been re-indexed).
    dev_id = dev.DevID.iloc[0]

    # Developer rating
    dev_rating = round(dev.Rating.mean(), 2)

    # Total App downloads
    dev_downloads = dev.Installs.sum()

    # Total positive reviews
    dev_pos = dev.Positive.sum()

    # Total negative reviews
    dev_neg = dev.Negative.sum()

    # Suggestions for next project
    dev_sugg = cat_scr.Category.head(3).values

    # Developer plot prep: the developer's apps ordered by score, framed by
    # the first and last rows of the overall ranking for reference.
    tb_scr = sorted_desc.Scores.values
    tb_ranks = sorted_desc.Ranks.values
    d_sort = dev.sort_values('Scores', axis=0, ascending=False)
    x_vals = ['Highest'] + list(d_sort.App.values) + ['Lowest']
    dev_ranks = (
        ['Rank ' + str(tb_ranks[0])]
        + ['Rank ' + str(rank_no) for rank_no in d_sort.Ranks.values]
        + ['Rank ' + str(tb_ranks[-1])]
    )
    y_vals = [tb_scr[0]] + list(d_sort.Scores.values) + [tb_scr[-1]]
    # Green for the reference top, red for the reference bottom, blue for the apps.
    col_dev = ['g'] + ['b'] * (len(x_vals) - 2) + ['r']

    # Developer plot
    sns.set(style='white')
    fig = plt.figure(figsize=(17,12))
    ax1 = plt.axes()
    sns.barplot(x=y_vals, y=x_vals, palette=col_dev, ax=ax1)
    # Write the rank label just beyond the end of every bar.
    for row, (score, rank_label) in enumerate(zip(y_vals, dev_ranks)):
        ax1.text(score + .1, row + .1, rank_label, color='black', fontsize=12)
    plt.yticks(rotation=10, fontsize=9)
    sns.despine(bottom=True)

    img_n = dev_id + '.png'
    img_name = os.path.join(static_dir, img_n)
    plt.savefig(fname=img_name)

    return {'Rating':dev_rating, 'Downloads':dev_downloads, 'Positive':dev_pos, 'Negative':dev_neg, 'Suggestion':dev_sugg, 'Image':img_n}

In [None]:
# Generating data for dummy developers
# Build the dashboards in developer order (each call also saves a chart image).
d1, d2, d3 = [dev_dash(dev_frame) for dev_frame in (dev1, dev2, dev3)]

In [None]:
# Show the three dashboard dictionaries, one per line.
for dashboard in (d1, d2, d3):
    print(dashboard)

In [None]:
def _rating_table(top_apps):
    """Return (full, summary) frames for one top-10 table.

    `full` is a copy of `top_apps` with a 'percentage' column holding each
    row's own Rating expressed as a percentage of the maximum rating of 5;
    `summary` keeps only the App, Rating, Installs and percentage columns.
    Both are renumbered from 0 and `top_apps` itself is left untouched.
    """
    full = top_apps.reset_index(drop=True).copy()
    full['percentage'] = full['Rating'] / 5 * 100
    summary = full[['App','Rating','Installs','percentage']].reset_index(drop=True)
    return full, summary


# Tables for the landing page. Each percentage is computed from the table's
# own ratings (the most-downloaded and highest-rated tables previously reused
# the ratings of the most-popular table).
D1, D2 = _rating_table(most_popular)
D3, D4 = _rating_table(most_down)
D5, D6 = _rating_table(most_rated)
D6

In [None]:
from werkzeug.wrappers import Request, Response
from flask import Flask,render_template, redirect , url_for,request
from werkzeug.serving import run_simple


# Flask application object; templates are looked up in 'templates' and
# static files in 'static'.
app = Flask(__name__,template_folder='templates',static_folder='static')
# Debug flag is switched on for this app.
app.debug=True


@app.route("/",methods=['GET', 'POST'])
def main():
    """Landing page and developer login.

    GET renders 'index7.html' with the three top-10 tables (D2, D4, D6).
    POST reads the 'username' and 'password' form fields: a matching pair
    renders that developer's dashboard ('index1.html'); anything else
    re-renders the landing page with an error message.
    """
    # Username -> (password, dashboard template arguments).
    # NOTE(review): the pairing of usernames and passwords is inferred from
    # their order in the former flat list — confirm. Credentials are
    # hard-coded for the demo only and must not ship like this.
    accounts = {
        'dev11@gmail': ('1111', {'dic': d1, 'dev1': dev1}),
        'dev12@gmail': ('2222', {'dic': d2, 'dev2': dev2}),
        'dev13@gmail': ('3333', {'dic': d3, 'dev3': dev3}),
    }

    error = None
    if request.method == 'POST':
        username = request.form.get('username', '')
        password = request.form.get('password', '')
        account = accounts.get(username)

        # A login succeeds only when the password belongs to that username.
        # Previously both fields were checked against one mixed list, so any
        # combination of listed values (even a username used as password)
        # was accepted.
        if account is not None and password == account[0]:
            return render_template('index1.html', **account[1])

        error = 'Invalid Credentials. Please try again.'

    return render_template('index7.html',table=D2,table1=D4,table2=D6,error=error)
    
# @app.route("/home")
# def home():
#     if 'username' == 'dev11@gmail':
#         return render_template('index1.html',dic=dic11)
       
#     elif 'username'== 'dev12@gmail':
#         return render_template('index1.html',dic=dic21)
       
#     elif 'username'== 'dev13@gmail':
#         return render_template('index1.html',dic=dic3)
#     else:
#         print('Invalid') 


# When executed as the main module, serve the Flask app with werkzeug's
# run_simple on localhost, port 9000.
if __name__ == '__main__':
    run_simple('localhost', 9000, app)