In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw app listing from the Kaggle input directory
import pandas as pd
apps_with_duplicates = pd.read_csv('/kaggle/input/play-store-dataset/apps.csv')

# Work on a de-duplicated frame; the raw frame stays available under its own name
apps = apps_with_duplicates.drop_duplicates(keep='first')

In [None]:
# DataFrame.count() returns one non-null count per column; the message needs the
# number of rows, so use len() on the frame instead.
print('Total number of apps in the dataset=', len(apps))

In [None]:
# Show n randomly chosen rows (no seed is set, so the rows differ on every run)
n=5
apps.sample(n)

In [None]:
# Print column dtypes, non-null counts and memory usage of the de-duplicated frame
apps.info()

In [None]:
# Column labels of the de-duplicated frame
apps.columns

In [None]:
# Discard every row that has at least one missing value, then print the
# per-column count of remaining missing values.
apps = apps.dropna(axis=0, how='any')
print(apps.isna().sum())

In [None]:
# Column labels after the row-wise dropna (dropping rows leaves the column set unchanged)
apps.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index — confirm against the CSV).
# Reassigning instead of inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: the original raised KeyError once the column was already gone.
apps = apps.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Literal characters that the cleaning loop strips from the columns in cols_to_clean
chars_to_remove=[',','$','+','M','k']
chars_to_remove

In [None]:
# Columns that the cleaning loop strips of chars_to_remove and converts to numeric
cols_to_clean=['Installs','Size','Price']
cols_to_clean

In [None]:
# Strip formatting characters from each column in cols_to_clean and convert it to numeric.
for col in cols_to_clean:
    # astype(str) lets the loop run even if read_csv already parsed the column as
    # numeric (the .str accessor raises on non-string data).
    cleaned = apps[col].astype(str)
    for char in chars_to_remove:
        # regex=False: '$' and '+' are regex metacharacters. On pandas < 2.0, where
        # regex defaulted to True, '$' matched end-of-string (removing nothing) and
        # '+' raised "nothing to repeat". These must be removed as literal characters.
        cleaned = cleaned.str.replace(char, '', regex=False)
    # NOTE(review): stripping both 'M' and 'k' leaves Size values in mixed units
    # if the raw data uses both suffixes — confirm and rescale if so.
    apps[col] = pd.to_numeric(cleaned)

In [None]:
!pip install plotly

In [None]:
# Distinct values present in the Category column
apps['Category'].unique()

In [None]:
# Number of rows per category (value_counts sorts by count, descending, by default)
apps['Category'].value_counts()

In [None]:
import plotly
import plotly.graph_objs as go

# Enable inline plotly rendering; connected=True loads plotly.js from the CDN
# instead of embedding it in the notebook.
plotly.offline.init_notebook_mode(connected=True)


In [None]:
# Count the distinct values in the Category column
distinct_categories = apps['Category'].unique()
num_categories = len(distinct_categories)
print('Number of categories=', num_categories)


In [None]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objs as go

num_categories = len(apps['Category'].unique())
print('Number of categories=', num_categories)

# Apps per category, largest first
num_apps_in_category = apps['Category'].value_counts().sort_values(ascending=False)

data = [go.Bar(
    x=num_apps_in_category.index,   # category name
    y=num_apps_in_category.values,  # number of apps in that category
)]

# Title and axis labels so the figure can be read without the surrounding code
layout = go.Layout(
    title='Number of apps per category',
    xaxis=dict(title='Category'),
    yaxis=dict(title='Number of apps'),
)

plotly.offline.iplot({'data': data, 'layout': layout})

In [None]:
# Distribution of app ratings with a vertical marker at the mean
avg_app_rating = apps['Rating'].mean()
print('Average app rating=', avg_app_rating)

data = [go.Histogram(x=apps['Rating'])]

layout = {
    'title': 'Distribution of app ratings',
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Number of apps'},
    'shapes': [{
        'type': 'line',
        'x0': avg_app_rating,
        'x1': avg_app_rating,
        # yref='paper' expresses y0/y1 as a fraction of the plot height, so the line
        # always spans the whole plot instead of stopping at a hard-coded count of 1000.
        'yref': 'paper',
        'y0': 0,
        'y1': 1,
        'line': {'dash': 'dashdot'},
    }],
}

plotly.offline.iplot({'data': data, 'layout': layout})

# SIZE AND PRICE OF AN APP

In [None]:
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
import warnings
warnings.filterwarnings("ignore")

#subset for
large_categories=apps.groupby('Category').filter(lambda x: len(x) >= 250).reset_index()


plt1=sns.jointplot(x=large_categories['Size'], y=large_categories['Rating'], kind='hex')

paid_apps=apps[apps['Type']=='Paid']

plt2=sns.jointplot(x=paid_apps['Price'], y=paid_apps['Rating'])

In [None]:
import matplotlib.pyplot as plt

# One 15x8 inch canvas for the strip plot
fig, ax = plt.subplots(figsize=(15, 8))

# Restrict the plot to a hand-picked set of categories
popular_categories = ['GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL', 'TOOLS', 'FINANCE', 'LIFESTYLE', 'BUSINESS']
popular_app_cats = apps[apps['Category'].isin(popular_categories)]

ax = sns.stripplot(
    x=popular_app_cats['Price'],
    y=popular_app_cats['Category'],
    jitter=True,
    linewidth=1,
    ax=ax,
)
ax.set_title('App pricing trend across categories')

In [None]:
# Split the install counts by the value of the Type column
is_paid = apps['Type'] == 'Paid'
is_free = apps['Type'] == 'Free'

trace0 = go.Box(y=apps.loc[is_paid, 'Installs'], name='Paid')
trace1 = go.Box(y=apps.loc[is_free, 'Installs'], name='Free')

# Logarithmic y axis with automatic range
layout = go.Layout(
    title="Number of downloads of paid apps vs. free apps",
    yaxis={'type': 'log', 'autorange': True},
)

data = [trace0, trace1]
plotly.offline.iplot(dict(data=data, layout=layout))

In [None]:
reviews_df = pd.read_csv('/kaggle/input/play-store-dataset/user_reviews.csv')

# Inner join on the app name: only apps present in both frames are kept
merged_df = apps.merge(reviews_df, how="inner", on='App')

# Discard rows that lack either the sentiment label or the review text
required_review_cols = ['Sentiment', 'Translated_Review']
merged_df = merged_df.dropna(subset=required_review_cols)

sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(11, 8))

# Sentiment polarity grouped by the Type column
ax = sns.boxplot(x=merged_df['Type'], y=merged_df['Sentiment_Polarity'])
ax.set_title('Sentiment Polarity Distribution')

In [None]:
# Preview the first rows of the merged app/review frame
merged_df.head()

In [None]:
# Columns used for the sentiment model. The explicit .copy() makes selected_df an
# independent frame: later cells assign into it, and assigning into a bare column
# slice of merged_df triggers SettingWithCopyWarning and may not stick.
selected_df = merged_df[['App','Category','Rating','Reviews','Installs','Type','Translated_Review','Sentiment']].copy()
selected_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column by integer codes. The encoder is refit for every
# column and is not kept afterwards.
encoder = LabelEncoder()
for column in ['App', 'Category', 'Type', 'Sentiment']:
    selected_df[column] = encoder.fit_transform(selected_df[column])

In [None]:
# Preview the frame after label encoding
selected_df.head()

In [None]:
import re
import string

import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before it is read below
nltk.download('stopwords')

# English Snowball stemmer and English stop-word set used by the text-cleaning function
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

In [None]:
def clean(text):
    """Normalise one review string for vectorisation.

    The input is converted to ``str`` and lower-cased. Bracketed spans, URLs,
    angle-bracket tags, punctuation, newlines and any word containing a digit
    are then removed. Finally the text is split on single spaces, stop words
    (module-level ``stopword``) are dropped and each remaining token is stemmed
    with the module-level ``stemmer``.

    Parameters
    ----------
    text : Any
        Value to clean; non-strings are passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escapes in a normal string
    # literal and emit a SyntaxWarning on Python 3.12+. The patterns are unchanged.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # One pass for stop-word removal and stemming; gives the same tokens as the
    # original join-then-split-again sequence.
    tokens = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(tokens)
# Run every review through clean() and show the first rows of the result
cleaned_reviews = selected_df["Translated_Review"].apply(clean)
selected_df["Translated_Review"] = cleaned_reviews
print(selected_df.head())

In [None]:
# Display the Translated_Review column
selected_df['Translated_Review']

In [None]:
# The feature matrix X is only created by the TF-IDF cell further down, so asking for
# X.shape here raised NameError on a fresh kernel (Restart & Run All). Show the shape
# of the text column that will be vectorised instead.
selected_df['Translated_Review'].shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF with English stop words removed and smoothed IDF weighting
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [None]:
# Learn the vocabulary and build the TF-IDF matrix in one pass. fit_transform is
# equivalent to fit() followed by transform() on the same data, without tokenising twice.
X = tfidf_vect.fit_transform(selected_df["Translated_Review"].values)
y = selected_df['Sentiment'].values

# (n_samples, n_features) of the feature matrix, shown where X actually exists
X.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forests are stochastic (bootstrap samples, random feature subsets). Without a
# seed, the metrics below change on every run; reuse the seed of the train/test split.
classifier = RandomForestClassifier(random_state=42)

classifier.fit(X_train, y_train)

In [None]:
# Predict class codes for the held-out test split
y_pred = classifier.predict(X_test)

In [None]:
# Per-class precision, recall, F1 score and support on the test split
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes
cnf_matrix = confusion_matrix(y_test,y_pred)
cnf_matrix

In [None]:
# Wrap the confusion-matrix array in a labelled DataFrame so the heatmap axes are readable.
# The label order assumes class codes 0/1/2 stand for Negative/Neutral/Positive — TODO confirm
# against the encoder's classes.
sentiment_labels = ['Negative', 'Neutral', 'Positive']
cm_df = pd.DataFrame(cnf_matrix, index=sentiment_labels, columns=sentiment_labels)

# Plot the confusion matrix
plt.figure(figsize=(5, 4))
# fmt='d' prints the integer counts in full; the default '.2g' abbreviates larger
# counts to two significant digits (e.g. 1.2e+03).
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')  # was misspelled 'Actal Values'
plt.xlabel('Predicted Values')
plt.show()

In [None]:
!pip install gradio

In [None]:
import gradio as gr

def tweetdata(user):
    """Predict the sentiment label for one piece of free text.

    The text goes through the same ``clean`` step that was applied to the training
    reviews before it is vectorised. The original skipped this, so the model saw
    differently tokenised input at prediction time than at training time.

    Parameters
    ----------
    user : str
        Raw text entered by the user.

    Returns
    -------
    str
        'Negative', 'Neutral' or 'Positive'.

    Raises
    ------
    ValueError
        If the classifier returns a class code other than 0, 1 or 2. The original
        failed with UnboundLocalError in that case.
    """
    # Same code-to-label mapping as the original if/elif chain
    labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

    # Keep the sparse TF-IDF row as is: the forest was fitted on sparse input, so
    # densifying a vocabulary-sized vector with .toarray() is unnecessary.
    features = tfidf_vect.transform([clean(user)])
    code = int(classifier.predict(features)[0])

    if code not in labels:
        raise ValueError(f'Unexpected class code from classifier: {code}')
    return labels[code]

# Minimal text-in / text-out Gradio interface around tweetdata
demo = gr.Interface(
    fn=tweetdata,
    inputs="text",
    outputs="text",
)

# Launch only when this code runs as the main module
if __name__ == "__main__":
    demo.launch()