# Calm (US) App Store

## Import the Libraries

In [None]:
!pip install langdetect
!pip install vaderSentiment

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
import math
import seaborn as sns
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from langdetect import detect_langs

In [None]:
# Load the Calm (US) App Store reviews CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="../input/app-store-meditation-app-reviews/appstore_calm_us.csv"
)

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

# Understanding the Data

In [None]:
# Print the DataFrame.info() summary for the raw table.
df.info()

In [None]:
# Show DataFrame.describe() statistics for the raw table.
df.describe()

In [None]:
# Remove the "Unnamed: 0" column in place.
df.drop(columns="Unnamed: 0", inplace=True)

In [None]:
# Split the "date" column into year / month / day columns, then drop it.
# The column is parsed once and reused instead of being re-parsed for each part.
parsed_dates = pd.DatetimeIndex(df["date"])
df["year"] = parsed_dates.year
df["month"] = parsed_dates.month
df["day"] = parsed_dates.day
df = df.drop(columns=["date"])
df.head()

In [None]:
from nltk import word_tokenize

# Tokenize every review, then record how many tokens each review produced.
word_token = [word_tokenize(review) for review in df.review]
len_tokens = [len(tokens) for tokens in word_token]

# Store the per-review token count alongside the review.
df["n_tokens"] = len_tokens
df.head()

In [None]:
# Histogram of tokens per review, restricted to the 0-300 range.
plt.hist(len_tokens, range=(0, 300), facecolor="pink");

In [None]:
# Print the DataFrame.info() summary again, now including the derived columns.
df.info()

In [None]:
# Check if all reviews are in English.
# langdetect is non-deterministic unless seeded; fix the seed so re-running
# this cell gives the same answer.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# detect_langs returns candidate languages ordered by probability.
lang = [detect_langs(review) for review in df.review]
# Read the top candidate's code from its `.lang` attribute rather than slicing
# the printed form of the list; an empty candidate list maps to "".
language = [candidates[0].lang if candidates else "" for candidates in lang]
# Every detected language code that is not English.
a = [lan for lan in language if lan != "en"]

In [None]:
a  # an empty list means every review was detected as English

In [None]:
# Mean of every numeric column per star rating.
# numeric_only=True is needed on pandas >= 2.0, where text columns such as
# "review" are no longer dropped silently and make mean() raise.
df.groupby("rating").mean(numeric_only=True)

In [None]:
# Earliest year present in the data.
df["year"].min()

In [None]:
# Latest year present in the data.
df["year"].max()

In [None]:
# Pairwise correlation between the numeric (and boolean) columns.
# Text columns are excluded explicitly: pandas >= 2.0 no longer drops them
# inside corr() and raises instead. select_dtypes also works on older pandas.
correlation = df.select_dtypes(include=["number", "bool"]).corr()
correlation

In [None]:
# Draw the correlation matrix as a heatmap.
sns.heatmap(data=correlation)
plt.show()

# Data Visualization

In [None]:
# Number of rows per year.
sns.catplot(data=df, x="year", kind="count")
plt.show()

In [None]:
# Number of rows per month.
sns.catplot(data=df, x="month", kind="count")
plt.show()

In [None]:
# Number of rows per star rating.
sns.catplot(data=df, x="rating", kind="count")
plt.show()

In [None]:
# Tabulate how many rows carry each rating value.
df["rating"].value_counts()

# Analysing Reviews

In [None]:
# Normalise the review text: strip everything that is neither a word character
# nor whitespace, then lowercase.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave punctuation in place.
# The raw string keeps \w and \s from being read as (invalid) string escapes.
df["review"] = df["review"].str.replace(r"[^\w\s]", "", regex=True).str.lower()
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

# Word Cloud

In [None]:
# One sub-frame per star rating, 1 through 5.
df_1, df_2, df_3, df_4, df_5 = (df[df["rating"] == stars] for stars in range(1, 6))

In [None]:
# Concatenate each rating's reviews into a single space-separated string.
text_1, text_2, text_3, text_4, text_5 = (
    " ".join(frame["review"]) for frame in (df_1, df_2, df_3, df_4, df_5)
)

In [None]:
# wordcloud's built-in stop words plus terms specific to this dataset.
stopwords = set(STOPWORDS) | {
    "meditation",
    "app",
    "meditate",
    "im",
    "meditations",
    "thing",
    "calm",
    "dont",
    "really",
}

# One white-background word cloud per rating's concatenated text.
cloud_1, cloud_2, cloud_3, cloud_4, cloud_5 = (
    WordCloud(stopwords=stopwords, background_color="white").generate(rating_text)
    for rating_text in (text_1, text_2, text_3, text_4, text_5)
)

In [None]:
# Show the five word clouds side by side; panel k holds the cloud for rating k.
width = 5
height = 5
rows = 1
cols = 5
axes = []
fig = plt.figure(figsize=(40, 40))

# Build the list once instead of on every loop iteration.
cloud = [cloud_1, cloud_2, cloud_3, cloud_4, cloud_5]

# The index is named `idx`, not `a`, so the loop does not overwrite the list
# of non-English detections that is also stored in `a`.
for idx in range(rows * cols):
    ax = fig.add_subplot(rows, cols, idx + 1)
    axes.append(ax)
    subplot_title = "Word Cloud" + str(idx + 1)
    ax.set_title(subplot_title)
    # Draw on the subplot explicitly rather than on pyplot's implicit current axes.
    ax.imshow(cloud[idx])
fig.tight_layout()
plt.show()

## Implementing TF-IDF

In [None]:
# Tabulate how many rows carry each rating value.
df["rating"].value_counts()

In [None]:
# TF-IDF weights of the 10 most frequent terms in the 1-star reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a set.
vect = TfidfVectorizer(max_features=10, stop_words=list(stopwords))
tfIdf = vect.fit(df_1.review)
X = vect.transform(df_1.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.head()

In [None]:
# TF-IDF weights of the 10 most frequent terms in the 2-star reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a set.
vect = TfidfVectorizer(max_features=10, stop_words=list(stopwords))
tfIdf = vect.fit(df_2.review)
X = vect.transform(df_2.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.head()

In [None]:
# TF-IDF weights of the 10 most frequent terms in the 3-star reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a set.
vect = TfidfVectorizer(max_features=10, stop_words=list(stopwords))
tfIdf = vect.fit(df_3.review)
X = vect.transform(df_3.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.head()

In [None]:
# TF-IDF weights of the 10 most frequent terms in the 4-star reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a set.
vect = TfidfVectorizer(max_features=10, stop_words=list(stopwords))
tfIdf = vect.fit(df_4.review)
X = vect.transform(df_4.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.head()

In [None]:
# TF-IDF weights of the 10 most frequent terms in the 5-star reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a set.
vect = TfidfVectorizer(max_features=10, stop_words=list(stopwords))
tfIdf = vect.fit(df_5.review)
X = vect.transform(df_5.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.head()

## Bag of Words

In [None]:
# scikit-learn's English stop words extended with terms specific to this dataset.
my_stopwords = ENGLISH_STOP_WORDS | {
    "meditation",
    "app",
    "meditate",
    "im",
    "meditations",
    "thing",
    "calm",
}

## Rating 1

In [None]:
# Bag of words for the 1-star reviews: the 100 most frequent 1- to 3-grams,
# ignoring terms that occur in more than 500 reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a frozenset.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=100,
    max_df=500,
    stop_words=list(my_stopwords),
)
vectorizer.fit(df_1.review)
X = vectorizer.transform(df_1.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.columns

In [None]:
# Lift the column-width limit so long text cells are printed in full.
pd.set_option("display.max_colwidth", None)

In [None]:
# Print the 1-star reviews that mention "ads".
# Word boundaries restrict the match to the whole word; a bare substring
# search would also hit words such as "loads" or "downloads".
print(df_1.loc[df_1["review"].str.contains(r"\bads\b", regex=True), "review"])

## Rating 2

In [None]:
# Bag of words for the 2-star reviews: the 100 most frequent 1- to 3-grams,
# ignoring terms that occur in more than 500 reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a frozenset.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=100,
    max_df=500,
    stop_words=list(my_stopwords),
)
vectorizer.fit(df_2.review)
X = vectorizer.transform(df_2.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.columns

In [None]:
# Print the 2-star reviews whose text contains "premium".
print(df_2.loc[df_2["review"].str.contains("premium"), "review"])

## Rating 3

In [None]:
# Bag of words for the 3-star reviews: the 100 most frequent 1- to 3-grams,
# ignoring terms that occur in more than 500 reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a frozenset.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=100,
    max_df=500,
    stop_words=list(my_stopwords),
)
vectorizer.fit(df_3.review)
X = vectorizer.transform(df_3.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.columns

In [None]:
# Print the 3-star reviews whose text contains "different".
print(df_3.loc[df_3["review"].str.contains("different"), "review"])

## Rating 4

In [None]:
# Bag of words for the 4-star reviews: the 100 most frequent 1- to 3-grams,
# ignoring terms that occur in more than 500 reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a frozenset.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=100,
    max_df=500,
    stop_words=list(my_stopwords),
)
vectorizer.fit(df_4.review)
X = vectorizer.transform(df_4.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.columns

## Rating 5

In [None]:
# Bag of words for the 5-star reviews: the 100 most frequent 1- to 3-grams,
# ignoring terms that occur in more than 500 reviews.
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# only 'english', a list, or None, and rejects a frozenset.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    max_features=100,
    max_df=500,
    stop_words=list(my_stopwords),
)
vectorizer.fit(df_5.review)
X = vectorizer.transform(df_5.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
X_df.columns