# Meditopia (TR - US) App Store

# Import the Libraries

In [None]:
!pip install langdetect

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
import math
import seaborn as sns
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from langdetect import detect_langs

In [None]:
# Download the Punkt tokenizer data; word_tokenize (used further down)
# presumably depends on it -- TODO confirm for the installed NLTK version.
nltk.download('punkt')

# Understanding the Data

In [None]:
# Load the Meditopia App Store reviews CSV and preview the first rows.
reviews_csv = "../input/app-store-meditation-app-reviews/appstore_meditopia.csv"
df = pd.read_csv(filepath_or_buffer=reviews_csv)
df.head()

In [None]:
# 'Unnamed: 0' is just a duplicate of the index, so discard it.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run once the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Break the review date into separate year / month / day columns,
# then discard the original 'date' column.
review_dates = pd.DatetimeIndex(df['date'])
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(review_dates, date_part)
df = df.drop(columns=["date"])
df.head()

In [None]:
# Give the developer-response column a friendlier name.
df = df.rename({'developerResponse.body': "meditopia response"}, axis="columns")
df.head()

In [None]:
from nltk import word_tokenize

# Tokenise every review, then record how many tokens each review has.
word_token = list(map(word_tokenize, df.review))
len_tokens = list(map(len, word_token))

df["n_tokens"] = len_tokens
df.head()

## Analyzing MultiLingual Data

In [None]:
df.tail()

As can be seen in row 235, there are also some non-English reviews. Let's identify them.

In [None]:
# langdetect is randomised unless a seed is set. Pin it so repeated runs give
# the same languages -- the manual corrections applied to specific rows later
# in this notebook only make sense if the detection result is stable.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# One list of candidate languages (with probabilities) per review.
lang = [detect_langs(review_text) for review_text in df.review]

In [None]:
# Keep the first candidate language reported for each review. Read the `.lang`
# attribute instead of parsing the list's string representation, and label
# reviews for which no candidate was returned explicitly as "unknown"
# (the string parsing produced the meaningless label "]" in that case).
language = [candidates[0].lang if candidates else "unknown" for candidates in lang]

In [None]:
df["lang"] = language

In [None]:
df

The language detection makes some errors, but removing all non-English reviews would drop 86 rows.

In [None]:
# Fraction of reviews whose detected language is not English.
total_reviews = len(df)
english_reviews = len(df[df["lang"] == "en"])
(total_reviews - english_reviews) / total_reviews

In [None]:
# Every review that was not classified as English.
fix = df.loc[df["lang"] != "en"]

In [None]:
fix.rating.mean()

In [None]:
# Manually relabel rows whose language was misclassified.
# NOTE(review): the row labels are hard-coded, so they are only valid while
# the language detection above yields the same results -- confirm after re-runs.
for misclassified_row in (89, 90, 131, 132, 141):
    df.loc[misclassified_row, "lang"] = "en"

In [None]:
# Per-language averages of the numeric columns. numeric_only=True is required
# because the frame also holds text columns (e.g. 'review'), which cannot be
# averaged and make recent pandas versions raise.
df.groupby(by="lang").mean(numeric_only=True)

### Correlation

In [None]:
# See if there is a correlation between the numeric columns.
# The text columns are excluded explicitly: recent pandas versions no longer
# drop them silently in DataFrame.corr() and raise instead.
correlation = df.select_dtypes(include="number").corr()
correlation

In [None]:
df["year"].min()

In [None]:
df["year"].max()

In [None]:
# visualize (there is no correlation)
sns.heatmap(correlation)
plt.show()

## Data Visualization

In [None]:
df.info()

In [None]:
sns.catplot(x="year",data=df,kind="count")
plt.show()

In [None]:
sns.countplot(x="year", data=df, hue="rating");

In [None]:
df.columns

In [None]:
sns.catplot(x="rating",data=df,kind="count")
plt.show()

In [None]:
sns.catplot(x="month",data=df,kind="count")
plt.show()

# Analysing Reviews

In [None]:
# Strip punctuation (anything that is neither a word character nor whitespace)
# and lower-case the review text.
# regex=True is required: with the literal-match default of recent pandas the
# pattern would never match and the punctuation would be left in place.
# The raw string avoids invalid-escape warnings for \w and \s.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)
df['review'] = df['review'].str.lower()
df.head()

In [None]:
df.isna().sum()

Meditopia hasn't responded to some of the reviews. Check if there is a relation between the comments Meditopia answered and the ratings.

## Word Cloud

I will only check English reviews.

In [None]:
# One frame per star rating (1-5), restricted to reviews detected as English.
is_english = df["lang"] == "en"
df_1, df_2, df_3, df_4, df_5 = (
    df[(df["rating"] == stars) & is_english] for stars in range(1, 6)
)

In [None]:
# Concatenate all reviews of each rating into one space-separated string.
text_1, text_2, text_3, text_4, text_5 = (
    " ".join(rating_frame.review)
    for rating_frame in (df_1, df_2, df_3, df_4, df_5)
)

In [None]:
# wordcloud's default stop words plus a few extra words to ignore
# (presumably too common in these reviews to be informative).
stopwords = set(STOPWORDS) | {"meditation", "app", "meditate", "im", "meditations", "thing"}

# One word cloud per rating, built from the joined review texts.
cloud_1, cloud_2, cloud_3, cloud_4, cloud_5 = (
    WordCloud(stopwords=stopwords, background_color="white").generate(rating_text)
    for rating_text in (text_1, text_2, text_3, text_4, text_5)
)

In [None]:
# Show the five word clouds side by side; subplot k holds the cloud built
# from the k-star reviews.
rows = 1
cols = 5
clouds = [cloud_1, cloud_2, cloud_3, cloud_4, cloud_5]  # built once, not on every iteration
axes = []
fig = plt.figure(figsize=(40, 40))

for position, cloud in enumerate(clouds, start=1):
    ax = fig.add_subplot(rows, cols, position)
    ax.set_title("Word Cloud" + str(position))
    # Draw on the subplot explicitly rather than on pyplot's implicit current axes.
    ax.imshow(cloud)
    axes.append(ax)
fig.tight_layout()
plt.show()

## Implementing TF-IDF

In [None]:
# TF-IDF weights for the 1-star English reviews, limited to 10 features.
# stop_words is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces the removed get_feature_names().
vect = TfidfVectorizer(max_features=10, stop_words=sorted(stopwords))
tfIdf = vect.fit(df_1.review)
X = vect.transform(df_1.review)
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df.head()

In [None]:
len(df_1.index)

In [None]:
# TF-IDF weights for the 2-star English reviews, limited to 15 features.
# "really" is added to the shared stop-word set from here on.
stopwords.update(["really"])
# stop_words is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces the removed get_feature_names().
vect = TfidfVectorizer(max_features=15, stop_words=sorted(stopwords))
tfIdf = vect.fit(df_2.review)
X = vect.transform(df_2.review)
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df.head()

In [None]:
len(df_2.index)

In [None]:
# Show full cell contents without truncation. None is the supported way to
# disable the limit; the old sentinel -1 is deprecated/rejected by pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
print(df_2[df_2['review'].str.contains("press")]["review"])

In [None]:
len(df_3.index)

In [None]:
# TF-IDF weights for the 3-star English reviews, limited to 15 features.
# stop_words is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces the removed get_feature_names().
vect = TfidfVectorizer(max_features=15, stop_words=sorted(stopwords))
tfIdf = vect.fit(df_3.review)
X = vect.transform(df_3.review)
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df.head()

In [None]:
len(df_4.index)

In [None]:
# TF-IDF weights for the 4-star English reviews, limited to 15 features.
# stop_words is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces the removed get_feature_names().
vect = TfidfVectorizer(max_features=15, stop_words=sorted(stopwords))
tfIdf = vect.fit(df_4.review)
X = vect.transform(df_4.review)
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df.head()

In [None]:
len(df_5.index)

In [None]:
# TF-IDF weights for the 5-star English reviews, limited to 15 features.
# stop_words is passed as a list because current scikit-learn rejects a set,
# and get_feature_names_out() replaces the removed get_feature_names().
vect = TfidfVectorizer(max_features=15, stop_words=sorted(stopwords))
tfIdf = vect.fit(df_5.review)
X = vect.transform(df_5.review)
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df.head()

# Sentiment Analysis

In [None]:
from textblob import TextBlob 

In [None]:
# TextBlob sentiment result for every review, in row order.
sentiment = [TextBlob(review_text).sentiment for review_text in df.review]


In [None]:
sentiment[2]

In [None]:
# Polarity of each review scaled by 100. Read the `polarity` field directly
# instead of parsing the string representation of the sentiment tuple, which
# would break as soon as that representation changes.
sent = [review_sentiment.polarity * 100 for review_sentiment in sentiment]

In [None]:
df["sentiment"] = sent

In [None]:
df.head(2)

In [None]:
sns.relplot(x="year", y="sentiment", kind="line", data=df, ci=None);

In [None]:
sns.relplot(x="rating", y="sentiment", kind="scatter", hue="year", data=df);

## Bag of Words

In [None]:
# scikit-learn's English stop words plus extra words to ignore.
# Converted to a list because current scikit-learn only accepts 'english',
# a list or None for `stop_words` and rejects a frozenset.
my_stopwords = list(ENGLISH_STOP_WORDS.union(["meditation", "app", "meditate", "im", "meditations" ,"thing"]))

### Rating 1

In [None]:
len(df_1["review"])

In [None]:
# Bag of words (uni- to tri-grams, at most 100 features) for the 1-star reviews.
# list(...) makes the stop words acceptable to current scikit-learn whether
# my_stopwords is a frozenset or already a list.
vectorizer = CountVectorizer(ngram_range=(1,3), max_features = 100, max_df=500, stop_words=list(my_stopwords))
vectorizer.fit(df_1.review)
X = vectorizer.transform(df_1.review)

In [None]:
"""
Sparse matrix is a matrix which 
contains very few non-zero elements. When a sparse matrix is represented with a 2-dimensional array, 
we waste a lot of space to represent that matrix.
"""
X

In [None]:
# Dense view of the count matrix with one column per n-gram.
# get_feature_names_out() replaces get_feature_names(), which has been removed
# from scikit-learn.
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
X_df.head()

In [None]:
X_df.columns

In [None]:
print(df_1[df_1['review'].str.contains("miss")]["review"])

### Rating 2

In [None]:
len(df_2.index)

In [None]:
# Bag of words (uni- and bi-grams, at most 100 features) for the 2-star reviews.
# stop_words is passed as a list (required by current scikit-learn) and
# get_feature_names_out() replaces the removed get_feature_names().
vectorizer = CountVectorizer(ngram_range=(1,2), max_features = 100, max_df=500, stop_words=list(my_stopwords))
vectorizer.fit(df_2.review)
X = vectorizer.transform(df_2.review)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()

In [None]:
X_df.columns

In [None]:
print(df_2["review"])

### Rating 3

In [None]:
# Bag of words (uni- and bi-grams, at most 100 features) for the 3-star reviews.
# stop_words is passed as a list (required by current scikit-learn) and
# get_feature_names_out() replaces the removed get_feature_names().
vectorizer = CountVectorizer(ngram_range=(1,2), max_features = 100, max_df=500, stop_words=list(my_stopwords))
vectorizer.fit(df_3.review)
X = vectorizer.transform(df_3.review)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()

In [None]:
X_df.columns

In [None]:
print(df_3["review"])

### Rating 4

In [None]:
# Bag of words (single words only, at most 100 features) for the 4-star reviews.
# stop_words is passed as a list (required by current scikit-learn) and
# get_feature_names_out() replaces the removed get_feature_names().
vectorizer = CountVectorizer(max_features = 100, max_df=500, stop_words=list(my_stopwords))
vectorizer.fit(df_4.review)
X = vectorizer.transform(df_4.review)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()

In [None]:
X_df.columns

In [None]:
df_4

### Rating 5

In [None]:
# Bag of words (uni- and bi-grams, at most 100 features) for the 5-star reviews.
# stop_words is passed as a list (required by current scikit-learn) and
# get_feature_names_out() replaces the removed get_feature_names().
vectorizer = CountVectorizer(ngram_range=(1,2), max_features = 100, max_df=500, stop_words=list(my_stopwords))
vectorizer.fit(df_5.review)
X = vectorizer.transform(df_5.review)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()

In [None]:
X_df.columns

In [None]:
print(df_5[df_5['review'].str.contains("meditopia")]["review"])

## Stemming and Lemmatization

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Stemmer / lemmatizer instances and the tokenised form of every review.
porter, WNLemmatizer = PorterStemmer(), WordNetLemmatizer()
tokens = list(map(word_tokenize, df.review))

In [None]:
# The stemmer works on a single word. Stemming str(token_list) treated the
# whole printed list as one "word", so nothing was actually stemmed.
# Stem each token and join the results into one string per review
# (strings stay hashable, so the set comparison further down keeps working).
stem = [
    " ".join(porter.stem(token) for token in review_tokens)
    for review_tokens in tokens
]

In [None]:
nltk.download('wordnet')

In [None]:
# The lemmatizer works on a single word. Lemmatizing str(token_list) treated
# the whole printed list as one "word", so nothing was actually lemmatized.
# Lemmatize each token and join the results into one string per review
# (strings stay hashable, so the set comparison further down keeps working).
lem = [
    " ".join(WNLemmatizer.lemmatize(token) for token in review_tokens)
    for review_tokens in tokens
]

In [None]:
stem[0]

In [None]:
lem[0]

In [None]:
list(set(lem) - set(stem))

## Looking at Turkish Data

In [None]:
df[df["lang"] == "tr"]

In [None]:
df.to_csv('meditopia_final_app.csv', index=False) 

In [None]:
print(df[df['review'].str.contains("student")]["review"])