# Bu çalışma, ürünlerin genel olarak olumlu/olumsuz şeklinde tahminlenmesinden önce veri seti üzerinde metinsel veri analizinin gerçekleştirilmesini kapsamaktadır.

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Veri Ön İşleme

In [None]:
#import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import math
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from textwrap import wrap
from textblob import TextBlob


In [None]:
pip install textstat

In [None]:
import textstat

In [None]:
# Read the reviews CSV into a DataFrame.

text = pd.read_csv('/kaggle/input/turkishreviews/hb.csv')
text.shape
# Add (or overwrite) a 'Rating' column filled with -1 on every row.
text = text.assign(Rating=-1)

In [None]:
# Preview the first rows of the loaded DataFrame.
text.head()

In [None]:
# List the column names of the loaded data before choosing which ones to keep.

text.columns

In [None]:
# Keep the four columns used in the analysis: product URL, review text and
# the two rating columns.
# .copy() gives textdata its own data: without it textdata is a slice of
# `text`, and later in-place operations or column assignments on it raise
# pandas' SettingWithCopyWarning and may not behave as intended.
textdata = text[['URL', 'Review', 'Rating', 'Rating (Star)']].copy()
textdata.head()

In [None]:
# Drop every row that has a missing value in any column.
# The result is reassigned instead of using inplace=True: an in-place drop on
# a DataFrame that was selected out of another one triggers pandas'
# SettingWithCopyWarning.
textdata = textdata.dropna()
# Sanity check: number of remaining nulls per column.
textdata.isna().sum()

In [None]:
# Keep an independent copy of textdata as it stands at this point.
textcopy=textdata.copy()


In [None]:
# Keep only the products (URLs) that have more than 300 reviews.

reviews_per_url = textdata['URL'].value_counts()
popular_urls = reviews_per_url[reviews_per_url > 300].index
textdata = textdata[textdata['URL'].isin(popular_urls)].reset_index(drop=True)
print('Number of products matching the criteria is ', textdata['URL'].nunique())

In [None]:
# Overwrite 'Rating' with the 'Rating (Star)' values cast to int.
textdata['Rating']=textdata['Rating (Star)'].astype(int)

In [None]:


textdata['URL'].unique()
# Keep only the part of each URL that precedes the first ',,,' separator.
textdata['URL'] = textdata['URL'].map(lambda url: url.partition(',,,')[0])

In [None]:
# Explore the review text: print the ten reviews at positions 20-29.
# The loop variable is deliberately not called `text`, so the DataFrame loaded
# under that name is not overwritten, and the review string itself is printed
# rather than an (index, review) tuple.

for review in textdata['Review'].iloc[20:30]:
    print('Review:\n', review)

In [None]:
# Lowercase every review so that capitalised and lowercase spellings of the
# same word are treated as one token.
# str.lower() is not Turkish-aware: 'I' becomes 'i' (Turkish expects dotless
# 'ı') and 'İ' becomes 'i' followed by a combining dot (U+0307), which splits
# otherwise identical words into different tokens. Map the two Turkish
# capitals explicitly first, then lowercase the rest.

_TURKISH_CAPITAL_I = str.maketrans('Iİ', 'ıi')
textdata['Review'] = textdata['Review'].apply(lambda x: x.translate(_TURKISH_CAPITAL_I).lower())

In [None]:
# Remove every token that contains a digit (the whole word, not just the digit).
# The pattern is a raw string: in a normal string literal '\w' and '\d' are
# invalid escape sequences, which newer Python versions warn about.

textdata['Review'] = textdata['Review'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

In [None]:
# Delete every character listed in string.punctuation from each review.

_punctuation_table = str.maketrans('', '', string.punctuation)
textdata['Review'] = textdata['Review'].map(lambda review: review.translate(_punctuation_table))

In [None]:
# Count the reviews in each rating group and draw the counts as a bar chart.
# Only the numeric 'Rating (Star)' column is aggregated: taking 'mean' over the
# whole frame would also be applied to the text columns ('URL', 'Review'),
# which recent pandas versions reject with a TypeError. The double brackets
# keep the ('Rating (Star)', 'mean'/'count') two-level column layout.
product_review_count = textdata.groupby(['Rating'])[['Rating (Star)']].agg(['mean', 'count'])
product_review_count.head()
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# x position: mean star value of the group; bar height: number of reviews.
ax.bar(product_review_count["Rating (Star)"]["mean"], product_review_count["Rating (Star)"]["count"])
plt.show()

**Yukarıda gördüğümüz grafikte yüksek oy verenlerin daha fazla yorum yaptığı gözlemlenmiştir. Hipotez olarak; olumlu oy verenlerin sistem içerisinde fazla yorum yapmaya dahil olduğunu söyleyebiliriz.**

**SpaCy, yorumlarda en sık geçen kelimeleri tespit etmek için kullanılmaktadır.**

In [None]:
# Shell commands: download the spaCy model xx_ent_wiki_sm and link it under the name xx_model.
# NOTE(review): the `spacy link` command looks specific to spaCy v2 — confirm against the installed version.
# NOTE(review): the visible cells build a blank Turkish() pipeline and never load xx_model — confirm this download is still needed.
!python -m spacy download xx_ent_wiki_sm
!python -m spacy link xx_ent_wiki_sm xx_model

In [None]:
# Build a blank Turkish spaCy pipeline and extend its stop-word set.
from spacy.lang.tr import Turkish

nlp = Turkish()
# Raise the maximum text length the pipeline accepts.
nlp.max_length = 199386400

# Additional stop words are read from a CSV that has a 'Words' column.
stop_word_list = pd.read_csv('/kaggle/input/stopwords/stopwords.csv')
print(stop_word_list)

# STOP_WORDS is spaCy's module-level Turkish stop-word set; it is extended in place.
spacy_stopwords = spacy.lang.tr.stop_words.STOP_WORDS
spacy_stopwords.update(stop_word_list['Words'])

print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

# Quick check on one sample sentence: keep only the tokens that are not stop words.
text = "Pekala, bu nedenle süper bir ürün."
doc = nlp(text)
tokens = [tok.text for tok in doc if not tok.is_stop]

print('Original Article: %s' % (text))
print()
print(tokens)

In [None]:

# Remove stop words and lemmatize each review, then join all reviews that share
# a star rating into one long document per rating.
# `token.lemma_ or token.text`: a pipeline without a lemmatizer component can
# return an empty string for lemma_ (NOTE(review): this is the spaCy v3
# behaviour for blank pipelines — confirm against the installed version).
# Without the fallback every review would collapse to whitespace.
textdata['Review'] = textdata['Review'].apply(
    lambda x: ' '.join(
        token.lemma_ or token.text for token in nlp(x) if not token.is_stop
    )
)

textdata = textdata[['Rating (Star)', 'Review']].groupby(by='Rating (Star)').agg(lambda x: ' '.join(x))
textdata.head()

In [None]:
# Build a document-term matrix (one row per row of textdata, one column per
# term) to find the most frequent terms in the reviews.

cv = CountVectorizer(analyzer='word')
data = cv.fit_transform(textdata['Review'])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    terms = cv.get_feature_names_out()
else:
    terms = cv.get_feature_names()
text_dtm = pd.DataFrame(data.toarray(), columns=terms)
text_dtm.index = textdata.index
text_dtm.head()

In [None]:
def wordcloud(data, title):
    """Draw a word cloud for *data* and show it under a 'Rating: <title>' heading.

    data: term-to-frequency mapping handed to WordCloud.generate_from_frequencies.
    title: label placed after 'Rating: '; converted with str().
    """
    heading = 'Rating: ' + str(title)
    cloud = WordCloud(width=600, height=530, max_words=150, colormap="Dark2")
    cloud = cloud.generate_from_frequencies(data)
    plt.figure(figsize=(10, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(heading, fontsize=13)
    plt.show()
  
# Transpose so that terms become rows and each original row label becomes a column.
text_dtm = text_dtm.transpose()

# One word cloud per column, with terms ordered from most to least frequent.
# DataFrame.items() yields (column label, column Series) pairs directly, so no
# unused enumerate index is needed.
for product, frequencies in text_dtm.items():
    wordcloud(frequencies.sort_values(ascending=False), product)
 


**Kelime bulutu, incelemelerde en sık görülen terimleri hızlı bir şekilde anlamanıza ve görselleştirmenize yardımcı olmaktadır.**

In [None]:
# Polarity in sentiment analysis describes the orientation of a text
# (positive, neutral or negative); store TextBlob's polarity score per row.
# NOTE(review): TextBlob's default sentiment analyzer appears to rely on an
# English lexicon — confirm the scores are meaningful for these Turkish reviews.

textdata['emotion'] = [TextBlob(review).sentiment.polarity for review in textdata['Review']]

In [None]:
# Mean polarity per star rating, sorted ascending, drawn as a horizontal bar chart.
product_polarity = pd.DataFrame(
    textdata.groupby('Rating (Star)')['emotion'].mean().sort_values(ascending=True)
)

plt.figure(figsize=(16, 12))
plt.xlabel('Emotion')
plt.ylabel('Products')
plt.title('Polarity of Product Reviews')
polarity_graph = plt.barh(np.arange(len(product_polarity.index)), product_polarity['emotion'], color='blue')

# Label each bar with its group name (white, near the axis) and its polarity
# value (black, just past the end of the bar).
# The vertical centre of a horizontal bar is y + height / 2. get_width() is the
# bar's length along x (the polarity itself) and must not be used for the y
# position, otherwise the labels drift away from their bars.
for bar, product, polarity in zip(polarity_graph, product_polarity.index, product_polarity['emotion']):
    y_center = bar.get_y() + bar.get_height() / 2
    plt.text(0.005, y_center, '{}'.format(product), va='center', fontsize=11, color='white')
    plt.text(bar.get_width() + 0.001, y_center, '%.3f' % polarity, va='center', fontsize=11, color='black')

plt.yticks([])
plt.show()