In [None]:
import re
import string
from collections import Counter
from urllib.parse import urljoin

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from openpyxl import Workbook
from openpyxl import load_workbook
from snowballstemmer import TurkishStemmer
from vnlp import SentimentAnalyzer
from vnlp import StemmerAnalyzer

In [None]:
def take_all_data(finish_page=1, timeout=30):
    """Scrape reader comments from the idefix.com literature category.

    Requests listing pages 1..finish_page, follows the link of every book box
    found on each listing page and collects the text of every review element
    on the book's page.

    Args:
        finish_page: Last listing page to visit (inclusive). Defaults to 1.
        timeout: Seconds to wait for each HTTP response. Defaults to 30.

    Returns:
        list: One ``[comment_text, 0]`` pair per review found. The second
        element is always 0.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            ``timeout``.
    """
    base_url = "https://www.idefix.com/"
    listing_url = "https://www.idefix.com/kategori/Kitap/Edebiyat/grupno=00055?ShowNotForSale=True&Page="
    dataset = []

    for page_number in range(1, finish_page + 1):
        listing_response = requests.get(listing_url + str(page_number), timeout=timeout)
        listing_html = BeautifulSoup(listing_response.content, "lxml")

        listing_box = listing_html.find("div", attrs={"class": "no-margin productListNewBox boxes books clearfix"})
        listing_row = listing_box.find("div", attrs={"class": "row"}) if listing_box is not None else None
        if listing_row is None:
            # The expected product grid is missing from this page; skip it
            # instead of failing with AttributeError on None.
            continue

        for book_html in listing_row.find_all("div", attrs={"class": "cart-product-box-view"}):
            anchor = book_html.a
            href = anchor.get("href") if anchor is not None else None
            if not href:
                # A box without a usable link cannot be followed.
                continue

            # urljoin copes with both "/path" and "path" hrefs without
            # producing a double slash after the host.
            book_response = requests.get(urljoin(base_url, href), timeout=timeout)
            book_page_html = BeautifulSoup(book_response.content, "lxml")

            # If the product-info section is absent the book page is not
            # available; go to the next book.
            product_info = book_page_html.find("div", attrs={"class": "product-info hidden-lg hidden-md hidden-sm col-xs-12"})
            if product_info is None:
                continue

            for comment_html in book_page_html.find_all("div", attrs={"class": "comment"}):
                review_body = comment_html.find(id="reviewBody")
                if review_body is None:
                    # Comment container without a review text node.
                    continue
                dataset.append([review_body.get_text(), 0])

    return dataset

In [None]:
def write_excel(file_name, data):
    """Save rows to ``<file_name>.xlsx`` in a new workbook.

    Args:
        file_name: Output path without the ``.xlsx`` extension.
        data: Iterable of indexable rows; element 0 goes to column A and
            element 1 to column B, starting at spreadsheet row 1.
    """
    workbook = Workbook()
    sheet = workbook.active
    for row_number, row in enumerate(data, start=1):
        sheet[f"A{row_number}"] = row[0]
        sheet[f"B{row_number}"] = row[1]
    workbook.save(file_name + ".xlsx")

def read_excel(file_name):
    """Load columns A and B of the "Sheet" worksheet from ``<file_name>.xlsx``.

    Args:
        file_name: Input path without the ``.xlsx`` extension.

    Returns:
        list: One ``[A-value, B-value]`` list per spreadsheet row, from row 1
        through the sheet's ``max_row``.
    """
    workbook = load_workbook(filename=(file_name + ".xlsx"))
    sheet = workbook["Sheet"]
    return [
        [sheet[f"A{row_number}"].value, sheet[f"B{row_number}"].value]
        for row_number in range(1, sheet.max_row + 1)
    ]

In [None]:
# Single sentiment model instance reused by every analyzer() call.
sentiment_analyzer = SentimentAnalyzer()
def analyzer(data):
    """Score the text of every row with the sentiment model.

    Args:
        data: Iterable of indexable rows whose element 0 is the comment text.

    Returns:
        list: One ``[comment_text, score]`` pair per input row, where score is
        ``predict_proba(comment_text) * 5``.
        NOTE(review): presumably predict_proba yields a value in [0, 1], which
        would put the score on a 0-5 scale -- confirm against VNLP.
    """
    return [
        [row[0], sentiment_analyzer.predict_proba(row[0]) * 5]
        for row in data
    ]

In [None]:
# Characters stripped from comments. This is a custom set rather than
# string.punctuation: it does not contain "+", "=", "|" or "`".
# Written as a raw string so the backslash is a literal character and not the
# start of the invalid escape sequence "\,".
punctuation = r'''!()-[]{};':'"\,<>./?@#$%^&*_~'''
# Deletion table for str.translate, built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

# Special-character cleaning
def special_character_remover(comment):
    """Return ``comment`` with every character listed in ``punctuation`` deleted.

    Args:
        comment: Text to clean (must be a ``str``).

    Returns:
        str: The text without the listed characters; all other characters,
        including whitespace, are left as they are.
    """
    return comment.translate(_PUNCTUATION_TABLE)

# Stop-word cleaning
# NLTK's Turkish stop-word list, held in a set for membership tests.
stopword = set(stopwords.words("turkish"))
def stopwords_remover(comment):
    """Drop every whitespace-separated token of ``comment`` found in ``stopword``.

    The comparison is an exact string match. The input is converted with
    ``str()`` first, and the kept tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopword, str(comment).split())
    return " ".join(kept_tokens)

# Running word tally, accumulated across every comment passed to
# frequently_words_remover() since this cell was last executed.
count = Counter()

# Frequent-word cleaning
def frequently_words_remover(comment):
    """Add the words of ``comment`` to ``count`` and return the current top 15.

    Despite its name this function removes nothing: it updates the
    module-level tally and reports which words are most frequent so far.

    Args:
        comment: Text whose whitespace-separated words are counted.

    Returns:
        set: Up to 15 words with the highest counts in ``count``.
    """
    count.update(comment.split())
    return {word for word, _ in count.most_common(15)}


def frequency_remover(comment):
    """Remove from ``comment`` the words that are currently among the 15 most frequent.

    The comment is tallied exactly once per call and the resulting set is
    reused for every word, so counts are not inflated by the comment's length.
    Because the comment's own words are tallied before filtering, every word
    is removed while ``count`` holds 15 or fewer distinct words.

    Args:
        comment: Text to filter; converted with ``str()`` first.

    Returns:
        str: The remaining words joined by single spaces.
    """
    text = str(comment)
    frequent_words = frequently_words_remover(text)
    return " ".join(word for word in text.split() if word not in frequent_words)

# Emoji removal
def emoji_remover(comment):
    """Strip emoji and related symbol code points from ``comment``.

    Every run of characters that falls in one of the ranges below is replaced
    by the empty string; everything else is returned untouched.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub("", comment)

# ---------------- Word-root extraction
stemmer = StemmerAnalyzer()
def get_root_word(comment):
    """Return the leading segment of every analysis the stemmer yields for ``comment``.

    Each item returned by ``stemmer.predict`` is cut at its first "+" and the
    part before it is kept. Every kept part is followed by one space, so a
    non-empty result ends with a trailing space.
    NOTE(review): presumably each item is a "root+tag+..." string for one
    token -- confirm against the VNLP StemmerAnalyzer output format.
    """
    analyses = stemmer.predict(comment)
    return "".join(analysis.split("+")[0] + " " for analysis in analyses)
    
# Normalization pipeline
def normalization(data):
    """Clean the text of every row and return the result as new rows.

    Each row's element 0 is passed through, in order: emoji_remover,
    special_character_remover, stopwords_remover and get_root_word. The
    remaining elements of the row are carried over unchanged.

    The input is not modified: a fresh list of fresh rows is built, so the
    caller's original comments stay available after normalization.

    Args:
        data: Iterable of sequence rows whose element 0 is the comment text.

    Returns:
        list: One ``[cleaned_text, *rest_of_row]`` list per input row.
    """
    normalized_rows = []
    for row in data:
        without_emoji = emoji_remover(row[0])
        without_punctuation = special_character_remover(without_emoji)
        without_stopwords = stopwords_remover(without_punctuation)
        normalized_rows.append([get_root_word(without_stopwords), *row[1:]])
    return normalized_rows

In [None]:
# Scrape the comments and persist them to original_comments.xlsx.
original_comments = take_all_data()
write_excel(file_name="original_comments", data=original_comments)

In [None]:
# Reload the scraped comments from disk, normalize them and save the result
# to normalized_comments.xlsx.
scraped_rows = read_excel("original_comments")
normalized_comments = normalization(scraped_rows)
write_excel(file_name="normalized_comments", data=normalized_comments)

In [9]:
# Score the raw comments and save them to analyzed_original_comments.xlsx.
original_rows = read_excel("original_comments")
analyzed_original_comments = analyzer(original_rows)
write_excel(file_name="analyzed_original_comments", data=analyzed_original_comments)

# Score the normalized comments and save them to analyzed_normalized_comments.xlsx.
normalized_rows = read_excel("normalized_comments")
analyzed_normalized_comments = analyzer(normalized_rows)
write_excel(file_name="analyzed_normalized_comments", data=analyzed_normalized_comments)



In [None]:
# Print the text of every normalized comment, one per line.
for normalized_row in normalized_comments:
    print(normalized_row[0])