In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
import time
import random

def setup_session():
    session = requests.Session()
    retry = Retry(connect=5, backoff_factor=1)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    return session

def get_page_count(session, url, headers):
    site_request = session.get(url, headers=headers).text
    soup = BeautifulSoup(site_request, "lxml")
    return int(soup.find_all("a", class_="page")[-1].text)

def get_complaint_links(session, url, page_count, headers):
    complaint_links = []
    for page in range(1, page_count + 1):
        page_url = f"{url}?page={page}"
        page_request = session.get(page_url, headers=headers).text
        soup = BeautifulSoup(page_request, "lxml")
        for container in soup.find_all("div", class_="read-more-container"):
            read_more_div = container.find("div", class_="read-more")
            data_url = read_more_div.get('data-url')
            if data_url:
                complaint_links.append("https://www.sikayetvar.com" + data_url)
    return complaint_links

def scrape_complaint(session, url, headers):
    review_site_request = session.get(url, headers=headers).text
    soup = BeautifulSoup(review_site_request, "lxml")
    try:
        title = soup.find("h2", class_="complaint-title").text.strip()
        comment = soup.find("a", class_="complaint-description").text.strip()
        date = soup.find("div", class_="js-tooltip time tooltipstered").text.strip()
        #views = soup.find("span", class_="js-view-count").get_text()
        #username = soup.find("span", class_="username").text.strip()
        return [url, date, title, comment]
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

def main():
    company_list = ['tazedirekt']
    df_columns = ['PrimaryKey', 'href', 'date', 'title', 'comment']
    data = []

    session = setup_session()
    headers = {'user-agent': 'Mozilla/5.0'}

    for company in company_list:
        url = f"https://www.sikayetvar.com/turkcell/superbox"
        page_count = get_page_count(session, url, headers)
        complaint_links = get_complaint_links(session, url, page_count, headers)

        for idx, link in enumerate(complaint_links, 1):
            time.sleep(random.uniform(1, 2))
            complaint_data = scrape_complaint(session, link, headers)
            if complaint_data:
                data.append([idx] + complaint_data)
                print(f"Scraping progress: {idx}/{len(complaint_links)}")

    df = pd.DataFrame(data, columns=df_columns)
    excel_filename = "scraped_data.xlsx"
    df.to_excel(excel_filename, index=False)
    print("Scraping completed and data written to 'scraped_data.xlsx'")

if __name__ == "__main__":
    main()



Scraping completed and data written to 'scraped_data.xlsx'


In [1]:
import requests
from bs4 import BeautifulSoup as bts
import pandas as pd
import re
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import os


def getAndParseURL(url):
    result = requests.get(url, headers={"User-Agent":"Chrome/119.0.6045.105"}) # Safari/537.36. Chrome/103.0.0.0
    soup = bts(result.text, "html.parser")
    return soup

In [2]:
pages = []
for page in range(1,61):
    pages.append("https://www.sikayetvar.com/turkcell/fatura?page="+str(page))
    
pages

['https://www.sikayetvar.com/turkcell/fatura?page=1',
 'https://www.sikayetvar.com/turkcell/fatura?page=2',
 'https://www.sikayetvar.com/turkcell/fatura?page=3',
 'https://www.sikayetvar.com/turkcell/fatura?page=4',
 'https://www.sikayetvar.com/turkcell/fatura?page=5',
 'https://www.sikayetvar.com/turkcell/fatura?page=6',
 'https://www.sikayetvar.com/turkcell/fatura?page=7',
 'https://www.sikayetvar.com/turkcell/fatura?page=8',
 'https://www.sikayetvar.com/turkcell/fatura?page=9',
 'https://www.sikayetvar.com/turkcell/fatura?page=10',
 'https://www.sikayetvar.com/turkcell/fatura?page=11',
 'https://www.sikayetvar.com/turkcell/fatura?page=12',
 'https://www.sikayetvar.com/turkcell/fatura?page=13',
 'https://www.sikayetvar.com/turkcell/fatura?page=14',
 'https://www.sikayetvar.com/turkcell/fatura?page=15',
 'https://www.sikayetvar.com/turkcell/fatura?page=16',
 'https://www.sikayetvar.com/turkcell/fatura?page=17',
 'https://www.sikayetvar.com/turkcell/fatura?page=18',
 'https://www.sikay

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_complaint_links(base_url, total_pages=60):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.127 Safari/537.36'
    }
    
    all_complaint_links = []
    
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}?page={page}"
        response = requests.get(page_url, headers=headers)
        
        # Check if the request was successful
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all <h2> tags with class "complaint-title" and extract the href from <a> tags
            for h2 in soup.find_all('h2', class_='complaint-title'):
                a_tag = h2.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    href = a_tag['href']
                    # Convert relative URL to absolute URL
                    full_url = urljoin(base_url, href)
                    all_complaint_links.append(full_url)
        else:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
    
    # Print the collected complaint links
    if all_complaint_links:
        print(f"Found {len(all_complaint_links)} complaint links:")
        for link in all_complaint_links:
            print(link)
    else:
        print("No complaint links found.")
    
    return all_complaint_links

# Example URL
base_url = 'https://www.sikayetvar.com/turkcell/fatura'
links = fetch_complaint_links(base_url, total_pages=60)


Found 1363 complaint links:
https://www.sikayetvar.com/turkcell/turkcell-paycell-htp-click-n-play-15
https://www.sikayetvar.com/turkcell/turkcell-site-abonelik-bedeli-yansitilmasi
https://www.sikayetvar.com/turkcell/turkcell-kontrat-bitis-sorunu-hk
https://www.sikayetvar.com/turkcell/turkcell-htp-click-n-play-sorunu-4
https://www.sikayetvar.com/turkcell/turkcell-fatura-sorunu-cifte-odeme-ve-ilgisizlik
https://www.sikayetvar.com/turkcell/turkcell-fatura-borcu-var-diyor-borcum-yok
https://www.sikayetvar.com/turkcell/turkcell-musteri-hizmetleri-telefonu-yuzume-kapatti-5
https://www.sikayetvar.com/turkcell/turkcell-htp-click-n-play-islemi
https://www.sikayetvar.com/turkcell/turkcell-fazla-fatura-ibraz-etmis-sozlesmemi-eksik-yapmis
https://www.sikayetvar.com/turkcell/turkcell-30-yilinda-son-odemesi-2-gun-gecmis-diye-hatti-kisitlamasi
https://www.sikayetvar.com/turkcell/turkcell-telefon-hattim
https://www.sikayetvar.com/turkcell/turkcell-bilgim-disi-ucretlendirme
https://www.sikayetvar.com/t

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bts
from urllib.parse import urljoin
import time
import pandas as pd
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Webdriver'ı başlat
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=chrome_options)

# Set page load timeout
driver.set_page_load_timeout(30)  # 30 seconds

# Base URL
base_url = "https://www.sikayetvar.com"

# Page URLs to visit
pages = [f"{base_url}/turkcell/fatura?page={page}" for page in range(1, 10)]

# Function to parse a given URL
def getAndParseURL(url):
    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, 'html')))  # Ensure the HTML is loaded
        html = driver.page_source
        soup = bts(html, 'html.parser')
        return soup
    except TimeoutException:
        print(f"Timeout while trying to load page: {url}")
        return None

# Function to get product details
def getProductDetails(product_url):
    while True:
        try:
            html = getAndParseURL(product_url)
            if not html:
                return {}
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.complaint-detail-title')))
            product_name = driver.find_element(By.CSS_SELECTOR, 'h1.complaint-detail-title').text
            comment_paragraphs = driver.find_elements(By.CSS_SELECTOR, 'div.complaint-detail-description')
            yorum_metinleri = [paragraph.text for paragraph in comment_paragraphs]

            return {'product_name': product_name, 'comments': yorum_metinleri}
        except (TimeoutException, NoSuchElementException):
            # If an exception occurs, retry
            print(f"Error while processing: {product_url}. Retrying...")
            time.sleep(1)  # Small delay before retry

# List to hold product URLs
products = []

# Extract product URLs from each page
for page in pages:
    soup = getAndParseURL(page)
    if soup:
        for h2 in soup.find_all('h2', class_='complaint-title'):
            a_tag = h2.find('a')
            if a_tag and 'href' in a_tag.attrs:
                href = a_tag['href']
                full_url = urljoin(base_url, href)
                products.append(full_url)

# List to hold all product details
all_yorumlar = []

# Extract product details for each product URL
for page_number, product_link in enumerate(products, start=1):
    yorumlar = getProductDetails(product_link)
    if yorumlar:
        all_yorumlar.append(yorumlar)
    else:
        print(f"Skipping {product_link} due to errors or timeout")

# Tarayıcıyı kapatma
driver.quit()



WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=126.0.6478.183)
Stacktrace:
	GetHandleVerifier [0x00007FF7A4AFEEB2+31554]
	(No symbol) [0x00007FF7A4A77EE9]
	(No symbol) [0x00007FF7A493872A]
	(No symbol) [0x00007FF7A491FA9C]
	(No symbol) [0x00007FF7A494630D]
	(No symbol) [0x00007FF7A49CCCD5]
	(No symbol) [0x00007FF7A49ACDD3]
	(No symbol) [0x00007FF7A497A33B]
	(No symbol) [0x00007FF7A497AED1]
	GetHandleVerifier [0x00007FF7A4E08B2D+3217341]
	GetHandleVerifier [0x00007FF7A4E55AF3+3532675]
	GetHandleVerifier [0x00007FF7A4E4B0F0+3489152]
	GetHandleVerifier [0x00007FF7A4BAE786+750614]
	(No symbol) [0x00007FF7A4A8376F]
	(No symbol) [0x00007FF7A4A7EB24]
	(No symbol) [0x00007FF7A4A7ECB2]
	(No symbol) [0x00007FF7A4A6E17F]
	BaseThreadInitThunk [0x00007FF926DE7374+20]
	RtlUserThreadStart [0x00007FF92835CC91+33]


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time

def fetch_complaint_links(base_url, total_pages=60):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.127 Safari/537.36'
    }
    
    all_complaint_links = []
    
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}?page={page}"
        response = requests.get(page_url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for h2 in soup.find_all('h2', class_='complaint-title'):
                a_tag = h2.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    href = a_tag['href']
                    full_url = urljoin(base_url, href)
                    all_complaint_links.append(full_url)
        else:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
    
    if all_complaint_links:
        print(f"Found {len(all_complaint_links)} complaint links:")
    else:
        print("No complaint links found.")
    
    return all_complaint_links

def get_complaint_details(driver, complaint_url):
    driver.get(complaint_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.complaint-detail-title'))
        )
        complaint_title = driver.find_element(By.CSS_SELECTOR, 'h1.complaint-detail-title').text
        complaint_description = driver.find_element(By.CSS_SELECTOR, 'div.complaint-detail-description').text

        return {
            'title': complaint_title,
            'description': complaint_description
        }
    except Exception as e:
        print(f"Error while processing {complaint_url}: {e}")
        return None

if __name__ == "__main__":
    base_url = 'https://www.sikayetvar.com/turkcell/fatura'
    links = fetch_complaint_links(base_url, total_pages=60)
    
    # Initialize Selenium WebDriver
    driver_path = r'C:\Users\asus\Desktop\chromedriver-win64\chromedriver.exe'  # Update this path to your chromedriver location
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    
    all_complaint_details = []
    
    for link in links:
        details = get_complaint_details(driver, link)
        if details:
            all_complaint_details.append(details)
    
    driver.quit()
    
    for complaint in all_complaint_details:
        print(f"Title: {complaint['title']}")
        print(f"Description: {complaint['description']}\n")

Failed to retrieve page 17. Status code: 429
Failed to retrieve page 19. Status code: 429
Failed to retrieve page 20. Status code: 429
Failed to retrieve page 21. Status code: 429
Failed to retrieve page 23. Status code: 429
Failed to retrieve page 24. Status code: 429
Failed to retrieve page 25. Status code: 429
Failed to retrieve page 27. Status code: 429
Failed to retrieve page 28. Status code: 429
Failed to retrieve page 29. Status code: 429
Failed to retrieve page 31. Status code: 429
Failed to retrieve page 32. Status code: 429
Failed to retrieve page 34. Status code: 429
Failed to retrieve page 35. Status code: 429
Failed to retrieve page 37. Status code: 429
Failed to retrieve page 38. Status code: 429
Failed to retrieve page 39. Status code: 429
Failed to retrieve page 41. Status code: 429
Failed to retrieve page 42. Status code: 429
Failed to retrieve page 44. Status code: 429
Failed to retrieve page 45. Status code: 429
Failed to retrieve page 46. Status code: 429
Failed to 

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=126.0.6478.183)
Stacktrace:
	GetHandleVerifier [0x00007FF74324EEB2+31554]
	(No symbol) [0x00007FF7431C7EE9]
	(No symbol) [0x00007FF74308872A]
	(No symbol) [0x00007FF74306FA9C]
	(No symbol) [0x00007FF74309630D]
	(No symbol) [0x00007FF74311CCD5]
	(No symbol) [0x00007FF7430FCDD3]
	(No symbol) [0x00007FF7430CA33B]
	(No symbol) [0x00007FF7430CAED1]
	GetHandleVerifier [0x00007FF743558B2D+3217341]
	GetHandleVerifier [0x00007FF7435A5AF3+3532675]
	GetHandleVerifier [0x00007FF74359B0F0+3489152]
	GetHandleVerifier [0x00007FF7432FE786+750614]
	(No symbol) [0x00007FF7431D376F]
	(No symbol) [0x00007FF7431CEB24]
	(No symbol) [0x00007FF7431CECB2]
	(No symbol) [0x00007FF7431BE17F]
	BaseThreadInitThunk [0x00007FF926DE7374+20]
	RtlUserThreadStart [0x00007FF92835CC91+33]


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_complaint_links(base_url, total_pages=60):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.127 Safari/537.36'
    }
    
    all_complaint_links = []
    
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}?page={page}"
        response = requests.get(page_url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for h2 in soup.find_all('h2', class_='complaint-title'):
                a_tag = h2.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    href = a_tag['href']
                    full_url = urljoin(base_url, href)
                    all_complaint_links.append(full_url)
        else:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
    
    if all_complaint_links:
        print(f"Found {len(all_complaint_links)} complaint links.")
    else:
        print("No complaint links found.")
    
    return all_complaint_links

def get_complaint_details(driver, complaint_url):
    driver.get(complaint_url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.complaint-detail-title'))
        )
        complaint_title = driver.find_element(By.CSS_SELECTOR, 'h1.complaint-detail-title').text
        complaint_description = driver.find_element(By.CSS_SELECTOR, 'div.complaint-detail-description').text

        return {
            'title': complaint_title,
            'description': complaint_description
        }
    except Exception as e:
        print(f"Error while processing {complaint_url}: {e}")
        return None

if __name__ == "__main__":
    base_url = 'https://www.sikayetvar.com/turkcell/yurt-disi-paketleri'
    
    # Fetch complaint links using requests and BeautifulSoup
    links = fetch_complaint_links(base_url, total_pages=60)
    
    # Initialize Selenium WebDriver
    driver_path = r'C:\Users\asus\Desktop\chromedriver-win64\chromedriver.exe'  # Update this path to your chromedriver location
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    
    all_complaint_details = []
    
    for link in links:
        details = get_complaint_details(driver, link)
        if details:
            all_complaint_details.append(details)
    
    driver.quit()
    
    for complaint in all_complaint_details:
        print(f"Title: {complaint['title']}")
        print(f"Description: {complaint['description']}\n")


Failed to retrieve page 49. Status code: 429
Failed to retrieve page 50. Status code: 429
Failed to retrieve page 51. Status code: 429
Failed to retrieve page 52. Status code: 429
Failed to retrieve page 53. Status code: 429
Failed to retrieve page 54. Status code: 429
Failed to retrieve page 55. Status code: 429
Failed to retrieve page 56. Status code: 429
Failed to retrieve page 57. Status code: 429
Failed to retrieve page 58. Status code: 429
Failed to retrieve page 59. Status code: 429
Failed to retrieve page 60. Status code: 429
Found 1077 complaint links.


WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=126.0.6478.183)
Stacktrace:
	GetHandleVerifier [0x00007FF783C2EEB2+31554]
	(No symbol) [0x00007FF783BA7EE9]
	(No symbol) [0x00007FF783A6872A]
	(No symbol) [0x00007FF783A4FA9C]
	(No symbol) [0x00007FF783A7630D]
	(No symbol) [0x00007FF783AFCCD5]
	(No symbol) [0x00007FF783ADCDD3]
	(No symbol) [0x00007FF783AAA33B]
	(No symbol) [0x00007FF783AAAED1]
	GetHandleVerifier [0x00007FF783F38B2D+3217341]
	GetHandleVerifier [0x00007FF783F85AF3+3532675]
	GetHandleVerifier [0x00007FF783F7B0F0+3489152]
	GetHandleVerifier [0x00007FF783CDE786+750614]
	(No symbol) [0x00007FF783BB376F]
	(No symbol) [0x00007FF783BAEB24]
	(No symbol) [0x00007FF783BAECB2]
	(No symbol) [0x00007FF783B9E17F]
	BaseThreadInitThunk [0x00007FF926DE7374+20]
	RtlUserThreadStart [0x00007FF92835CC91+33]


In [2]:
import pandas as pd
# Create a DataFrame from the collected data
df = pd.DataFrame(all_complaint_details)
df.to_csv('yurtdişi.csv', index=False)
df

Unnamed: 0,title,description
0,Turkcell Yurt Dışı Kullanım Ücreti,Turkcell kullanıcısıyım. 24.07.2024 tarihinde ...
1,Muhatap Bulamıyorum Turkcell Beni Mobile Yönle...,Yurt dışına çıkmadan önce yurt dışı paketi ald...
2,Turkcell Yurt Dışı SMS Sorunu,Turkcell yurt dışına mesaj gönderme. Hiçbir şe...
3,Turkcell Yurt Dışı Kullanım Ücreti Haksız Fatu...,Turkcell hattımda yurt dışı arama ayarlarım ka...
4,Turkcell Yurt Dışında Olmadığım Halde Yurt Dış...,Turkcell yurt dışında olmadığım halde yurt dış...
...,...,...
74,Turkcell Yurt Dışı Kullanımı,Turkcell\n15 senelik Turkcell kullanıcıyım. Ba...
75,Turkcell Haksız Faturalandırma Yaptı.,Turkcell 21 Haziran 2024 Yunanistan seyahatimd...
76,Turkcell'in Yurt Dışı Mağduriyeti,"İki gündür ailecek yurt dışındayız, tüm aile o..."
77,Yurt Dışında Turkcell Servis Yok!,Turkcell sim kartım telefona takılı ve yurt dı...


In [2]:
# Convert the collected data to a DataFrame
data = []
for yorum in all_yorumlar:
    product_name = yorum['product_name']
    for comment in yorum['comments']:
        data.append({'product_name': product_name, 'comment': comment})

df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('product_comments_fatura3.csv', index=False)

print("Data saved to 'product_comments_fatura3.csv'")

Data saved to 'product_comments_fatura3.csv'


In [3]:
df

Unnamed: 0,product_name,comment
0,Turkcell Ücret İadesi Talep Ediyorum,Faturamın sözleşme süresi bitmiş ve bununla il...
1,Turkcell Yüksek Fatura Mağduriyeti,2023 yılının Ekim ayında numaramı daha avantaj...
2,Turkcell Faturalı Hattımın Faturasının Yüksek ...,320 liralık faturam şimdi 800 lira isteniyor. ...
3,Turkcell Devlet Tarafından Ödenen Borcu Yine T...,2015 yılında aldığım adıma kayıtlı 2 adet fatu...
4,Turkcell Bilgi Vermeden Faturalı Yapıyor,Ben Turkcell'den hiçbir zaman memnun olamadım....
5,Turkcell KVKK İhlali Fatura Ödeme İptali,13/07/2024 tarihinde kullanımımda bulunan 54**...
6,Turkcell Yetersiz Ek Paket Ve Fatura Ücreti,Ek SMS paketi almak istiyorum ve tek seçeneğim...
7,"Turkcell, Paycell Ve Vizyon Şikayeti",Turkcell hattıma ait faturamda son 3 aydır dik...
8,Turkcell Haksız Kazanç Sağlıyor,"Turkcell'de abonesi olduğum 3 hattım mevcut, h..."
9,Turkcell'in Hat Sahibinden Habersiz Aylarca Pa...,Sorunum Turkcell. Ben bayiden bir telefon aldı...


## ----------

In [8]:
df.iloc[20].comment

'Fatura tarihini değiştiremeyeceklerini belirtiyorlar. Böyle bir saçmalık yok ve çağrı merkezinde yardımcı olacak bir personel yok. Çok saçma işler yapıyorlar.\nTüketiciye yardımcı olacaklarına biz onlara yardımcı oluyoruz. Çok saçma. Bir de gecikmiş borcum olmadığına rağmen kapama açma ücreti yansıtacaklarmış, SMS ile bilgi veriyorlar. Çok gereksiz bir firma.'

In [1]:
import pandas as pd

df=pd.read_csv("product_comments.csv")

In [2]:
df

Unnamed: 0,product_name,comment
0,Turkcell Superbox Ortalama 10 Mbit Hız Veriyor.,Turkcell Superbox kullanıcısıyım. Oturduğum bi...
1,Turkcell Superbox Slack Sorunu,Turkcell Superbox ile bazı internet sitelerine...
2,Turkcell Haksız Turkcell Superbox Modem Ücreti...,"Merhaba, Turkcell Superbox kullanıcısıyım. Bağ..."
3,Turkcell Yüzde 300 Zam Yaptı,Turkcell Superbox paketim 250 GB Mayıs 2023 yı...
4,Turkcell İle 4 Aydır Yaşadığımız Şebeke Sorunu,Turkcell'den 2 kurumsal telefon hattımız bir k...
...,...,...
460,Turkcell Bayi Kiralama Adı Altında Ürünü Satıy...,16.02.2024 tarihinde Vialand'daki Turkcell bay...
461,Turkcell Superbox Arıza Ve İcra,Bozuk gelen ve çalışmayan Turkcell Superbox'ı ...
462,Turkcell Geçtiğimiz Şubat Ayında Deprem Nedeni...,Geçtiğimiz şubat ayında deprem nedeniyle 3 ay ...
463,Turkcell Superbox Modemi Nasıl Geri Gönderebil...,Superbox ımın iptalini gerçekleştirdim ama mod...


In [3]:
df2=df[["comment"]]

In [4]:
df2

Unnamed: 0,comment
0,Turkcell Superbox kullanıcısıyım. Oturduğum bi...
1,Turkcell Superbox ile bazı internet sitelerine...
2,"Merhaba, Turkcell Superbox kullanıcısıyım. Bağ..."
3,Turkcell Superbox paketim 250 GB Mayıs 2023 yı...
4,Turkcell'den 2 kurumsal telefon hattımız bir k...
...,...
460,16.02.2024 tarihinde Vialand'daki Turkcell bay...
461,Bozuk gelen ve çalışmayan Turkcell Superbox'ı ...
462,Geçtiğimiz şubat ayında deprem nedeniyle 3 ay ...
463,Superbox ımın iptalini gerçekleştirdim ama mod...


In [5]:
df=df2.rename(columns={'comment': 'comments'})

In [7]:
from transformers import BertTokenizer

# Alternatif bir Türkçe BERT modeli yükleyin
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')

# Yorumları tokenize edin
def encode_data(text_list, tokenizer, max_length):
    return tokenizer(
        text_list,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

# Verileri tokenize edin
encoded_comments = encode_data(df['comments'].tolist(), tokenizer, max_length=128)
input_ids = encoded_comments['input_ids']
attention_masks = encoded_comments['attention_mask']

# TensorFlow dataset oluşturma
import tensorflow as tf

def create_tf_dataset(inputs, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), labels))
    dataset = dataset.shuffle(len(labels)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

labels = df['sentiment'].tolist()
batch_size = 8

# TensorFlow dataset'i oluşturun
train_dataset = create_tf_dataset({'input_ids': input_ids, 'attention_mask': attention_masks}, labels, batch_size)

# Dataset'i kontrol etme
for batch in train_dataset:
    print(batch)
    break


TypeError: create_tf_dataset() missing 1 required positional argument: 'batch_size'

In [13]:
from transformers import TFBertForSequenceClassification, BertTokenizer, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup

# Alternatif bir Türkçe BERT modeli yükleyin
model_name = 'dbmdz/bert-base-turkish-cased'
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Optimizasyon ve eğitim adımları
optimizer = AdamW(learning_rate=2e-5, weight_decay=0.01)
train_steps = len(train_dataset) // batch_size
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=train_steps)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

# Modeli eğitin
model.fit(train_dataset, epochs=3)


KeyboardInterrupt: 

In [None]:
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')

In [10]:
from transformers import BertTokenizer, TFBertForTokenClassification
import tensorflow as tf
import pandas as pd

# Türkçe BERT ve NER Modeli
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
ner_model = TFBertForTokenClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=3)  # Örnek etiketler: O, B-ENTITY, I-ENTITY

# Veriyi tokenizasyon
def encode_data(text_list, tokenizer, max_length):
    return tokenizer(
        text_list,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

# Entity Ayrıştırma ve Sonuçları Yazdırma
def predict_entities(text):
    encoded = encode_data([text], tokenizer, max_length=128)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    
    # Entity tahmini
    ner_preds = ner_model.predict({'input_ids': input_ids, 'attention_mask': attention_mask})
    entities = tf.argmax(ner_preds.logits, axis=-1).numpy()[0]
    
    # Decode entity ids to labels
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy())
    entity_labels = [ner_model.config.id2label[id] for id in entities]
    
    # Filtreleme
    results = [(token, label) for token, label in zip(tokens, entity_labels) if label != 'O']
    
    return results

# DataFrame ile modeli test et
df = pd.DataFrame({'comments': ['Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz. Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim ?']})  # Örnek veri
text = df['comments'].iloc[0]
print(predict_entities(text))


All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[('[CLS]', 'LABEL_2'), ('Fi', 'LABEL_1'), ('##ber', 'LABEL_1'), ('100', 'LABEL_1'), ('##m', 'LABEL_0'), ('##b', 'LABEL_1'), ('Super', 'LABEL_0'), ('##O', 'LABEL_2'), ('##n', 'LABEL_0'), ('##line', 'LABEL_1'), ('kullanıcısı', 'LABEL_1'), ('##yım', 'LABEL_1'), ('yaklaşık', 'LABEL_1'), ('2', 'LABEL_1'), ('haftadır', 'LABEL_0'), ('@', 'LABEL_1'), ('Tw', 'LABEL_1'), ('##it', 'LABEL_2'), ('##ch', 'LABEL_1'), ('@', 'LABEL_1'), ('Ki', 'LABEL_2'), ('##ck', 'LABEL_0'), ('_', 'LABEL_1'), ('Turkey', 'LABEL_1'), ('gibi', 'LABEL_1'), ('canlı', 'LABEL_2'), ('yayın', 'LABEL_2'), ('platform', 'LABEL_2'), ('##larında', 'LABEL_2'), ('360', 'LABEL_2'), ('##p', 'LABEL_2'), ('yayın', 'LABEL_2'), ('izlerken', 'LABEL_2'), ('don', 'LABEL_0'), ('##malar', 'LABEL_0'), ('yaşıyoruz', 'LABEL_0'), ('.', 'LABEL_1'), ('Başka', 'LABEL_1'), ('hiç', 'LABEL_1'), ('bir', 'LABEL_0'), ('operatör', 'LABEL_1'), ('##ler', 'LABEL_1'), ('bu', 'LABEL_1'), ('sorunu', 'LABEL_1'), ('yaşama', 'LABEL_0'), ('##z', 'LABEL_0'), ('##ken', 

In [13]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Tokenizer ve modeli yükle
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = TFBertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)

# Örnek veriler
texts = ["Bu bir örnek cümledir.", "Başka bir örnek cümle."]
labels = [0, 1]

# Tokenize etme
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='tf')

# TensorFlow veri kümesi oluşturma
dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    labels
))

# Veriyi batch'lere ayırma
batch_size = 8
train_dataset = dataset.batch(batch_size)

# Öğrenme oranı zamanlayıcısı
learning_rate_schedule = PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=len(train_dataset) * 3,  # Toplam adım sayısı: epoch sayısı * adım başına batch sayısı
    end_learning_rate=0
)

# TensorFlow optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)

# Modeli custom training loop ile eğitin
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        outputs = model(**inputs, labels=labels, training=True)  # labels'ı model girişine ekleyin
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Eğitim döngüsü
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for batch in train_dataset:
        batch_inputs, batch_labels = batch
        inputs = {'input_ids': batch_inputs['input_ids'], 'attention_mask': batch_inputs['attention_mask']}
        labels = batch_labels
        loss = train_step(inputs, labels)
        print(f'Loss: {loss.numpy()}')

# Modeli kaydedin
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Loss: [0.31435633 0.6755159 ]
Epoch 2/3
Loss: [0.287983   0.19096033]
Epoch 3/3
Loss: [1.2006663  0.24680977]


('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')

In [15]:
import pandas as pd
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Tokenizer ve modeli yükle
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = TFBertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)

# CSV dosyasını oku
df = pd.read_csv('product_comments.csv')

# Verileri ve etiketleri hazırlayın
texts = df['comment'].tolist()
# Burada etiketleri belirlemeniz gerekebilir. Örneğin, etiketlerinizi belirlemek için bir etiketleme yapılmalı
# Aşağıdaki örnekte, sadece sıfırları kullanıyoruz; gerçek etiketleriniz olmalı.
labels = [0] * len(texts)  # Tüm etiketleri sıfır olarak belirledik, gerçek etiketlerinizi ekleyin

# Tokenize etme
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='tf')

# TensorFlow veri kümesi oluşturma
dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    labels
))

# Veriyi batch'lere ayırma
batch_size = 8
train_dataset = dataset.batch(batch_size)

# Öğrenme oranı zamanlayıcısı
learning_rate_schedule = PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=len(train_dataset) * 3,  # Toplam adım sayısı: epoch sayısı * adım başına batch sayısı
    end_learning_rate=0
)

# TensorFlow optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)

# Modeli custom training loop ile eğitin
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        outputs = model(**inputs, labels=labels, training=True)  # labels'ı model girişine ekleyin
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Eğitim döngüsü
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for batch in train_dataset:
        batch_inputs, batch_labels = batch
        inputs = {'input_ids': batch_inputs['input_ids'], 'attention_mask': batch_inputs['attention_mask']}
        labels = batch_labels
        loss = train_step(inputs, labels)
        print(f'Loss: {loss.numpy()}')

# Modeli kaydedin
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Loss: [1.5290143 1.3187249 1.3479078 1.1799304 1.2951334 1.4287262 1.3591492
 1.1887718]
Loss: [0.99476016 1.2304344  0.9943463  0.9183173  0.9679361  1.1532688
 1.0106426  1.2391732 ]
Loss: [0.5797102  0.7618297  1.0360916  0.53951734 0.82075226 0.84116316
 0.73668945 0.6217344 ]
Loss: [0.40484035 0.32308048 0.43383387 0.37515184 0.42502287 0.40324122
 0.4052467  0.5250732 ]
Loss: [0.517889   0.30045202 0.2644992  0.30821627 0.33953148 0.34247288
 0.50775564 0.30774727]
Loss: [0.4154189  0.26083553 0.44385403 0.25998116 0.2932962  0.36450598
 0.6460851  0.28018707]
Loss: [0.21995185 0.20203476 0.26992154 0.57435036 0.3197848  0.22862007
 0.22887489 0.3150981 ]
Loss: [0.28541046 0.2478235  0.23568842 0.25980842 0.22959055 0.67259645
 0.55132645 0.26884723]
Loss: [0.20092356 0.25849822 0.24578753 0.1645552  0.21574819 0.19820969
 0.19221987 0.16418383]
Loss: [0.22086865 0.19668852 0.175672   0.16178873 0.16589276 0.15146391
 0.18646763 0.22874318]
Loss: [0.19516562 0.16434394 

('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')