In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment; a later cell reads OPENAI_API_KEY from os.environ.
load_dotenv()

True

In [3]:
import openai
import openai.error
from tqdm import tqdm
# Take the key from the environment instead of hard-coding it in the notebook.
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
# performs clean up

import os
import re

# Result files are written as results-row<FROM>-row<TO>.csv by the run cell.
# The dot is escaped and the pattern is end-anchored so that re.match() in
# list_files() only selects exact result-file names for deletion (and not,
# for example, "results-row0-row5.csv.bak" or "results-row0-row5xcsv").
pattern = r'results-row\d+-row\d+\.csv\Z'

# Filled by list_files(): names of stale output files found in `directory`.
file_list = []

# Folder that is scanned for old result files and the resume checkpoint.
directory = 'E:\\Coding\\Python\\Jupyter Notebook'

def list_files():
  """Rebuild the global `file_list` with the stale output files in `directory`.

  Only regular files are considered. A name is collected when it matches
  `pattern` (a results CSV) and/or equals "latest_j.txt" (the resume
  checkpoint). Bare file names are stored, not full paths.
  """
  global file_list
  file_list = []
  for entry in os.listdir(directory):
    entry_path = os.path.join(directory, entry)
    # Sub-directories and other non-files are never cleanup candidates.
    if os.path.isfile(entry_path):
      if re.match(pattern, entry):
        file_list.append(entry)
      if entry == "latest_j.txt":
        file_list.append(entry)

def cleanup():
  """Offer to delete the stale output files currently held in `file_list`.

  Prints `file_list`. When it is non-empty the user is asked to confirm, and
  on "y"/"yes" (case-insensitive) every listed file is removed from
  `directory`. A file that has already disappeared is reported and skipped,
  so it does not stop the remaining files from being removed.

  Returns:
    1 if the user confirmed the cleanup, otherwise 0 (nothing to clean up,
    or the user declined).
  """
  print(file_list)
  if not file_list:
    # Explicit 0 instead of falling off the end and returning None.
    return 0

  answer = input("Found older result files, cleanup? y/n: ")
  if answer.strip().lower() not in ('y', 'yes'):
    print('cool')
    return 0

  for filename in file_list:
    try:
      os.remove(os.path.join(directory, filename))
    except FileNotFoundError:
      # Keep going: one missing file must not leave the others behind.
      print("File not present")
  return 1

In [5]:
## Setup logger for debugging
## Don't touch

import logging

# Named logger shared by the cells below; everything from DEBUG up is recorded.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)

# logging.getLogger() returns the same object every time this cell is run, so
# drop handlers left by an earlier run first. Otherwise each message would be
# written once per re-run and the old file handles would stay open.
for stale_handler in list(logger.handlers):
  logger.removeHandler(stale_handler)
  stale_handler.close()

# mode='w' starts debug.log afresh each time this cell is executed.
handler = logging.FileHandler('debug.log', mode='w')
handler.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

logger.addHandler(handler)


In [12]:
## Handle rate limit error with exponential backoff strategy

import functools
import random
import time
 
def retry_with_exponential_backoff(
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 10,
    errors: tuple = (openai.error.RateLimitError, openai.error.ServiceUnavailableError),
):
    """Decorate `func` so that selected errors are retried with growing delays.

    Args:
        func: Callable to wrap.
        initial_delay: Starting delay in seconds. The delay is multiplied
            before the first sleep, so the first wait is already at least
            `initial_delay * exponential_base`.
        exponential_base: Factor the delay is multiplied by after each failure.
        jitter: When true, the factor is additionally scaled by a random
            value in [1, 2) so parallel callers do not retry in lockstep.
        max_retries: Number of retries allowed before giving up.
        errors: Exception types that trigger a retry.

    Returns:
        A wrapper with the same call signature as `func`.

    Raises:
        Exception: when `max_retries` is exceeded; the last retried error is
            attached as the cause. Any exception not listed in `errors` is
            logged and re-raised unchanged.
    """

    # Preserve the wrapped function's name and docstring on the wrapper.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        num_retries = 0
        delay = initial_delay

        while True:
            try:
                return func(*args, **kwargs)

            except errors as e:
                logger.exception(e)
                num_retries += 1

                if num_retries > max_retries:
                    # Chain the last API error so the root cause stays visible.
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    ) from e

                delay *= exponential_base * (1 + jitter * random.random())

                time.sleep(delay)

            except Exception as e:
                logger.exception(e)
                # Bare raise keeps the original traceback intact.
                raise

    return wrapper
    
@retry_with_exponential_backoff
def completions_with_backoff(**kwargs):
    """Forward all keyword arguments to openai.ChatCompletion.create.

    The decorator retries the call with exponential backoff on the error
    types it is configured for.
    """
    chat_completion = openai.ChatCompletion.create(**kwargs)
    return chat_completion

In [10]:
# INPUTS
# Path of the input CSV. The run cell reads it with header=None and takes the
# website URLs from the first column.
input_csv = "E:\\Coding\\Python\\Jupyter Notebook\\input.csv"
# Row range to process; the run cell uses it as the slice from_row:to_row
# (to_row excluded). Set both to -1 to run the whole CSV.
from_row = -1
to_row = -1
# Number of websites sent to the model in one request
batch_length = 20
# Only used to name the output file (results-row<FROM>-row<TO>.csv); the run
# cell overwrites both when it processes the whole CSV.
FROM, TO = 0, 0

In [11]:
# Prompt setup: edit the text below, then re-run this cell before the run cell.

# System message sent with every batch: the task instructions followed by the
# list of allowed niches, one per line.
# NOTE(review): the f-prefix is not needed here (the text has no placeholders);
# if braces are ever added to the prompt they would have to be doubled.
prompt_context = f'''
Assist me in researching website niches using a list of URLs. For each URL, provide the niche and whether the site is in English or non-English. Use the format "URL - Niche, Language". Select one niche from the list and indicate the language as either English or non-English, e.g., "Advertising, non-English".



Academia, College & University
Advertising
Animals
Animation
Apparel & Fashion
Architecture
Artificial intelligence
Arts & Culture
Astronomy
Athlete
Automotive & Cars
Aviation
Badminton
Baseball
Basketball
Beauty & Cosmetics
Biking
Biotechnology
Blogging
Boating
Books, Reading & Magazine
Camping
Cats
CBD & Cannabis
Children, Infants & Baby
Civil Rights & Social Action
Climbing
Coaching
Community Organization
Computers
Consumer Internet
Cooking
Crafts
Cricket
Cruises
Cycling & Mountain Biking
Dancing
Design
Diet
DIY
Dogs
E-Commerce & Business
Economics
Editing
Education, Teaching & E-learning
Electronics
Entertainment
Entrepreneurship
Events
Film
Finance & Financial Services
Fishing
Fitness, Exercise & Bodybuilding
Food & Beverage
Football
Fragrance
Furniture
Gadgets
Gardening
Golf
Haircare
Health Care & Mental Health
Health, Wellness & Holistic
Hiking
Hockey
Home Improvement
Hospitality
Human Resources
Human Rights
Illustration
Interior Design
Internet
Internet Marketing
Investing
Journalism
Kid's / Child's Fashion
Kids
Learning
Lifestyle
Literature
Local Business
Luxury goods, Jewellery & Jewelry
Marketing
Martial Arts
Medicine, Nutrition, Supplements & Vitamins
Meditation
Men's Fashion
Men's Health
Men's Interest
Music
Nature
News
Non-Profit Organization
Outdoors & Adventure
Painting
Parenting
Performing Arts
Personal Development
Personal Finance
Pets
Photography
Politics
Psychology
Public Figure & Celebrity
Publishing
Real Estate
Recruiting
Religious Organization
Restaurants
Retail
Running
SaaS
Science & Technology
Seniors
Shoes & Footwear
Skateboarding
Skiing
Skincare & Makeup
Snowboarding
Soccer
Social Commerce
Social Media
Social Services
Software
Software Development
Sports
Startups
Sustainable living
Swimming
Technology
Tennis
Theatre
Travel, Leisure & Tourism
Utilities, Services & Telecommunications
Venture Capital
Video Games & Gaming
Weddings
Women's Fashion
Women's Health
Women's Interest
Writing
Yoga

'''

# Start of the user message; the run cell appends the batch of websites to it.
# NOTE(review): batch_length is baked into the text when this cell runs, so the
# number can go stale if the run cell later lowers batch_length for a short input.
prompt_question = f"What all niches do these {batch_length} websites belong to from my list and are they in English or non-english:"

In [13]:
# Run this cell to start running
import pandas as pd
import sys

# Default to "no cleanup performed" so the later checks of `flag` cannot hit a
# NameError when list_files() or cleanup() raises before `flag` is assigned.
flag = 0
try:
  list_files()
  flag = cleanup()
except FileNotFoundError:
  print("File not found, make sure directory path is correctly is setup")
  logger.error("File not found, make sure directory path is correctly is setup")
except Exception:
  print("Some error occured in cleanup")
  logger.exception("Exception: ")

# Load the input; header=None means the first CSV line is treated as data.
try:
  df = pd.read_csv(input_csv, header=None)
except FileNotFoundError:
  print("File not found, Re-upload and try again")
  logger.exception("Exception: ")
  sys.exit()
except Exception:
  # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
  # still interrupts the cell in the normal way.
  print("Some error occured in reading file")
  logger.exception("Exception: ")
  sys.exit()

df_length = len(df.index)

# Process the whole file when no range was requested or when the user just
# confirmed a cleanup of the previous outputs.
if from_row == -1 or flag:
  from_row, to_row = 0, df_length
  FROM, TO = from_row, to_row

try:
  website_names = df.iloc[from_row : to_row, 0]
  # Shrink the batch for short inputs, but never to 0: a zero step would make
  # range() in the processing loop raise ValueError.
  if 0 < len(website_names) < batch_length:
    batch_length = len(website_names)
except Exception:
  print(f"Error reading input, running for all rows")
  logger.exception("Exception: ")
  website_names = df.iloc[:, 0]


total_websites = len(website_names)

results = []

# Row index of the next website to process; also what gets checkpointed.
j = from_row

if not flag:
  # No cleanup happened, so try to resume from the checkpoint of a previous run.
  try:
    with open('latest_j.txt', 'r') as f:
      j = int(f.read())
    from_row = j
    # logging uses %-style placeholders; passing extra arguments without one
    # produces a logging error instead of the intended message.
    logger.debug("from_row: %s", from_row)
  except FileNotFoundError:
    print("Running for the first time")
    logger.exception("Exception: ")
  except Exception:
    print("Some error occured in reading j value, try again, starting from 0")
    logger.exception("Exception: ")

# Main loop: one chat completion per batch of websites. Every parsed answer is
# appended to the results CSV and the position is checkpointed in latest_j.txt
# so that re-running the cell resumes where the previous run stopped.
try:
  progress_bar = tqdm(range(from_row, to_row, batch_length), desc="Processing batches")
  for i in progress_bar:
    # Clamp to to_row so the last batch never runs past the requested range.
    website_batch = df.iloc[i : min(i + batch_length, to_row), 0]

    # Send the URLs as plain text, one per line. Formatting the Series itself
    # would embed its repr, which adds index/dtype noise and may abbreviate
    # long values or long batches with "...".
    website_lines = "\n".join(str(site) for site in website_batch)

    completion = completions_with_backoff(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "system", "content": prompt_context},
        {"role": "user", "content": f"{prompt_question}\n{website_lines}"},
      ],
      n=1,
      temperature=0.1,
      max_tokens=2048,
    )
    response = completion.choices[0].message.content

    # Keep only answer lines of the expected form "URL - Niche, Language".
    answer_lines = [line for line in response.split('\n') if ' - ' in line]

    # Pair answers with websites by position inside the batch. Surplus answer
    # lines are ignored instead of being written against the wrong website.
    for offset, answer in enumerate(answer_lines[:len(website_batch)]):
      website = website_batch.iloc[offset]
      niches = answer.split(' - ')[1]
      # Split off only the trailing language: several niche names contain
      # ", " themselves (e.g. "Books, Reading & Magazine").
      niche_list = niches.rsplit(', ', 1)
      result = [website, *niche_list]
      result_df = pd.DataFrame([result])
      result_df.to_csv(f'results-row{FROM}-row{TO}.csv', mode='a', index=False, header=False)

      # Checkpoint after every written row.
      j = i + offset + 1
      with open('latest_j.txt', 'w') as f:
        f.write(str(j))
except Exception as e:
  # Persist the last position so the next run of this cell can resume from it.
  with open('latest_j.txt', "w") as f:
    f.write(str(j))
  print("Some error occured, for more details, check logs, or, to resume re-run cell")
  print("Error: ", type(e), e)
  logger.exception("Exception: ")



['latest_j.txt', 'results-row0-row100.csv']


Processing batches: 100%|██████████| 5/5 [01:31<00:00, 18.28s/it]


In [None]:
## Testing area for multithreading, this is in beta, do not run

import pandas as pd
import concurrent.futures as cf

# NOTE(review): this experimental cell rebinds `df` and `batch_length`, which the
# run cell also uses, and reads the CSV with a header row (no header=None).
df = pd.read_csv('/content/Copy_of_The_Sun_US_the-sun.com_batch_4_-_March_20_5_20_PM_-_The_Sun_US_the-sun.com_batch_4.csv')

total_websites_to_test = 1800
max_threads = 20

websites = df.iloc[:total_websites_to_test, 0]

batch_length = 20

def make_req(website_batch):
  """Ask the model for the niches of one batch of websites.

  Args:
    website_batch: slice of the `websites` Series to classify.

  Returns:
    The text content of the first completion choice.
  """
  # Goes through the shared backoff helper so that rate-limit errors caused by
  # the parallel requests are retried instead of failing the batch.
  completion = completions_with_backoff(
      model="gpt-3.5-turbo",
      messages=[
        # `prompt_context` is the system prompt defined in the prompt cell
        # (the previously referenced name `prompt` is not defined anywhere).
        {"role": "system", "content": f"{prompt_context}"},
        {"role": "user", "content": f"What all niches do these {batch_length} websites {website_batch} belong to?"}
      ],
      n=1,
      temperature=0.1,
      max_tokens=2048
  )
  return completion.choices[0].message.content

with cf.ThreadPoolExecutor(max_workers=max_threads) as executor:
  # Map each future to the start row of its batch. Iterating up to
  # len(websites) avoids submitting empty batches when the CSV has fewer rows
  # than total_websites_to_test.
  future_to_batch = {
      executor.submit(make_req, websites[i : i + batch_length]): i
      for i in range(0, len(websites), batch_length)
  }
  for future in cf.as_completed(future_to_batch):
    batch_start = future_to_batch[future]
    try:
      data = future.result()
    except Exception as exc:
      print(f"{batch_start} generated the error {exc}")
    else:
      print(data)



# make_req(websites[:20])