In [None]:
# Install the third-party packages used by the cells below
# (selenium, requests, unidecode, nltk, pyLDAvis).
!pip install selenium requests unidecode nltk pyldavis

In [None]:
%%shell
# Provision the runtime with Google Chrome and a chromedriver binary for Selenium.

# Refresh the apt package index and install the download/extraction tools used below.
sudo apt -y update
sudo apt install -y wget curl unzip
# libu2f-udev is installed before Chrome -- presumably a dependency of the
# Chrome package; TODO confirm it is still required.
wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
dpkg -i libu2f-udev_1.1.4-1_all.deb
# Download and install the current stable Google Chrome build.
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome-stable_current_amd64.deb
# Fetch chromedriver 118.0.5993.70 into /tmp, unpack it, make it executable
# and move it to /usr/local/bin.
# NOTE(review): the chromedriver version is pinned while Chrome is "current";
# the two may drift apart -- confirm they still match.
wget -N https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/118.0.5993.70/linux64/chromedriver-linux64.zip -P /tmp/
unzip -o /tmp/chromedriver-linux64.zip -d /tmp/
chmod +x /tmp/chromedriver-linux64/chromedriver
mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
# Python-side packages; chromedriver_autoinstaller is imported by the scraping cell.
pip install selenium chromedriver_autoinstaller

In [None]:
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import  expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from unidecode import unidecode

from time import sleep
import json
import os

import sqlite3
from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Build the shared Selenium driver: Chrome is started headless with the
# sandbox and /dev/shm usage disabled.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Let chromedriver_autoinstaller provide a chromedriver before the browser is started.
chromedriver_autoinstaller.install()
driver = webdriver.Chrome(options=chrome_options)

# Search term: typed into the SAM.gov search box, sent to the search API and
# used in the names of the .db/.json files and the download folder.
keyword = "DARPA"


## API Call ##
import requests

def modifiedDate(query, size = 25):
    """Return the ``modifiedDate`` of the most recently modified SAM.gov search hit.

    The search endpoint is queried with results sorted by ``-modifiedDate``
    and the first result's ``modifiedDate`` field is returned.

    Args:
        query: Search text sent as the ``q`` parameter.
        size: Number of results requested from the API (only the first is used).

    Returns:
        str: The ``modifiedDate`` value of the first result, or one of the
        diagnostic strings ``'No modified field'``,
        ``"No 'results' found in the response."`` or ``"Error: <status>, <body>"``.
    """
    url = "https://sam.gov/api/prod/sgs/v1/search"

    params = {
        "index": "_all",
        "page": 0,
        "mode": "search",
        "sort": "-modifiedDate",
        "size": size,
        "mfe": "true",
        "q": query,
        "qMode": "ALL",
        "is_active": "true"
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        return f"Error: {response.status_code}, {response.text}"

    data = response.json()
    embedded = data.get('_embedded') if isinstance(data, dict) else None
    results = embedded.get('results') if isinstance(embedded, dict) else None

    # An empty result list is treated like a missing one; indexing [0] on it
    # would raise IndexError.
    if not results:
        return "No 'results' found in the response."

    newest_hit = results[0]
    return newest_hit.get('modifiedDate', 'No modified field')


## SAVE DATE DATABASE ##
def create_connection():
  """Open the SQLite database that remembers the last seen modification date.

  The file is named ``modifiedDate<keyword>.db`` after the module-level
  ``keyword`` and is created by sqlite3 if it does not exist yet.
  """
  database_file = f"modifiedDate{keyword}.db"
  return sqlite3.connect(database_file)

def create_table(conn):
  """Create the ``last_modified`` table (``id``, ``date``) unless it already exists.

  The change is committed on ``conn`` before returning.
  """
  ddl = '''
    CREATE TABLE IF NOT EXISTS last_modified(
      id INTEGER PRIMARY KEY,
      date TEXT
    )
  '''
  db_cursor = conn.cursor()
  db_cursor.execute(ddl)
  conn.commit()

def get_last_modified_date(conn):
  """Return the most recently inserted date from ``last_modified``.

  Returns:
      datetime | None: The ``date`` of the row with the highest ``id`` parsed
      with ``datetime.fromisoformat``, or ``None`` when the table has no rows.
  """
  db_cursor = conn.cursor()
  db_cursor.execute('SELECT date FROM last_modified ORDER BY id DESC LIMIT 1')
  newest_row = db_cursor.fetchone()

  if not newest_row:
    return None
  return datetime.fromisoformat(newest_row[0])

def save_last_modified_date(conn, date_str):
  """Append a modification date to the ``last_modified`` table and commit.

  Args:
      conn: Open sqlite3 connection containing the ``last_modified`` table.
      date_str: ISO-format date string. A ``datetime`` is accepted too and is
          stored as ``isoformat(sep=" ")`` text.
  """
  if isinstance(date_str, datetime):
    # Convert explicitly instead of relying on sqlite3's implicit datetime
    # adapter (deprecated since Python 3.12). The text form is the same one
    # that adapter produced, so existing databases stay readable.
    date_str = date_str.isoformat(sep=" ")

  cursor = conn.cursor()
  cursor.execute("INSERT INTO last_modified (date) VALUES (?)",
                 (date_str,))
  conn.commit()


## Clean Data ##
def clean_text(text):
  """Flatten scraped text to a single line and transliterate it with ``unidecode``.

  Line breaks are replaced by a single space, so the last word of one line
  and the first word of the next are not fused together. Whitespace around
  each line is trimmed and blank lines are dropped.

  Args:
      text: Raw text scraped from the page; ``None`` or ``""`` is accepted.

  Returns:
      str: The single-line text after ``unidecode``; ``""`` for empty input.
  """
  if not text:
    return ""

  stripped_lines = (line.strip() for line in text.splitlines())
  cleaned_text = " ".join(line for line in stripped_lines if line)
  normalized_text = unidecode(cleaned_text)
  return normalized_text


def lemmatized_data(cleaned_data):
  """Lemmatize the title and description of every scraped entry.

  The required NLTK resources are downloaded on each call. For every entry
  the title and description are joined with a space, lower-cased, tokenized,
  stripped of English stop words and lemmatized with a part of speech
  obtained by tagging each token on its own.

  Args:
      cleaned_data: Iterable of dicts with ``'title'`` and ``'description'`` keys.

  Returns:
      list[str]: One space-joined string of lemmas per entry, in input order.
  """
  for resource in ('averaged_perceptron_tagger', 'punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

  wn_lemmatizer = WordNetLemmatizer()
  english_stop_words = set(stopwords.words('english'))

  def _wordnet_pos(word):
    # The first letter of the tag selects the WordNet category; anything
    # unmapped falls back to noun.
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    mapping = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "V": wordnet.VERB,
               "R": wordnet.ADV}
    return mapping.get(first_letter, wordnet.NOUN)

  def _lemmatize(document):
    kept = [tok for tok in word_tokenize(document.lower())
            if tok not in english_stop_words]
    lemmas = []
    for tok in kept:
      lemmas.append(wn_lemmatizer.lemmatize(tok, _wordnet_pos(tok)))
    return ' '.join(lemmas)

  # Build every combined text first, then lemmatize them in a second pass.
  documents = [f"{entry['title']} {entry['description']}" for entry in cleaned_data]
  return [_lemmatize(document) for document in documents]



def get_new_data():
  """Collect title and link of every result shown on the current page.

  Returns:
      list[dict]: One ``{"title": ..., "href": ...}`` dict per anchor matched
      by ``h3.margin-y-0 a.usa-link`` on the page loaded in ``driver``.
  """
  anchors = driver.find_elements(By.CSS_SELECTOR, "h3.margin-y-0 a.usa-link")
  return [
      {"title": anchor.text, "href": anchor.get_attribute("href")}
      for anchor in anchors
  ]

def read_json(file_path):
    """Load a UTF-8 JSON file, falling back to an empty list.

    Returns:
        The decoded JSON content, or ``[]`` when the file does not exist or
        does not contain valid JSON.
    """
    try:
        with open(file_path, "r", encoding='utf-8') as source:
            content = json.load(source)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    return content


def find_different_links(new_data_o):
  """Return the links of freshly listed items that are not stored yet.

  An item counts as already stored when its title appears in
  ``<keyword>.json``; comparison is by title, not by link.

  Args:
      new_data_o: Iterable of dicts with ``"title"`` and ``"href"`` keys.

  Returns:
      list: ``href`` values of the items whose title is not stored yet. If
      anything goes wrong the error is printed and an empty list is returned,
      so callers can always iterate over the result.
  """
  try:
    stored_entries = read_json(f"{keyword}.json")
    known_titles = {entry["title"] for entry in stored_entries}
    return [item["href"] for item in new_data_o
            if item["title"] not in known_titles]
  except Exception as e:
    print(e)
    # Returning None here would crash the caller's "for href in ..." loop.
    return []



## GET ALL LINKS ##
def get_all_links():
  """Walk through all result pages and collect every distinct result link.

  Starting from the page currently loaded in ``driver``, the links matched by
  ``h3.margin-y-0 a.usa-link`` are gathered, then the "next page" button is
  clicked until the current page number reaches the pager's ``max`` value.

  Returns:
      list: Distinct ``href`` values in first-seen order. The links gathered
      so far are returned even when pagination fails part-way (the error is
      printed), so the function never returns ``None``.
  """
  import re  # only needed here to read the page number

  unique_hrefs = []
  seen_hrefs = set()  # O(1) membership test; the list keeps the order

  while True:
    sleep(5)
    links = driver.find_elements(By.CSS_SELECTOR, "h3.margin-y-0 a.usa-link")

    for link in links:
      href = link.get_attribute("href")
      if href not in seen_hrefs:
        seen_hrefs.add(href)
        unique_hrefs.append(href)

    try:
      sleep(5)

      next_button = driver.find_element(By.ID, 'bottomPagination-nextPage')
      current_page_input = driver.find_element(By.XPATH, '//*[@id="bottomPagination-currentPage"]')
      page_label = current_page_input.get_attribute("aria-label")
      last_page = current_page_input.get_attribute("max")

      # The whole first number in the label is used as the current page;
      # reading only the character at index 5 stops working from page 10 on.
      # NOTE(review): assumes the first number in the aria-label is the
      # current page -- confirm against the live pager markup.
      page_match = re.search(r'\d+', page_label or "")
      if page_match is None:
        raise ValueError(f"cannot read the current page from aria-label {page_label!r}")

      if int(page_match.group()) >= int(last_page):
        return unique_hrefs
      next_button.click()

    except Exception as e:
      print(e)
      # Keep what was collected instead of falling off the end with None.
      return unique_hrefs





## get data for given links ##
def get_given_links_data(unique_hrefs):
  """Open every given link and scrape title, description, contact e-mail and attachments.

  Each lookup is best-effort: when an element cannot be found the exception
  is printed and the field keeps its empty default for that page.

  Args:
      unique_hrefs: Iterable of page URLs; ``None`` is treated as empty.

  Returns:
      list[dict]: One dict per URL with the keys ``title``, ``description``,
      ``email``, ``links_text`` and ``links``. ``title`` and ``description``
      are passed through ``clean_text``.
  """
  json_data = []
  for href in unique_hrefs or []:
    driver.get(href)
    sleep(5)

    # Reset every field for each page. Without this a failed lookup either
    # raises NameError (first page) or silently reuses the previous page's value.
    title = ""
    description = ""
    email = ""
    links = []
    links_text = []

    try:
      title = driver.find_element(By.XPATH, '//*[@id="main-container"]/ng-component/page/div/div/div[3]/div[2]/div[1]/h1').text
      print(title)
    except Exception as e:
      print(e)

    try:
      description = driver.find_element(By.XPATH, '//*[@id="description"]/div[1]').text
      print(description)
    except Exception as e:
      print(e)

    try:
      ## Page Down to get data ##
      body = driver.find_element(By.TAG_NAME, "body")

      for _ in range(10):
        body.send_keys(Keys.PAGE_DOWN)
        sleep(1)

      parent_element = driver.find_element(By.TAG_NAME, 'sam-accordion-section')
      elems = parent_element.find_elements(By.TAG_NAME, "a")

      links = [item.get_attribute("href") for item in elems]
      sleep(2)
      links_text = [item.text for item in elems]

    except Exception as e:
      print(e)
      # Keep both lists empty together so they never differ in length.
      links = []
      links_text = []

    try:
      email = driver.find_element(By.XPATH, '//*[@id="contact-primary-poc-email"]/a').text
    except Exception as e:
      print(e)

    title = clean_text(title)
    description = clean_text(description)

    json_data.append({
        "title" : title,
        "description" : description,
        "email" : email,
        "links_text" : links_text,
        "links" : links
    })
  return json_data



def append_to_json(file_path2, new_data2):
  """Extend the JSON list stored in ``file_path2`` with ``new_data2`` and rewrite the file.

  The current content is loaded with ``read_json``. The file is written as
  UTF-8 with ``indent=4`` and ``ensure_ascii=False``.

  Raises:
      ValueError: If the loaded content is not a list.
  """
  stored = read_json(file_path2)

  if not isinstance(stored, list):
    raise ValueError("Mevcut JSON yapısı liste formatında değil!")

  # Add the new items one by one to the existing list.
  stored.extend(new_data2)

  with open(file_path2, "w", encoding='utf-8') as sink:
    json.dump(stored, sink, indent=4, ensure_ascii=False)


def download_file(url, save_path):
    """Download ``url`` and store the response body at ``save_path``.

    Missing parent directories are created. Request failures (connection
    errors, timeouts, HTTP error statuses) are reported with a printed message
    rather than raised. File-system errors are not caught here.

    Args:
        url: Address to fetch; redirects are followed.
        save_path: Target file path; a bare file name is saved to the current directory.
    """
    try:
        # The timeout prevents one stalled download from blocking the whole run.
        response = requests.get(url, allow_redirects=True, timeout=60)
        response.raise_for_status()

        # dirname is "" for a bare file name, and os.makedirs("") raises.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(save_path, "wb") as file:
            file.write(response.content)

        print(f"Dosya başarıyla {save_path} yoluna kaydedildi.")
    except requests.exceptions.RequestException as e:
        print(f"Bir hata oluştu: {e}")


def download_file_from_data(data, download_folder):
  """Download every attachment listed in the scraped entries.

  For each entry the i-th link is saved as ``<download_folder>/<title>/<i-th
  link text>`` via ``download_file``. Entries whose ``links`` and
  ``links_text`` differ in length are reported and skipped, as are pairs with
  an empty URL or file name. A file-system error on one file is printed and
  the remaining files are still processed.

  Args:
      data: Iterable of dicts with ``links``, ``links_text`` and ``title``.
      download_folder: Root folder under which per-title folders are created.
  """
  for item in data:
    links = item.get("links") or []
    links_text = item.get("links_text") or []
    title = item.get("title")

    if len(links) != len(links_text):
      print(f"links veya links_text listeleri aynı uzunlukta değil {item}")
      continue

    for url, file_name in zip(links, links_text):
      if not (url and file_name):
        print(f"geçersiz url veya dosya adı: {item}")
        continue

      # NOTE(review): title and file_name come from scraped pages and may
      # contain path separators; consider sanitising them.
      save_path = os.path.join(download_folder + f"/{title}", file_name)
      try:
        download_file(url, save_path)
      except OSError as e:
        # OSError covers FileNotFoundError. The former clause named
        # json.FileNotFoundError, which does not exist and raised AttributeError.
        print(e)



# Open the SAM.gov start page and give it time to load.
driver.get("https://sam.gov/")
sleep(5)

# Type the keyword into the search field and submit the search.
search_box = driver.find_element(By.NAME, "search")
search_box.send_keys(keyword)
search_box.submit()

sleep(5) ## per robots.txt, wait 5 s between requests ##

# Close the splash dialog (sds-dialog-0) through its button; the locator is
# an absolute XPath.
close_button = driver.find_element(By.XPATH, '//*[@id="sds-dialog-0"]/layout-splash-modal/div[4]/div[2]/div/button')
close_button.click()

sleep(5)


def check_and_process_new_date(new_date_str):
  """Scrape and store data when the API reports a newer modification date.

  * No stored date yet: every result link is scraped (``get_all_links``).
  * Newer date: only items whose title is not stored yet are scraped.
  * Same or older date: nothing is scraped.

  After the scraped data has been appended to ``<keyword>.json`` and
  ``<keyword>_lemmatized.json`` the new date is stored, then the attachments
  are downloaded. The database connection is always closed.

  Args:
      new_date_str: ISO-format date string as returned by ``modifiedDate``.

  Raises:
      ValueError: If ``new_date_str`` is not a valid ISO date (for example
          when it is one of the diagnostic strings from ``modifiedDate``).
  """
  # Parse before touching the database so a malformed value cannot leak an
  # open connection.
  new_date = datetime.fromisoformat(new_date_str)

  conn = create_connection()
  try:
    create_table(conn)
    last_date = get_last_modified_date(conn)
    is_first_run = last_date is None

    if is_first_run:
      print("İlk defa veri ekliyorsunuz")
      print("new date: ", new_date)
      # "or []" guards against a helper that reports failure with None.
      links_to_scrape = get_all_links() or []
    elif new_date > last_date:
      print("new_date: ", new_date)
      print("last_date: ", last_date)
      print("Tarih yeni. Veriler güncelleniyor...")
      links_to_scrape = find_different_links(get_new_data()) or []
    else:
      print("new_date: ", new_date)
      print("last_date: ", last_date)
      if new_date == last_date:
        print("tarih aynı. Veriler güncel")
      else:
        print("Tarih eski. Veriler güncel")
      return

    scraped_data = get_given_links_data(links_to_scrape)
    append_to_json(f"{keyword}.json", scraped_data)
    append_to_json(f"{keyword}_lemmatized.json", lemmatized_data(scraped_data))

    # Store the date only once the data is persisted, and in BOTH branches.
    # Otherwise an update run never advances the stored date, and a failed
    # first run would be remembered as done.
    save_last_modified_date(conn, new_date.isoformat(sep=" "))
    if not is_first_run:
      print("Veriler güncellendi.")

    sleep(5)
    print("dosyalar indiriliyor. Lütfen bekleyiniz...")
    download_folder = f"/content/download_files_{keyword}"
    download_file_from_data(scraped_data, download_folder)
  finally:
    conn.close()


# Compare the newest modification date reported by the API with the stored
# one and scrape if needed. The browser is shut down even when a step raises,
# so a failed run does not leave a headless Chrome process behind.
try:
  last_modified_date = modifiedDate(keyword, 1)
  check_and_process_new_date(last_modified_date)
finally:
  driver.quit()


In [None]:
## data visualization ##

import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import json
# Fetch the NLTK 'punkt' and 'stopwords' resources for this cell.
nltk.download('punkt')
nltk.download('stopwords')



# Selects which "<keyword>_lemmatized.json" file is analysed below.
keyword = "DARPA"

import pyLDAvis.gensim

def read_json(file_path):
    """Return the decoded content of a UTF-8 JSON file.

    A missing file or invalid JSON yields an empty list instead of an error.
    """
    loaded = []
    try:
        with open(file_path, "r", encoding='utf-8') as handle:
            loaded = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        loaded = []
    return loaded

# Load the lemmatized documents for the keyword; read_json yields an empty
# list when the file is missing or is not valid JSON.
documents = read_json(f"{keyword}_lemmatized.json")

def preprocess(text):
    """Tokenize ``text`` and keep only the purely alphanumeric tokens.

    Returns:
        list[str]: Tokens from ``word_tokenize`` for which ``str.isalnum()`` is true.
    """
    return [token for token in word_tokenize(text) if token.isalnum()]


# Tokenize every document.
texts = [preprocess(doc) for doc in documents]

# Build the gensim dictionary and the bag-of-words corpus from the tokens.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train a 10-topic LDA model; random_state is fixed at 42.
lda_model = LdaModel(corpus, num_topics=10,
                     id2word=dictionary,
                     passes=15, random_state=42)

# Print the five top words of each topic, numbering topics from 1.
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx + 1}: {topic}")



# Render the interactive pyLDAvis view inside the notebook.
# NOTE(review): recent pyLDAvis releases expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
pyLDAvis.enable_notebook()
lda_vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis_data)


In [None]:
## CODE CELL 2 ##
import sqlite3
import json

# Load scraped entries from a JSON file into a SQLite table named after the keyword.
keyword = "DARPA"

conn = sqlite3.connect(f"{keyword}.db")
try:
    c = conn.cursor()

    # Table names cannot be bound as SQL parameters, so the identifier is
    # quoted (with embedded quotes doubled) before it is interpolated.
    table_name = '"' + keyword.replace('"', '""') + '"'

    # Create the database table
    c.execute(f'''CREATE TABLE IF NOT EXISTS {table_name}
                (title text, description text, email text, links text)''')

    # Start from an empty table so re-running the cell does not duplicate rows.
    c.execute(f"DELETE FROM {table_name}")

    # NOTE(review): the input file name is fixed to "sam_full.json" while the
    # scraping cell writes "<keyword>.json", and the later cells read "sam.db"
    # -- confirm which files are intended.
    with open("sam_full.json", "r", encoding="utf-8") as f:
        sam = json.load(f)

    for data in sam:
        # Store the list as a JSON-formatted string
        links = json.dumps(data.get("links", []))
        # Insert into the table created above; the statement previously
        # targeted a table called "sam", which this cell never creates.
        c.execute(f"INSERT INTO {table_name} (title, description, email, links) VALUES (?, ?, ?, ?)",
                  (data["title"], data["description"], data["email"], links))

    conn.commit()
finally:
    # Release the database file even when loading or inserting fails.
    conn.close()


In [None]:
## excele yazma ##
import sqlite3
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Alignment

# Export the "sirop" table of sam.db to an Excel sheet with left-aligned
# headers and column widths fitted to the content.
conn = sqlite3.connect("sam.db")
try:
  df = pd.read_sql_query("SELECT * FROM sirop", conn)
finally:
  # The connection is only needed for the query; close it even on failure.
  conn.close()

excel_file = "sirop.xlsx"
df.to_excel(excel_file, index= True)

wb = load_workbook(excel_file)
ws = wb.active

# Left-align the header row.
for cell in ws[1]:
  cell.alignment = Alignment(horizontal = "left")

for column in ws.columns:
  # Measure the text form of every value. The former loop called len() on the
  # raw value, which fails for numbers, and hid the error behind a bare
  # except, so numeric columns ended up with the minimum width.
  max_length = max(
      (len(str(cell.value)) for cell in column if cell.value is not None),
      default=0,
  )
  adjust_width = (max_length + 2)
  ws.column_dimensions[column[0].column_letter].width = adjust_width

wb.save(excel_file)

In [None]:
import sqlite3

# Print every row of the "sam" table in sam.db, one tuple per line.
conn = sqlite3.connect("sam.db")
c = conn.cursor()

rows = c.execute("SELECT * FROM sam").fetchall()
for row in rows:
  print(row)

conn.close()

('Exploration of Highly Complex Defense Systems', 'The Defense Advanced Research Projects Agency Tactical Technology Office (DARPA/TTO) is seeking applications from researchers, engineers, and subject matter experts to attend an invitation-only workshop focused on design, development, and management of highly complex systems. The workshop will be held on November 13th to 15th, 2024 at a to-be-determined hotel in the Boston, MA metropolitan area. DARPA strongly encourages non-traditional performers including small businesses, academic and research institutions, and first-time Government contractors to apply.', 'DARPA-SN-24-102@darpa.mil', '["https://sam.gov/api/prod/opps/v3/opportunities/resources/files/b115e14627bc4dfa90270fde362c6061/download?&token="]')
('Draft Broad Agency Announcement Heterogenous Adaptively Produced Photonic Interfaces (HAPPI)', 'The purpose of this Special Notice is to provide a DRAFT Broad Agency Announcement for theupcoming DARPA HAPPI Program. This Special Not