In [1]:
import torch
from google.colab import userdata
userdata.get('hf')
if torch.cuda.is_available():
    print(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available.")
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))


CUDA is available. GPU: Tesla T4
Tesla T4
(7, 5)


### Installing chromedriver

In [None]:
import os
import shutil
import re
import subprocess
import urllib
import zipfile
import requests


"""
Scrapes and installs chromium from linux mint 21.3(virginia) packages site.
Link: http://packages.linuxmint.com/pool/upstream/c/chromium/
Scrapes and installs chromedriver from Chrome for Testing page.
Link: https://googlechromelabs.github.io/chrome-for-testing/
"""

class CantGetLatestChromiumVersionError(Exception):
    """Happens when regex failed"""

class ChromiumInstallationFailedException(Exception):
    """
    Happens when deb package not installed
    Check the downloaded chroumium deb file
    """

main_url = "http://packages.linuxmint.com/pool/upstream/c/chromium/"
work_dir = "/content"

def get_chromium_latest_version() -> str:
    # A request to packages.linuxmint.com for getting latest version of chromium
    # e.g. "chromium_121.0.6167.160~linuxmint1+virginia_amd64.deb"
    r = requests.get(main_url)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Find latest version
    pattern = '<a\shref="(chromium_[^"]+linuxmint1%2Bvirginia_amd64.deb)'
    latest_version_search = re.search(pattern, text)
    if latest_version_search:
        latest_version = latest_version_search.group(1)
    else:
        raise CantGetLatestChromiumVersionError("Failed to get latest chromium version!")
    return latest_version

def install_chromium(latest_version: str, deb_file: str, quiet: bool):
    # Full url of deb file
    url = f"{main_url}{latest_version}"

    # Download deb file
    if quiet:
        command = f"wget -q -O {work_dir}/{deb_file} {url}"
    else:
        command = f"wget -O {work_dir}/{deb_file} {url}"
    print(f"Downloading: {deb_file}")
    # os.system(command)
    !$command

    # Install deb file
    if quiet:
        command = f"apt-get install {work_dir}/{deb_file} >> apt.log"
    else:
        command = f"apt-get install {work_dir}/{deb_file}"
    print(f"Installing: {deb_file}")
    # os.system(command)
    !$command

def check_chromium_installation(deb_file: str):
    try:
        subprocess.call(["chromium"])
        print("Chromium installation successfull.\n")
        # If installation successfull we can remove deb file
        # Delete deb file from disk
        os.remove(f"{work_dir}/{deb_file}")
    except FileNotFoundError:
        raise ChromiumInstallationFailedException("Chromium Installation Failed!")

def get_chromedriver_url(deb_file: str) -> str | None:
    # Get content of crhomedriver page
    url = "https://googlechromelabs.github.io/chrome-for-testing/"
    r = requests.get(url)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Get chromium version from deb file's name
    version_number = deb_file.split("chromium_")[-1].split(".")[0]

    # Example: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip
    pattern = f'https://[^<]+/{version_number}[^<]+/linux64/chromedriver-linux64.zip'
    # Find latest version
    chromedriver_url_search = re.search(pattern, text)
    if chromedriver_url_search:
        chromedriver_url = chromedriver_url_search.group()
        return chromedriver_url
    else:
        return None

def install_chromedriver(deb_file: str, quiet: bool):
    url = get_chromedriver_url(deb_file)
    if not url:
        # Regex failed, lets create url manually based on chromium version
        # This happpens when mint repository a day behind chrome for testing page
        chromium_version = deb_file.split("chromium_", 1)[-1].split("~", 1)[0]
        url = f"https://storage.googleapis.com/chrome-for-testing-public/{chromium_version}/linux64/chromedriver-linux64.zip"
    file_name = url.split("/")[-1]
    # Download chromedriver
    chromedriver_zip = f"{work_dir}/{file_name}"
    if quiet:
        command = f"wget -q -O {chromedriver_zip} {url}"
    else:
        command = f"wget -O {chromedriver_zip} {url}"
    print(f"Downloading: {file_name}")
    # os.system(command)
    !$command

    # Extract chromedriver from zip
    with zipfile.ZipFile(chromedriver_zip) as zpf:
        _ = zpf.extract(member="chromedriver-linux64/chromedriver", path=work_dir)

    # Remove chromedriver-linux64.zip file
    os.remove(chromedriver_zip)

    # Move extracted chromedriver binary file to /usr/bin directory
    source = f"{work_dir}/chromedriver-linux64/chromedriver"
    destination = "/usr/bin/chromedriver"
    os.rename(source, destination)

    # Make chromedriver binary executable
    os.system(f"chmod +x {destination}")

    # Remove empty chromedriver-linux64 folder
    shutil.rmtree(f"{work_dir}/chromedriver-linux64")

    print("Chromedriver installed")

def install_selenium_package(quiet: bool):
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    # Update apt
    !sudo apt update >> quiet.log
    # Get the latest version of chromium from linux mint packages site
    latest_version = get_chromium_latest_version()
    # Name of the deb file
    deb_file = urllib.parse.unquote(latest_version, "utf-8")
    # Download and install chromium for ubuntu 22.04
    install_chromium(latest_version, deb_file, quiet)
    # Check if installation succesfull
    check_chromium_installation(deb_file)
    # Install chromedriver
    install_chromedriver(deb_file, quiet)
    # Finally install selenium package
    install_selenium_package(quiet)

if __name__ == '__main__':
    quiet = True # verboseness of wget and apt
    main(quiet)




W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Downloading: chromium_134.0.6998.35~linuxmint1+virginia_amd64.deb
Installing: chromium_134.0.6998.35~linuxmint1+virginia_amd64.deb
E: Sub-process /usr/bin/dpkg returned an error code (1)
Chromium installation successfull.

Downloading: chromedriver-linux64.zip
Chromedriver installed


In [2]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

## Scraping professor details

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Define URL for the Kelley faculty directory
URL = "https://kelley.iu.edu/faculty-research/faculty-directory/index.html"

def scrape_kelley_faculty():
    # Set up the Chrome options
    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    # Go to the faculty directory URL
    driver.get(URL)
    time.sleep(5)  # Wait for JavaScript to load the page

    # Get the page source after the JavaScript has loaded
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    professors = []

    # Loop through each professor's information
    for prof in soup.select('div.faculty-directory'):
        # Extract the name using the correct selector
        name_tag = prof.select_one('div.grid-item > div.text > h3 > strong > a')
        name = name_tag.text.strip() if name_tag else "N/A"

        # Extract the email using the correct selector
        email_tag = prof.select_one('div.grid-item > div.text > p > a')
        email = email_tag["href"].replace("mailto:", "") if email_tag else "N/A"

        # Extract the research area using the correct selector
        research_area_tag = prof.select_one('div.grid-item > div.text > div.text > p')
        research_area = research_area_tag.text.strip() if research_area_tag else "N/A"

        # Add the professor's data to the list
        professors.append({"name": name, "email": email, "research_area": research_area})

    return professors

# Example Usage
professors_data = scrape_kelley_faculty()
print(professors_data)


[{'name': 'N/A', 'email': 'N/A', 'research_area': 'N/A'}, {'name': 'Paul Louis Acito', 'email': 'placito@iu.edu', 'research_area': 'Management and Entrepreneurship'}, {'name': 'Larissa J Adamiec', 'email': 'ladamiec@iu.edu', 'research_area': 'Kelley School of Business Indianapolis'}, {'name': 'Vikram Ahuja', 'email': 'vahuja@iu.edu', 'research_area': 'Industrial Organization, Microeconomics, Game Theory, Marketing, and Management'}, {'name': 'Frank E. Akaiwa', 'email': 'fakaiwa@iu.edu', 'research_area': 'Business Process Management, Enterprise Resource Planning (ERP), Enterprise Applications, XML Tools and Techniques, Database Management and Spreadsheet Analysis Tools and Techniques, Emerging Technologies'}, {'name': 'Benjamin Ale-Ebrahim', 'email': 'baleebr@iu.edu', 'research_area': 'CPCS-COMM'}, {'name': 'James B Anderson', 'email': 'jbanders@iu.edu', 'research_area': 'CPCS-K201'}, {'name': 'Kyle J. Anderson', 'email': 'kyjander@iu.edu', 'research_area': 'E-Commerce, Industrial organ

In [4]:
import pandas as pd
def clean_research_areas(professors_data):
    # Convert the list of professors to a pandas DataFrame
    df = pd.DataFrame(professors_data)

    # Filter out rows with 'professor', 'Indianapolis', or 'Bloomington' in the research area
    df_cleaned = df[~df['research_area'].str.contains('professor|Indianapolis|Bloomington', case=False, na=False)]

    # Ensure any 'N/A' entries stay as 'N/A'
    df_cleaned['research_area'] = df_cleaned['research_area'].apply(lambda x: 'N/A' if x == 'N/A' else x)

    return df_cleaned

In [5]:
cleaned_data = clean_research_areas(professors_data)

# Print cleaned data
print(cleaned_data)

                      name            email  \
0                      N/A              N/A   
1         Paul Louis Acito   placito@iu.edu   
3             Vikram Ahuja    vahuja@iu.edu   
4          Frank E. Akaiwa   fakaiwa@iu.edu   
5     Benjamin Ale-Ebrahim   baleebr@iu.edu   
6         James B Anderson  jbanders@iu.edu   
7         Kyle J. Anderson  kyjander@iu.edu   
8   Spencer Blake Anderson  speander@iu.edu   
9           Angela Andrews  abandrew@iu.edu   
10         Demetra Andrews   demetra@iu.edu   
11           Sridhar Arcot    sarcot@iu.edu   
12             Richard Ash      ashr@iu.edu   
13      David B. Audretsch  daudrets@iu.edu   
14            Doug Austrom  daustrom@iu.edu   
15          Amrou Awaysheh  awaysheh@iu.edu   

                                        research_area  
0                                                 N/A  
1                     Management and Entrepreneurship  
3   Industrial Organization, Microeconomics, Game ...  
4   Business Process Ma

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['research_area'] = df_cleaned['research_area'].apply(lambda x: 'N/A' if x == 'N/A' else x)


In [6]:
cleaned_data

Unnamed: 0,name,email,research_area
0,,,
1,Paul Louis Acito,placito@iu.edu,Management and Entrepreneurship
3,Vikram Ahuja,vahuja@iu.edu,"Industrial Organization, Microeconomics, Game ..."
4,Frank E. Akaiwa,fakaiwa@iu.edu,"Business Process Management, Enterprise Resour..."
5,Benjamin Ale-Ebrahim,baleebr@iu.edu,CPCS-COMM
6,James B Anderson,jbanders@iu.edu,CPCS-K201
7,Kyle J. Anderson,kyjander@iu.edu,"E-Commerce, Industrial organization, Online pr..."
8,Spencer Blake Anderson,speander@iu.edu,Judgment and Decision Making in Financial Acco...
9,Angela Andrews,abandrew@iu.edu,"Executive perks, corporate governance, persona..."
10,Demetra Andrews,demetra@iu.edu,"Consumer Behavior in Retail Contexts, Choice C..."


In [7]:
cleaned_data = cleaned_data[~cleaned_data["name"].isin(["N/A"]) & ~cleaned_data["research_area"].isin(["N/A"])]



In [8]:
cleaned_data

Unnamed: 0,name,email,research_area
1,Paul Louis Acito,placito@iu.edu,Management and Entrepreneurship
3,Vikram Ahuja,vahuja@iu.edu,"Industrial Organization, Microeconomics, Game ..."
4,Frank E. Akaiwa,fakaiwa@iu.edu,"Business Process Management, Enterprise Resour..."
5,Benjamin Ale-Ebrahim,baleebr@iu.edu,CPCS-COMM
6,James B Anderson,jbanders@iu.edu,CPCS-K201
7,Kyle J. Anderson,kyjander@iu.edu,"E-Commerce, Industrial organization, Online pr..."
8,Spencer Blake Anderson,speander@iu.edu,Judgment and Decision Making in Financial Acco...
9,Angela Andrews,abandrew@iu.edu,"Executive perks, corporate governance, persona..."
10,Demetra Andrews,demetra@iu.edu,"Consumer Behavior in Retail Contexts, Choice C..."
11,Sridhar Arcot,sarcot@iu.edu,Finance


In [9]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4


In [10]:
import pymupdf

def extract_text_from_pdf(pdf_path):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load LLaMA model (download from Hugging Face if not already)
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"  # Adjust based on your model version

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,token="")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto",token="")

def generate_email_llama(professor, resume_text):
    prompt = f"""
    You are an AI email assistant. Write a concise and professional cold email to Professor {professor['name']}
    inquiring about a research assistant position. Their research area is {professor['research_area']}.

    **Applicant Resume:**
    {resume_text}

    **Constraints:**
    - Start with a polite greeting
    - Mention the professor's research area
    - Highlight relevant experience from the resume
    - Politely ask about opportunities in their lab
    - Keep it under 200 words

    Generate the email below:
    """

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to("cuda")
    output = model.generate(**inputs, max_length=1100, temperature=0.7)
    email_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return email_text

resume_text = extract_text_from_pdf("/content/Data_Scientist_Resume_Sarah (1).pdf")


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Loop through all rows in cleaned_data and generate an email for each professor
for index, professor in cleaned_data.iterrows():
    email_content = generate_email_llama(professor, resume_text)
    print(f"Email for {professor['Name']}:\n{email_content}\n")

Gemini (instead of llama)

In [13]:
!pip install -q -U google-genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/144.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m143.4/144.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
resume_text = extract_text_from_pdf("/content/Data_Scientist_Resume_Sarah (1).pdf")

In [None]:
import google.generativeai as genai
import pandas as pd

# Load your Gemini API key
genai.configure(api_key="")

# Load your database (CSV file in this example)
df = cleaned_data



# Function to generate personalized emails
def generate_email(name, research_area):
    prompt = f"""
    Given my resume below:
    {resume_text}

    Generate a concise, professional cold email to {name},
    who works in {research_area}. Highlight my relevant skills and interest
    in their research. Keep it polite, direct, and under 100 words.
    DO NOT include any placeholders like [mention a specific area]
    I will not go through this email before sending so generate a fully ready-to-send email.
    If specific details about their research are not available, keep it general.
    Write the email in a way that it can be sent automatically without manual edits.

    Do not include the subject. This will be automatically sent from my email address,
    so do not include anything I need to manually add.
    """

    response = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
    return response.text

# Loop through the database and generate emails
emails = []
for _, row in df.iterrows():
    email_content = generate_email(row["name"], row["research_area"])
    emails.append({"Name": row["name"], "Email": row["email"], "Message": email_content})

# Convert results to a DataFrame and save
email_df = pd.DataFrame(emails)
email_df.to_csv("generated_emails.csv", index=False)

print("Emails generated successfully!")


Emails generated successfully!


In [39]:
email_df

Unnamed: 0,Name,Email,Message
0,Paul Louis Acito,placito@iu.edu,"Dear Mr. Acito,\n\nMy name is Sarah Dias Barre..."
1,Vikram Ahuja,vahuja@iu.edu,"Dear Mr. Ahuja,\n\nMy name is Sarah Dias Barre..."
2,Frank E. Akaiwa,fakaiwa@iu.edu,"Dear Mr. Akaiwa,\n\nMy name is Sarah Dias Barr..."
3,Benjamin Ale-Ebrahim,baleebr@iu.edu,"Dear Mr. Ale-Ebrahim,\n\nMy name is Sarah Dias..."
4,James B Anderson,jbanders@iu.edu,"Dear Mr. Anderson,\n\nMy name is Sarah Dias Ba..."
5,Kyle J. Anderson,kyjander@iu.edu,"Dear Mr. Anderson,\n\nMy name is Sarah Dias Ba..."
6,Spencer Blake Anderson,speander@iu.edu,"Dear Mr. Anderson,\n\nMy name is Sarah Dias Ba..."
7,Angela Andrews,abandrew@iu.edu,"Dear Ms. Andrews,\n\nMy name is Sarah Dias Bar..."
8,Demetra Andrews,demetra@iu.edu,"Dear Ms. Andrews,\n\nMy name is Sarah Dias Bar..."
9,Sridhar Arcot,sarcot@iu.edu,"Dear Mr. Arcot,\n\nMy name is Sarah Dias Barre..."


In [None]:
import smtplib
from email.mime.text import MIMEText

EMAIL_ADDRESS = "sarah9db@gmail.com"
EMAIL_PASSWORD = ""

def send_email(prof, email_content):
    msg = MIMEText(email_content)
    msg['Subject'] = f"Research Assistant Inquiry"
    msg['From'] = EMAIL_ADDRESS
    msg['To'] = prof

    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
        server.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
        server.sendmail(EMAIL_ADDRESS, 'sdiasbar@iu.edu', msg.as_string())

#test
send_email('sdiasbar@iu.edu',email_df['Message'][0])

In [None]:
for _, row in email_df.iterrows():
    if row['Email'] != 'N/A':
        email = row['Email']
        email_text = row['Message']
        send_email(email, email_text)