In [None]:
!pip install -q -U google-generativeai

In [None]:
import pathlib
import textwrap

import google.generativeai as genai
import os
from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap ``text`` in a Markdown display object rendered as a block quote.

  Bullet characters ('•') are rewritten as indented '*' list markers, then
  every line (blank ones included) is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Never embed the API key in the notebook: take it from the GOOGLE_API_KEY
# environment variable, and prompt for it (without echoing) when it is unset.
import getpass

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or getpass.getpass("Google API key: ")
# The ChatGoogleGenerativeAI clients created in later cells are built without
# an explicit key, so they rely on this environment variable being set.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

genai.configure(api_key=GOOGLE_API_KEY)

# List the models that advertise the generateContent method.
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)
model = genai.GenerativeModel('gemini-pro')

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [None]:
# Langchain with Gemini
# Reference : https://python.langchain.com/docs/integrations/chat/google_generative_ai
%pip install --upgrade --quiet  langchain-google-genai pillow

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Authentication with LLM: build the LangChain Gemini chat client and send a
# first test prompt. No key is passed here, so the client presumably reads
# GOOGLE_API_KEY from the environment.

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage

model = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
# Use invoke(): calling the chat model object directly (`model([...])`) is
# deprecated in LangChain and only produces a deprecation warning.
output = model.invoke(
    [
        SystemMessage(content="Answer in details."),
        HumanMessage(content="Who is Amazon's CEO?")
    ]
)

  warn_deprecated(


In [None]:
# Display the raw message object returned by the chat model.
output

AIMessage(content='**Andy Jassy**\n\n* Current CEO of Amazon\n* Previously served as CEO of Amazon Web Services (AWS)\n* Joined Amazon in 1997 as a marketing manager\n* Held various leadership positions within the company, including Senior Vice President of Global Operations\n* Succeeded Jeff Bezos as CEO in July 2021\n\n**Key Accomplishments as Amazon CEO:**\n\n* Oversaw Amazon\'s expansion into new markets, such as healthcare and robotics\n* Implemented initiatives to improve employee experience and reduce workplace injuries\n* Focused on sustainability and environmental initiatives\n* Led Amazon\'s acquisition of MGM Studios, expanding the company\'s presence in entertainment\n\n**Background and Education:**\n\n* Born in Scarsdale, New York\n* Graduated from Harvard College with a Bachelor of Arts in English\n* Received an MBA from Harvard Business School\n* Prior to Amazon, worked as a management consultant at McKinsey & Company\n\n**Leadership Style:**\n\n* Known for his analytica

In [None]:
# Print only the reply text so its line breaks are shown instead of "\n" escapes.
print(output.content)

**Andy Jassy**

* Current CEO of Amazon
* Previously served as CEO of Amazon Web Services (AWS)
* Joined Amazon in 1997 as a marketing manager
* Held various leadership positions within the company, including Senior Vice President of Global Operations
* Succeeded Jeff Bezos as CEO in July 2021

**Key Accomplishments as Amazon CEO:**

* Oversaw Amazon's expansion into new markets, such as healthcare and robotics
* Implemented initiatives to improve employee experience and reduce workplace injuries
* Focused on sustainability and environmental initiatives
* Led Amazon's acquisition of MGM Studios, expanding the company's presence in entertainment

**Background and Education:**

* Born in Scarsdale, New York
* Graduated from Harvard College with a Bachelor of Arts in English
* Received an MBA from Harvard Business School
* Prior to Amazon, worked as a management consultant at McKinsey & Company

**Leadership Style:**

* Known for his analytical and data-driven approach
* Values customer o

In [None]:
# This is 2.2 Analyse Spark code from Apache spark git repo
import re

# Function to interact with the LLM
def query_llm(prompt):
    """Send ``prompt`` to the Gemini chat model and return the reply text.

    A new ``ChatGoogleGenerativeAI`` client is created on every call. The
    system message asks for a detailed answer and ``prompt`` is sent as the
    human message.

    Args:
        prompt: Text to send as the human message.

    Returns:
        The ``content`` attribute of the model's response message.
    """
    model = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
    # invoke() replaces the deprecated direct call `model([...])`.
    output = model.invoke(
        [
            SystemMessage(content="Answer in details"),
            HumanMessage(content=prompt)
        ]
    )
    return output.content

# Function to handle user queries
def answer_query(query, file_path, directory_path):
    """Ask the LLM ``query`` about the source file at ``file_path``.

    The query is cleaned with ``preprocess``, the file is read with
    ``file_read`` and echoed to stdout, both are sent to ``query_llm`` as one
    prompt, and the reply is passed through ``postprocess``.

    Args:
        query: Question or instruction for the LLM.
        file_path: Path of the source file to include in the prompt.
        directory_path: Currently unused; kept so existing calls keep working.

    Returns:
        The post-processed answer text.
    """
    # Preprocess the query
    preprocessed_query = preprocess(query)

    code_content = file_read(file_path)

    # Echo the code that is being sent to the model
    print(code_content)

    # preprocess() strips trailing punctuation and whitespace, so without an
    # explicit separator the question would run straight into the first line
    # of the code.
    llm_answer = query_llm(preprocessed_query + "\n\n" + code_content)

    # Post-process the LLM's answer
    return postprocess(llm_answer)

def preprocess(query):
    """Normalise a user query before it is sent to the LLM.

    Every character that is neither a word character nor whitespace is
    dropped, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is removed.
    """
    without_symbols = re.sub(r'[^\w\s]', '', query)
    return re.sub(r'\s+', ' ', without_symbols).strip()


def postprocess(llm_answer):
    """Return ``llm_answer`` with the standard closing message appended."""
    closing_statement = "\n\nThank you for using our service. If you have any further questions, feel free to ask!"
    return llm_answer + closing_statement

# For a single file
def file_read(file_path):
    """Return the complete text of the file at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


# TODO: use this to analyse multiple files from the same repository
def read_py_files(file_paths):
    """Read several Python files and return their contents as one string.

    Args:
        file_paths: Either an iterable of file paths, or a single path. A
            single path that is a directory stands for the files directly
            inside it (sorted by name); any other single path is treated as
            a one-element list.

    Returns:
        The contents of every path ending in ``.py``, in order. Each file is
        terminated with a newline so the last line of one file never merges
        with the first line of the next. Paths without a ``.py`` extension
        are skipped. Returns ``""`` when nothing matches.

    Raises:
        OSError: If a matching file cannot be opened.
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    # A bare string would otherwise be iterated character by character and
    # silently produce an empty result.
    if isinstance(file_paths, (str, os.PathLike)):
        root = os.fspath(file_paths)
        if os.path.isdir(root):
            candidates = (os.path.join(root, name) for name in sorted(os.listdir(root)))
            file_paths = [path for path in candidates if os.path.isfile(path)]
        else:
            file_paths = [root]

    chunks = []
    for file_path in file_paths:
        file_path = os.fspath(file_path)
        # Only files with a .py extension are included
        if not file_path.endswith('.py'):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        # Keep consecutive files on separate lines
        if file_content and not file_content.endswith('\n'):
            file_content += '\n'
        chunks.append(file_content)
    return "".join(chunks)

# Example usage of answer_query function
user_query = "Help me understand the code below:"
# Root of the Spark Python examples. Set SPARK_EXAMPLES_DIR to point at a
# different checkout; the default is the original local path.
directory_path = os.environ.get(
    "SPARK_EXAMPLES_DIR",
    "/home/riddhi/Desktop/riddhi_workplace/Homework4/spark/examples/src/main/python",
)
# Derive the file path from the directory instead of repeating the full path.
file_path = os.path.join(directory_path, "wordcount.py")
response = answer_query(user_query, file_path, directory_path)
print(response)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
from operator import add

from pyspark.sql import SparkSession


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: wordcount <file>", file=sys.stderr)
        sys.exit(-1)

    s

In [None]:
%pip install --upgrade --quiet  langchain-openai tiktoken chromadb langchain

Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install -qU esprima esprima tree_sitter tree_sitter_languages

Note: you may need to restart the kernel to use updated packages.


In [None]:
# This is 2.3
# Reference: https://python.langchain.com/docs/use_cases/code_understanding

import warnings
# NOTE: with no category or module given, this silences every warning for the
# rest of the kernel session, not only the ones raised by this cell.
warnings.filterwarnings("ignore")
from pprint import pprint
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language

# Load files that contain python code from repo to loader
# NOTE(review): glob="*" presumably matches only entries directly inside this
# directory (no recursion into sub-folders) — confirm that this is intended.
loader = GenericLoader.from_filesystem(
    "/home/riddhi/Desktop/riddhi_workplace/Homework4/spark/examples/src/main/python",
    glob="*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON),
)
docs = loader.load()
# len(docs) # length of docs (No. of .py files)

# Show the content of the first loaded document only
print(docs[0].page_content)
# Print content from all files
# print("\n\n--8<--\n\n".join([document.page_content for document in docs]))

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
from typing import Tuple

from pyspark.rdd import RDD
from pyspark.sql import SparkSession


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: sort <file>", file=sys.stderr)
    

In [None]:
%pip install --upgrade --quiet langchain-text-splitters tiktoken

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Tokenization: split the loaded documents into small Python-aware chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=200,
    chunk_overlap=0,
)
texts = python_splitter.split_documents(docs)
# len(texts)
# Print every chunk, separated by a scissors marker line
print("\n\n--8<--\n\n".join(chunk.page_content for chunk in texts))

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with

--8<--

# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0

--8<--

# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#

--8<--

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,

--8<--

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

--8<--

import sys
from typing import Tuple

from pyspark.rdd import RDD
from pyspark.sql import SparkSession

--8<--

if __name__ == "__main__":
    if len(sys.argv) != 2:
      

In [None]:
pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
!python3 -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Named entity recognition
import spacy
nlp = spacy.load("en_core_web_sm")

# Feed all chunk texts through nlp.pipe so spaCy can process them in batches
# instead of running the whole pipeline once per chunk.
for doc in nlp.pipe(document.page_content for document in texts):
    # Print every entity found in the chunk together with its label
    for ent in doc.ents:
        print(ent.text, ent.label_)

# CARDINAL
Licensed PERSON
the Apache Software Foundation ORG
ASF ORG
# CARDINAL
ASF ORG
the Apache License ORG
2.0 CARDINAL
License WORK_OF_ART
# the License MONEY
# CARDINAL
# CARDINAL
License ORG
# CARDINAL
License ORG
sys
 PERSON
Tuple ORG
RDD ORG
2 CARDINAL
int(x ORG
1 CARDINAL
\ PERSON
# CARDINAL
# CARDINAL
Licensed PERSON
the Apache Software Foundation ORG
ASF ORG
# CARDINAL
ASF ORG
the Apache License ORG
2.0 CARDINAL
License WORK_OF_ART
# the License MONEY
# CARDINAL
# CARDINAL
License ORG
# CARDINAL
License ORG
sys
 PERSON
2 CARDINAL
\
                   PERSON
1 CARDINAL
\ PERSON
# CARDINAL
Licensed PERSON
the Apache Software Foundation ORG
ASF ORG
# CARDINAL
ASF ORG
the Apache License ORG
2.0 CARDINAL
License WORK_OF_ART
# the License MONEY
# CARDINAL
# CARDINAL
License ORG
# CARDINAL
License ORG
rand.randrange(0 ORG
numVertices ORG
rand.randrange(0 ORG
numVertices ORG
# CARDINAL
Licensed PERSON
the Apache Software Foundation ORG
ASF ORG
# CARDINAL
ASF ORG
the Apache License

In [None]:
# Previous Homework analysis - avg_content_length.py
# Example usage of answer_query function on the earlier homework script
hw_query = "Explain the code:"
# NOTE: `directory_path` is not defined in this cell; it must already exist in
# the kernel from the earlier Spark example cell. `file_path` is rebound here.
file_path = "/home/riddhi/Desktop/riddhi_workplace/Homework4/avg_content_length.py"
hw_response = answer_query(hw_query, file_path, directory_path)
print(hw_response)

from mrjob.job import MRJob
import re

class AverageRating(MRJob):
    # Parses each line of input data and extracts the product name and calculated 
    # unique word count for customer reviews. The input data is in CSV format with columns for date, 
    # rating, review, and product. This filters out the header row, splits the 
    # CSV data, and calculates the word count for review text using the get_word_count()
    # method before yielding the product name and score.
    def mapper(self, _, line):
        if line.lower().startswith('date'):  # Skip header row
            return

        # Input data is in CSV format with columns: date, rating, review, product
        data = line.strip().split('|')
        product = data[2]
        word_count = self.get_word_count(data[3])
        yield product, word_count

    def reducer(self, product, word_count):
        word_count_list = list(word_count) 
        total_word_count = sum(word_count_list)
        total_reviews = len(word_count_l