In [23]:
import langchain
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os
from google.colab import userdata
from openai import OpenAI
from IPython.display import Markdown, display

In [7]:
from langchain_groq import ChatGroq
# Copy the Groq API key out of Colab's `userdata` store into the environment
# variable of the same name (presumably where ChatGroq looks for it — confirm).
os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

# Chat model handle; the RAG chain later in the notebook reuses this `llm`.
llm = ChatGroq(model_name = 'openai/gpt-oss-120b', temperature = 0.7)
# Smoke test: send one prompt and render the reply's `.content` as Markdown.
display(Markdown(llm.invoke('what is the capital of India').content))

The capital of India is **New Delhi**.

In [8]:
# A Document can be built by hand: the text goes in `page_content`, and any
# bookkeeping about where it came from goes in `metadata`.
doc1 = Document(
    page_content='this is the first document',
    metadata={'source': 'file1.txt'},
)

In [9]:
# Bare last expression so the notebook echoes the hand-built document's text.
doc1.page_content

'this is the first document'

In [11]:
# Read the text file into LangChain documents. The encoding is pinned because
# the file contains non-ASCII punctuation (curly quotes); without it the
# loader uses the platform's default encoding, which is not UTF-8 everywhere.
loader = TextLoader('rag_text.txt', encoding='utf-8')
docs2 = loader.load()  # a list of Document objects

In [12]:
# Show the text and the metadata of every document the text loader returned.
for doc in docs2:
    print(doc.page_content, doc.metadata)

Privacy and Security in Online Social Media  
Week -10 Assignment (Graded) 
Maximum Marks = 30 (+6 bonus) 
1. [3 marks] What is the 3R strategy to conduct ethical studies according to the Principles of 
Humane Experimental Technique (Russell and Burch, 1959)? 
2. [1 mark] What strategy can be used to determine the required sample size for a survey 
design? 
3. [3 marks] What are the three principles of research with human subjects and mention how 
they are implemented. 
4. [2 marks] Mention two instances of research data which were deanonymized soon after 
they were released publicly. 
5. [2+2 marks] What were the major issues with the FB Emotional Contagion study? 
Suggest ways to modify the experiment such that such issues can be mitigated. 
6. [1 mark] The density histogram of the number of questions answered by OkCupid users 
follows a _____ distribution 
7. [3 marks] Draw the scope of considerations needed in data utilization and briefly explain 
each level according to the “Good 

In [13]:
# Same exercise for a PDF: load it, then print each resulting document's text
# alongside its metadata.
loader = PyPDFLoader('rag_doc.pdf')  # note: rebinds `loader` from the text-file cell
docs3 = loader.load()
for doc in docs3:
    print(doc.page_content, doc.metadata)

Privacy and Security in Online Social MediaWeek -5 Assignment (Graded Solutions)
Total Marks: 20
1. [2 Marks] What different kinds of analysis can be performed fromthe curatedsocial media data?2. [1 Mark] Name a crowd-sourced method launched by Twitter to fightmisinformation.3. [2Marks]Whatistheissuewiththecrowd-sourcedmethodlaunchedbyTwittertofight misinformation?4. [2Marks]Nametwocategoriesoftextfeaturesthatcanbeextractedfortext-basedfake news detection.5. [2 Marks] What are lexico-syntactic features? Give examples.6. [3 Marks] Please name any one system designed to fight fake news:a. Facebookb. WhatsAppc. Twitter7. [2Marks] MyfriendsentamessagetoourBSWhatsAppgroup.Afterreadingthecontent, I was unsureabouttheveracityofthemessage.NametheserviceIshoulduse to evaluate the message for me. How does that service work?8. [2 Marks] What novel characteristics does the following multi-modal fake newsdetection method comprise:a. Event Adversarial Neural Networks for Multimodal Fake News Detecti

In [14]:
# Cut the text-file documents into small overlapping pieces
# (chunk size 75, overlap 15).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=75,
    chunk_overlap=15,
)
chunks = splitter.split_documents(docs2)

# Print every chunk to see where the boundaries fell.
for chunk in chunks:
    print(chunk.page_content)

Privacy and Security in Online Social Media  
Week -10 Assignment (Graded)
Maximum Marks = 30 (+6 bonus)
1. [3 marks] What is the 3R strategy to conduct ethical studies according
according to the Principles of
Humane Experimental Technique (Russell and Burch, 1959)?
2. [1 mark] What strategy can be used to determine the required sample
sample size for a survey
design?
3. [3 marks] What are the three principles of research with human subjects
human subjects and mention how
they are implemented.
4. [2 marks] Mention two instances of research data which were
which were deanonymized soon after
they were released publicly.
5. [2+2 marks] What were the major issues with the FB Emotional Contagion
Contagion study?
Suggest ways to modify the experiment such that such issues can be
issues can be mitigated.
6. [1 mark] The density histogram of the number of questions answered by
answered by OkCupid users
follows a _____ distribution
7. [3 marks] Draw the scope of considerations needed in data ut

In [21]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Expose the Google key held in Colab's `userdata` store through the environment.
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
)

# Embed the text of every chunk, then report how many vectors came back and
# the length of the first one.
vectors = embeddings.embed_documents([piece.page_content for piece in chunks])
print(len(vectors), len(vectors[0]))

40 3072


In [30]:
# Build the FAISS vector store from the chunks and the embedding model
# (the `vectors` list computed in the previous cell is not passed in here).
vectorstore = FAISS.from_documents(chunks, embeddings)

In [31]:
# Collect every stored chunk keyed by its docstore id. This walks the store's
# public index -> id mapping and looks each id up with `docstore.search`,
# instead of reaching into the private `docstore._dict` attribute, which may
# change without notice between library versions.
all_docs = {
    doc_id: vectorstore.docstore.search(doc_id)
    for doc_id in vectorstore.index_to_docstore_id.values()
}

# Print id, text and metadata for each stored chunk.
for doc_id, doc in all_docs.items():
    print(doc_id, doc.page_content, doc.metadata)

83fbf558-9bc9-4c00-964d-63be8bda7c25 Privacy and Security in Online Social Media  
Week -10 Assignment (Graded) {'source': 'rag_text.txt'}
7752df01-d7f5-4bd8-af57-542d06ed84fe Maximum Marks = 30 (+6 bonus) {'source': 'rag_text.txt'}
3149b05a-d231-43e4-906c-c5e24dd68067 1. [3 marks] What is the 3R strategy to conduct ethical studies according {'source': 'rag_text.txt'}
08c64fd2-25c4-4970-b38b-bac35ce0f489 according to the Principles of {'source': 'rag_text.txt'}
65d8eb6b-c0eb-4c74-aa28-e0b671b154cb Humane Experimental Technique (Russell and Burch, 1959)? {'source': 'rag_text.txt'}
a5f4cd6c-c17b-497d-96b3-eaa9e4c02961 2. [1 mark] What strategy can be used to determine the required sample {'source': 'rag_text.txt'}
4752c77c-08c9-47b9-aa85-68219f125cda sample size for a survey {'source': 'rag_text.txt'}
452733d3-f234-4c6f-a042-faca86a6cca6 design? {'source': 'rag_text.txt'}
15ed8f5a-690b-45b1-a595-879824ecef4f 3. [3 marks] What are the three principles of research with human subjects {'sou

In [33]:
# Round-trip the index through disk: write it under 'faiss_index', then read
# it back using the same embedding model.
vectorstore.save_local('faiss_index')
# NOTE(review): `allow_dangerous_deserialization=True` opts in to loading
# serialized data. That is acceptable here only because the files were written
# by the line above; do not point this at an index from an untrusted source.
# `new_store` is not referenced by the visible cells that follow, which keep
# querying `vectorstore`.
new_store = FAISS.load_local('faiss_index', embeddings, allow_dangerous_deserialization=True)

In [37]:
# Fetch the 2 chunks closest to the question together with their scores.
# Judging by the name `distance_score`, a smaller value means a closer match
# — TODO confirm against the metric the FAISS store was built with.
retrived_docs = vectorstore.similarity_search_with_score('what is the maximum marks?', k=2)
for result, distance_score in retrived_docs:
    print(result.page_content, distance_score)

Maximum Marks = 30 (+6 bonus) 0.45995128
6. [1 mark] The density histogram of the number of questions answered by 0.6970546


In [43]:
# Wrap the vector store as a retriever configured with k=3; this is the
# retriever that the RAG chain further down plugs in.
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})
# Try it on one question and print the text of each returned document.
docs = retriever.invoke('what is the maximum marks?')
for result in docs:
    print(result.page_content)

Maximum Marks = 30 (+6 bonus)
6. [1 mark] The density histogram of the number of questions answered by
2. [1 mark] What strategy can be used to determine the required sample


In [47]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt skeleton for the RAG chain. `{context}` and `{question}` are the two
# placeholders that get filled in when the chain runs.
template = """
Use the following context to answer the question. Make the answer precise
context: {context}

question: {question}

answer:
"""

# Turn the raw string into a chat prompt template object.
prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The documents' texts in their original order, with one blank line
        between consecutive documents (an empty string for no documents).
    """
    texts = [doc.page_content for doc in docs]
    return '\n\n'.join(texts)

# RAG pipeline, read top to bottom:
#   1. the incoming query goes to `retriever`, whose documents `format_docs`
#      flattens into `context`; RunnablePassthrough supplies `question`
#      (presumably the query unchanged — confirm);
#   2. `prompt` fills the template with those two values;
#   3. `llm` produces the answer;
#   4. StrOutputParser post-processes the model output (the result is later
#      handed to Markdown as text).
rag_chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Run the full chain on one question and render the answer as Markdown.
query = 'what is the maximum marks?'
response = rag_chain.invoke(query)
display(Markdown(response))

The maximum marks for the assessment are **30 marks** (with an additional **6 bonus marks** that can be earned).