# RAG Model Data Ingestion & Cleaning Pipeline

This notebook processes and cleans data from multiple sources for the RAG model:
- **dataset.csv**: Career outcome data (59 student profiles)
- **holland-codes.txt**: Holland's RIASEC career theory documentation
- **lifespan.txt**: Super's Life-Span, Life-Space theory documentation


In [1]:
# Install required packages if not already installed
# !pip install langchain langchain-community langchain-core pandas numpy

import pandas as pd
import numpy as np
import re
from pathlib import Path
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain_community.document_loaders import CSVLoader, TextLoader

In [2]:

##CSV FILE LOADING

from langchain_community.document_loaders import CSVLoader
# Load the career-outcome CSV as raw LangChain documents.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding (pandas reads the same file as UTF-8).
loader = CSVLoader("../data/dataset.csv", encoding="utf-8")
csv_documents = loader.load()

print(f"Number of documents loaded: {len(csv_documents)}")
# Guard the preview so an empty CSV reports cleanly instead of raising IndexError.
if csv_documents:
    print(f"First document preview: {csv_documents[0].page_content[:200]}...")
else:
    print("First document preview: <no documents loaded>")


Number of documents loaded: 59
First document preview: Age: 23
Education_Level: Bachelor's
Degree_Field: Computer Science
GPA: 3.8
Technical_Skills: Python;Java;SQL;AWS
Soft_Skills: Communication;Teamwork;Problem-Solving
Personality_Traits: Analytical
Lea...


In [3]:
from langchain_community.document_loaders import TextLoader

# Load the two career-theory files as raw LangChain documents.
# UTF-8 is passed explicitly: process_text_file() opens these same files with
# encoding='utf-8', and TextLoader would otherwise fall back to the platform
# default encoding, which can differ between machines.
loader = TextLoader("../data/lifespan.txt", encoding="utf-8")
lifespan_documents = loader.load()

print(f"Number of documents loaded: {len(lifespan_documents)}")
print(f"First document preview: {lifespan_documents[0].page_content[:200]}...")

loader = TextLoader("../data/holland-codes.txt", encoding="utf-8")
holland_documents = loader.load()

print(f"Number of documents loaded: {len(holland_documents)}")
print(f"First document preview: {holland_documents[0].page_content[:200]}...")



Number of documents loaded: 1
First document preview: Life-Span, Life-Space Theory
The Life-Span, Life-Space Theory is a prominent model in vocational psychology developed by American psychologist Donald Super. It emphasizes that career development is a ...
Number of documents loaded: 1
First document preview: The Holland Codes or the Holland Occupational Themes (RIASEC[1]) are a taxonomy of interests[2] based on a theory of careers and vocational choice that was initially developed by American psychologist...


In [4]:
# Data cleaning functions
def clean_text(text: str) -> str:
    """Clean and normalize text content.

    Steps, in order:
      1. Drop bracketed numeric citation markers such as "[1]" so that
         "RIASEC[1]" becomes "RIASEC" rather than "RIASEC1".
      2. Remove special characters, keeping word characters, whitespace,
         basic punctuation, and the symbols that carry meaning in skill and
         degree names (+ # / & ' %), so "C++", "C#", "CI/CD" and
         "Bachelor's" survive intact.
      3. Collapse runs of periods or commas to a single character.
      4. Collapse every whitespace run (including newlines) to one space and
         strip the ends. This runs last so that characters removed in step 2
         cannot leave double spaces behind.

    Args:
        text: The value to clean. Non-string values are not cleaned; they are
            returned as ``str(text)``.

    Returns:
        The cleaned, single-line string.
    """
    if not isinstance(text, str):
        return str(text)

    # Remove citation markers before the bracket characters themselves are stripped
    text = re.sub(r'\[\d+\]', '', text)
    # Remove special characters but keep basic punctuation and skill-name symbols
    text = re.sub(r"[^\w\s.,!?;:()\-+#/&'%]", '', text)
    # Remove multiple periods, commas, etc.
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r',{2,}', ',', text)
    # Remove extra whitespace and normalize (last, so removals leave no gaps)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def clean_csv_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean CSV data for better processing.

    Works on a copy, so the frame passed in is never modified. Free-text
    columns are run through ``clean_text``; the semicolon-separated skill
    columns are then re-joined as ``"item; item; item"`` with empty items
    dropped. Columns that are absent from ``df`` are skipped.

    Missing values (NaN/None) are left as missing rather than being turned
    into the literal string "nan", so later ``isnull()`` / ``dropna()``
    checks still report them.

    Args:
        df: Raw student-profile table.

    Returns:
        A cleaned copy of ``df`` with the same shape and columns.
    """
    def normalize_list(value) -> str:
        # Split on ';', trim each item, drop empties, and re-join uniformly
        items = (item.strip() for item in str(value).split(';'))
        return '; '.join(item for item in items if item)

    # Create a copy to avoid modifying original
    df_clean = df.copy()

    # Clean text columns
    text_columns = ['Technical_Skills', 'Soft_Skills', 'Personality_Traits',
                    'Past_Projects', 'Extracurriculars', 'Career_Outcome']

    for col in text_columns:
        if col in df_clean.columns:
            # na_action='ignore' passes missing values through untouched
            df_clean[col] = df_clean[col].map(clean_text, na_action='ignore')

    # Clean semicolon-separated lists
    list_columns = ['Technical_Skills', 'Soft_Skills']
    for col in list_columns:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].map(normalize_list, na_action='ignore')

    return df_clean

def process_text_file(file_path: str) -> str:
    """Process and clean a text file, preserving its section structure.

    The file is read as UTF-8, split into sections on blank lines, and each
    section is cleaned with ``clean_text``. Sections that are empty after
    cleaning are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned sections joined by a blank line (``"\\n\\n"``), so the
        result can be split on ``"\\n\\n"`` to recover the sections.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split BEFORE cleaning: clean_text() collapses every whitespace run,
    # newlines included, so cleaning the whole file first would erase the
    # blank-line boundaries and always yield a single section.
    # `\n\s*\n` also treats lines holding only spaces/tabs as blank.
    sections = re.split(r'\n\s*\n', content)

    cleaned_sections = (clean_text(section) for section in sections)

    return '\n\n'.join(section for section in cleaned_sections if section)

# Confirmation message so the cell output shows that the definitions above were executed.
print("Data cleaning functions defined successfully!")


Data cleaning functions defined successfully!


In [5]:
# CSV ingestion and cleaning
print("=== PROCESSING CSV DATA ===")

# Read the raw table and report its dimensions and column names
df = pd.read_csv("../data/dataset.csv")
print(f"Original CSV shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Run the cleaning pass; clean_csv_data works on a copy, so `df` stays raw
df_clean = clean_csv_data(df)
print(f"Cleaned CSV shape: {df_clean.shape}")

# Preview a few key columns from the first three cleaned rows
preview_columns = ['Age', 'Degree_Field', 'Technical_Skills', 'Career_Outcome']
print("\nSample of cleaned data:")
print(df_clean[preview_columns].head(3))

# Count of missing entries in each column
print("\nMissing values per column:")
print(df_clean.isnull().sum())


=== PROCESSING CSV DATA ===
Original CSV shape: (59, 13)
Columns: ['Age', 'Education_Level', 'Degree_Field', 'GPA', 'Technical_Skills', 'Soft_Skills', 'Personality_Traits', 'Learning_Style', 'Work_Style_Preference', 'Past_Projects', 'Internships_Completed', 'Extracurriculars', 'Career_Outcome']
Cleaned CSV shape: (59, 13)

Sample of cleaned data:
   Age             Degree_Field                         Technical_Skills  \
0   23         Computer Science                   Python; Java; SQL; AWS   
1   25             Data Science  Python; R; Machine Learning; TensorFlow   
2   22  Business Administration                      Excel; SQL; Tableau   

                           Career_Outcome  
0                Data Scientist at Google  
1  Machine Learning Engineer at Microsoft  
2             Marketing Analyst at Amazon  

Missing values per column:
Age                      0
Education_Level          0
Degree_Field             0
GPA                      0
Technical_Skills         0
Soft_Sk

In [None]:
# Process text files
print("\n=== PROCESSING TEXT FILES ===")

holland_path = Path("../data/holland-codes.txt")
lifespan_path = Path("../data/lifespan.txt")

# Separator that process_text_file() places between sections. It lives in a
# variable because f-string expressions cannot contain a backslash before
# Python 3.12.
split_pattern = '\n\n'

# Process Holland Codes
holland_content = process_text_file(holland_path)
# read_text() opens and closes the file itself (no leaked handle) and uses the
# same UTF-8 decoding as process_text_file(), so the two lengths are comparable.
holland_original_length = len(holland_path.read_text(encoding='utf-8'))
print(f"Holland Codes - Original length: {holland_original_length}")
print(f"Holland Codes - Cleaned length: {len(holland_content)}")
print(f"Holland Codes - Number of sections: {len(holland_content.split(split_pattern))}")

# Process Lifespan theory
lifespan_content = process_text_file(lifespan_path)
lifespan_original_length = len(lifespan_path.read_text(encoding='utf-8'))
print(f"Lifespan Theory - Original length: {lifespan_original_length}")
print(f"Lifespan Theory - Cleaned length: {len(lifespan_content)}")
print(f"Lifespan Theory - Number of sections: {len(lifespan_content.split(split_pattern))}")

print("\nSample of cleaned Holland Codes content:")
print(holland_content[:300] + "...")


SyntaxError: f-string expression part cannot include a backslash (4213118502.py, line 8)

In [None]:
# Build the LangChain Document objects used by the RAG pipeline
print("\n=== CREATING LANGCHAIN DOCUMENTS ===")

# One Document per row of the cleaned CSV
csv_documents = []
for idx, row in df_clean.iterrows():
    student_number = idx + 1

    # Multi-line profile text; clean_text() below collapses it onto one line
    content = f"""
    Student Profile {student_number}:
    Age: {row['Age']}
    Education Level: {row['Education_Level']}
    Degree Field: {row['Degree_Field']}
    GPA: {row['GPA']}
    Technical Skills: {row['Technical_Skills']}
    Soft Skills: {row['Soft_Skills']}
    Personality Traits: {row['Personality_Traits']}
    Learning Style: {row['Learning_Style']}
    Work Style Preference: {row['Work_Style_Preference']}
    Past Projects: {row['Past_Projects']}
    Internships Completed: {row['Internships_Completed']}
    Extracurriculars: {row['Extracurriculars']}
    Career Outcome: {row['Career_Outcome']}
    """

    # Metadata kept alongside the text for filtering and attribution
    profile_metadata = {
        "source": "dataset.csv",
        "type": "student_profile",
        "student_id": student_number,
        "degree_field": row['Degree_Field'],
        "career_outcome": row['Career_Outcome']
    }

    csv_documents.append(
        Document(page_content=clean_text(content), metadata=profile_metadata)
    )

print(f"Created {len(csv_documents)} student profile documents")

# One Document per career-theory text file
holland_doc = Document(
    page_content=holland_content,
    metadata={
        "source": "holland-codes.txt",
        "type": "career_theory",
        "theory": "Holland RIASEC"
    }
)

lifespan_doc = Document(
    page_content=lifespan_content,
    metadata={
        "source": "lifespan.txt",
        "type": "career_theory",
        "theory": "Super Life-Span Life-Space"
    }
)

theory_documents = [holland_doc, lifespan_doc]
print(f"Created {len(theory_documents)} theory documents")

# Student profiles first, theory documents last
all_documents = [*csv_documents, *theory_documents]
print(f"Total documents for RAG: {len(all_documents)}")


In [None]:
# Data quality analysis and validation
print("\n=== DATA QUALITY ANALYSIS ===")

# Row-level completeness of the cleaned CSV
total_records = len(df_clean)
complete_records = len(df_clean.dropna())
print("CSV Data Quality:")
print(f"- Total records: {total_records}")
print(f"- Complete records: {complete_records}")
print(f"- Records with missing values: {total_records - complete_records}")

# Fully identical rows
duplicates = df_clean.duplicated().sum()
print(f"- Duplicate records: {duplicates}")

# Ten most frequent degree fields
print("\nDegree Field Distribution:")
degree_counts = df_clean['Degree_Field'].value_counts()
print(degree_counts.head(10))

# Ten most frequent career outcomes
print("\nCareer Outcome Distribution:")
career_counts = df_clean['Career_Outcome'].value_counts()
print(career_counts.head(10))

# Section counts of the cleaned theory texts
print("\nText Content Quality:")
split_pattern = '\n\n'
holland_section_count = len(holland_content.split(split_pattern))
lifespan_section_count = len(lifespan_content.split(split_pattern))
print(f"- Holland Codes sections: {holland_section_count}")
print(f"- Lifespan Theory sections: {lifespan_section_count}")

# Metadata of the first three documents only
print("\nDocument Metadata Summary:")
for position, doc in enumerate(all_documents[:3], start=1):
    print(f"Document {position}: {doc.metadata}")

print("\n✅ Data cleaning and processing completed successfully!")
print(f"📊 Ready for RAG model with {len(all_documents)} documents")


## Embedding and Vector Store


In [None]:
%pip install numpy  
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the SentenceTransformer model



/Users/satyammishra/Desktop/RAG MODEL/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'numpy'

In [None]:

class Embeddings:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.collection = None
        self.index_name = "rag_index"
        
        