In [8]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalises raw text with spaCy.

    Every text is lower-cased and run through the module-level ``nlp``
    model; the lemmas of the tokens that are neither stop words nor
    punctuation are joined back together with single spaces.
    """

    def __init__(self):
        # Reuse the spaCy model loaded at module level.
        self.nlp = nlp

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X, y=None):
        """Return a list holding the preprocessed form of each text in ``X``."""
        return list(map(self.preprocess_text, X))

    def preprocess_text(self, text):
        """Return the space-joined lemmas of the kept tokens of ``text``."""
        kept_lemmas = []
        for token in self.nlp(text.lower()):
            # Skip stop words and punctuation.
            if token.is_stop or token.is_punct:
                continue
            kept_lemmas.append(token.lemma_)
        return " ".join(kept_lemmas)

def load_data(file_path):
    """Read the CSV at ``file_path`` and keep only rows that have an ``Eligibility`` value.

    The surviving rows keep their original index labels (the index is not
    reset).
    """
    raw = pd.read_csv(file_path)
    has_eligibility = raw["Eligibility"].notna()
    return raw[has_eligibility]

def train_nlp_model(df):
    """Fit the text pipeline on ``df["Eligibility"]``.

    Returns:
        A ``(pipeline, tfidf_matrix, df)`` tuple: the fitted pipeline
        (``TextPreprocessor`` followed by ``TfidfVectorizer``), the TF-IDF
        matrix of the eligibility texts, and ``df`` itself, unchanged.
    """
    pipeline = make_pipeline(
        TextPreprocessor(),
        TfidfVectorizer(max_df=0.85, max_features=5000, ngram_range=(1, 2)),
    )
    eligibility_texts = df["Eligibility"]
    tfidf_matrix = pipeline.fit_transform(eligibility_texts)
    return pipeline, tfidf_matrix, df

def recommend_scholarships(user_input, pipeline, tfidf_matrix, df, top_n=9, threshold=0.3):
    """Recommend the scholarships whose eligibility text best matches ``user_input``.

    Args:
        user_input: Free-text description written by the applicant.
        pipeline: Fitted pipeline exposing the ``'textpreprocessor'`` and
            ``'tfidfvectorizer'`` named steps.
        tfidf_matrix: TF-IDF matrix of the eligibility texts. Row ``i`` is
            looked up as ``df.iloc[i]``, so it is assumed to be in the same
            row order as ``df``.
        df: Scholarship table with the columns ``Scholarship Name``,
            ``Provider``, ``Eligibility`` and ``Amount``.
        top_n: Maximum number of recommendations. A value of zero or less
            yields no candidates.
        threshold: Minimum cosine similarity a candidate must reach.

    Returns:
        A list of dicts (the four columns above plus ``Similarity Score``),
        ordered by decreasing similarity. When nothing qualifies, a
        one-element list holding a "No match found" placeholder.
    """
    no_match = [{"Scholarship Name": "No match found", "Provider": "N/A", "Eligibility": "N/A", "Amount": "N/A", "Similarity Score": 0}]

    # Vectorise the query exactly like the corpus: preprocess, then TF-IDF.
    preprocessor = pipeline.named_steps['textpreprocessor']
    vectorizer = pipeline.named_steps['tfidfvectorizer']
    user_tfidf = vectorizer.transform([preprocessor.preprocess_text(user_input)])

    similarity_scores = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Rank best-first, then cut. Slicing the reversed ranking avoids the
    # ``[-top_n:]`` pitfall where ``top_n == 0`` selects *every* row, and
    # ``max`` keeps a negative ``top_n`` from slicing from the wrong end.
    best_first = similarity_scores.argsort()[::-1]
    top_indices = best_first[:max(top_n, 0)]

    recommendations = []
    for idx in top_indices:
        score = similarity_scores[idx]
        if score >= threshold:
            scholarship = df.iloc[idx][["Scholarship Name", "Provider", "Eligibility", "Amount"]].to_dict()
            scholarship["Similarity Score"] = score
            recommendations.append(scholarship)

    # Callers always read element 0, so never hand back an empty list.
    return recommendations if recommendations else no_match

def evaluate_model(df, pipeline, tfidf_matrix):
    """Print top-1 retrieval metrics for a random 35% sample of ``df``.

    Each sampled eligibility text is used as a query; the name of the best
    recommendation is compared with the scholarship the text came from.
    Queries that produce the "No match found" placeholder are left out of
    the metrics. Accuracy and weighted precision, recall and F1 are printed;
    nothing is returned.

    NOTE(review): the training split is only used to report its size. The
    queries are eligibility texts taken from ``df`` itself, so when
    ``tfidf_matrix`` was built from that same ``df`` (as the script's main
    block does) every query is already present in the index and the scores
    mostly measure self-retrieval rather than generalisation.

    Args:
        df: Scholarship table with ``Eligibility`` and ``Scholarship Name``.
        pipeline: Fitted pipeline passed through to ``recommend_scholarships``.
        tfidf_matrix: TF-IDF matrix passed through to ``recommend_scholarships``.
    """
    no_match_label = "No match found"

    X_train, X_test, _y_train, y_test = train_test_split(
        df["Eligibility"], df["Scholarship Name"], test_size=0.35, random_state=41
    )
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Only the best match is scored, so a single recommendation is enough;
    # the first element is the same whichever ``top_n >= 1`` is requested.
    predicted_names = [
        recommend_scholarships(text, pipeline, tfidf_matrix, df, top_n=1)[0]["Scholarship Name"]
        for text in X_test
    ]

    # Keep the (actual, predicted) pairs for which a recommendation was made.
    scored_pairs = [
        (actual, predicted)
        for actual, predicted in zip(y_test, predicted_names)
        if predicted != no_match_label
    ]
    if not scored_pairs:
        print("No valid recommendations to evaluate.")
        return

    y_true = [actual for actual, _ in scored_pairs]
    y_hat = [predicted for _, predicted in scored_pairs]

    accuracy = accuracy_score(y_true, y_hat)
    precision = precision_score(y_true, y_hat, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_hat, average='weighted', zero_division=1)
    f1 = f1_score(y_true, y_hat, average='weighted', zero_division=1)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")




if __name__ == "__main__":
    # Read the scholarship table and fit the TF-IDF pipeline on it.
    df = load_data("combined60.csv")
    pipeline, tfidf_matrix, df = train_nlp_model(df)

    # Report the retrieval metrics.
    print("Evaluating model...")
    evaluate_model(df, pipeline, tfidf_matrix)

    # Show the three best matches for a sample free-text query.
    user_input = "I am a GATE-qualified female student from the obc category looking for a scholarship."
    print("\nTop Recommendations:")
    recommendations = recommend_scholarships(user_input, pipeline, tfidf_matrix, df, top_n=3)
    for idx, recommendation in enumerate(recommendations, start=1):
        print(
            f"{idx}. {recommendation['Scholarship Name']} (Provider: {recommendation['Provider']}, Amount: {recommendation['Amount']})",
            f"   Eligibility: {recommendation['Eligibility']}",
            sep="\n",
        )
     



Evaluating model...
Training set size: 39
Test set size: 21
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00

Top Recommendations:
1. AICTE PG (GPAT/GATE) Scholarship (Provider: AICTE, Amount: INR 12,400 per month)
   Eligibility: GATE/GPAT qualified candidates
   Similarity Score: 0.35
2. ONGC Scholarship Scheme for OBC Category Students 2025 (Provider: nan, Amount: nan)
   Eligibility: The ONGC Foundation has programmed a scholarship for OBC category students, named the 'ONGC Foundation Scholarship Scheme for OBC Category Students 2021'.  a step by this organization, which they think will provide financial assistance for an expensive education. The financial reward of this scholarship is impressive, and the number of students allowed to apply for this scheme is up to 500. Thousands of students will apply for this scholarship, but applications of only those will be accepted, who belong to the OBC community, and are pursuing any high fee course, like Engineering, Ph.D., MBBS,

In [12]:
import spacy
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw text into a string of spaCy lemmas.

    Stop words and punctuation are dropped; the text is lower-cased before
    it is handed to the module-level ``nlp`` model.
    """

    def __init__(self):
        # Reuse the spaCy model loaded at module level.
        self.nlp = nlp

    def preprocess_text(self, text):
        """Lower-case ``text`` and return its kept lemmas joined by spaces."""
        lowered = text.lower()
        lemmas = (
            token.lemma_
            for token in self.nlp(lowered)
            if not (token.is_stop or token.is_punct)
        )
        return " ".join(lemmas)

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X, y=None):
        """Preprocess every text in ``X`` and return the results as a list."""
        cleaned = []
        for text in X:
            cleaned.append(self.preprocess_text(text))
        return cleaned

def load_data(file_path):
    """Return the CSV at ``file_path`` as a DataFrame without the rows whose ``Eligibility`` is missing.

    Index labels of the remaining rows are left as read from the file.
    """
    return pd.read_csv(file_path).dropna(subset=["Eligibility"])

def extract_numeric_conditions(user_input):
    """Extract the numeric facts (family income, age, marks) stated in ``user_input``.

    Matching is case-insensitive and, for each fact, uses the first number
    that follows its keyword on the same line.

    Args:
        user_input: Free-text description written by the applicant.

    Returns:
        A dict containing only the facts that were found:
        ``"family_income"`` (float, the number written before "lakh"/"lac"),
        ``"age"`` (int) and ``"marks"`` (int).
    """
    conditions = {}

    # Income: allow a decimal part so "2.5 lakh" is read as 2.5; with a bare
    # ``\d+`` the lazy gap skips ahead and captures only the trailing "5".
    income_match = re.search(
        r"(?:income|salary).*?(\d+(?:\.\d+)?)\s*(?:lakh|lac)", user_input, re.IGNORECASE
    )
    if income_match:
        conditions["family_income"] = float(income_match.group(1))

    # Age: word boundaries keep "age" from matching inside other words such
    # as "percentage", which would report a marks figure as an age.
    age_match = re.search(r"\baged?\b.*?(\d+)", user_input, re.IGNORECASE)
    if age_match:
        conditions["age"] = int(age_match.group(1))

    # Marks: first integer after "marks" or "percentage".
    marks_match = re.search(r"(?:marks|percentage).*?(\d+)", user_input, re.IGNORECASE)
    if marks_match:
        conditions["marks"] = int(marks_match.group(1))

    return conditions

def _stated_limit(eligibility, pattern):
    """Return, per eligibility text, the first number captured by ``pattern`` (NaN when absent)."""
    # One case-insensitive pattern both selects the rows and reads the number,
    # so a text such as "Income ... 8 Lakh" can no longer pass the selection
    # and then fail the extraction.
    captured = eligibility.str.extract(pattern, flags=re.IGNORECASE, expand=False)
    return pd.to_numeric(captured, errors="coerce").astype(float)


def filter_scholarships(df, conditions):
    """Keep the scholarships whose stated numeric limits the applicant satisfies.

    For every key present in ``conditions`` the matching figure is read from
    each ``Eligibility`` text; rows that do not state that figure are
    dropped, as are rows whose limit the applicant does not meet:

    * ``"family_income"``: the number before "lakh"/"lac" following
      "income" is treated as the maximum allowed income; kept when it is at
      least the applicant's income.
    * ``"age"``: the first number following the word "age"/"aged"; kept when
      it is at least the applicant's age.
      NOTE(review): assumes the number is an upper age limit; confirm
      against the dataset.
    * ``"marks"``: the first number following the word "marks" is treated as
      the minimum required marks; kept when it does not exceed the
      applicant's marks.

    Args:
        df: Scholarship table with a text column ``Eligibility``.
        conditions: Dict as returned by ``extract_numeric_conditions``.

    Returns:
        A new DataFrame (``df`` is not modified) with the surviving rows in
        their original order and with their original index labels.
    """
    filtered_df = df.copy()

    # Comparisons against NaN are False, so rows without the figure drop out.
    if "family_income" in conditions:
        income_cap = _stated_limit(
            filtered_df["Eligibility"], r"income.*?(\d+(?:\.\d+)?)\s*(?:lakh|lac)"
        )
        filtered_df = filtered_df[income_cap >= conditions["family_income"]]

    if "age" in conditions:
        # Word boundaries stop "age" from matching inside "percentage".
        age_cap = _stated_limit(filtered_df["Eligibility"], r"\baged?\b.*?(\d+)")
        filtered_df = filtered_df[age_cap >= conditions["age"]]

    if "marks" in conditions:
        marks_floor = _stated_limit(filtered_df["Eligibility"], r"\bmarks\b.*?(\d+)")
        filtered_df = filtered_df[marks_floor <= conditions["marks"]]

    return filtered_df

def train_nlp_model(df):
    """Build and fit the preprocessing + TF-IDF pipeline on the eligibility texts.

    Returns:
        ``(pipeline, tfidf_matrix, df)``: the fitted two-step pipeline, the
        TF-IDF matrix produced from ``df["Eligibility"]``, and ``df`` as
        passed in.
    """
    steps = (
        TextPreprocessor(),
        TfidfVectorizer(max_df=0.85, max_features=5000, ngram_range=(1, 2)),
    )
    pipeline = make_pipeline(*steps)
    tfidf_matrix = pipeline.fit_transform(df["Eligibility"])
    return pipeline, tfidf_matrix, df

def recommend_scholarships(user_input, pipeline, tfidf_matrix, df, top_n=3, threshold=0.1):
    """Recommend scholarships for ``user_input`` after applying its numeric conditions.

    Numeric facts found in the query (see ``extract_numeric_conditions``)
    first narrow ``df`` through ``filter_scholarships``; the remaining
    eligibility texts are then ranked by cosine similarity to the query.

    Args:
        user_input: Free-text description written by the applicant.
        pipeline: Fitted pipeline exposing the ``'textpreprocessor'`` and
            ``'tfidfvectorizer'`` named steps.
        tfidf_matrix: TF-IDF matrix of ``df["Eligibility"]``. It is reused
            only when no row was filtered out and its row count matches
            ``df``; it is then assumed to be in the same row order as ``df``.
        df: Scholarship table with the columns ``Scholarship Name``,
            ``Provider``, ``Eligibility`` and ``Amount``.
        top_n: Maximum number of recommendations. A value of zero or less
            yields no candidates.
        threshold: Minimum cosine similarity a candidate must reach.

    Returns:
        A list of dicts (the four columns above plus ``Similarity Score``),
        ordered by decreasing similarity. When nothing qualifies, a
        one-element list holding a "No match found" placeholder.
    """
    no_match = [{"Scholarship Name": "No match found", "Provider": "N/A", "Eligibility": "N/A", "Amount": "N/A", "Similarity Score": 0}]

    conditions = extract_numeric_conditions(user_input)
    filtered_df = filter_scholarships(df, conditions)
    if len(filtered_df) == 0:
        return no_match

    preprocessor = pipeline.named_steps['textpreprocessor']
    vectorizer = pipeline.named_steps['tfidfvectorizer']
    user_tfidf = vectorizer.transform([preprocessor.preprocess_text(user_input)])

    matrix_rows = getattr(tfidf_matrix, "shape", (None,))[0]
    if matrix_rows == len(df) == len(filtered_df):
        # Nothing was filtered out: the precomputed matrix already describes
        # exactly these rows, so spaCy does not have to run again.
        candidate_matrix = tfidf_matrix
    else:
        # The vectoriser was fitted on *preprocessed* text, so the candidates
        # must go through the preprocessor too; feeding it raw text would
        # compare a lemmatised query with unlemmatised documents.
        candidate_matrix = vectorizer.transform(
            preprocessor.transform(filtered_df["Eligibility"])
        )

    similarity_scores = cosine_similarity(user_tfidf, candidate_matrix).flatten()

    # Rank best-first, then cut. Slicing the reversed ranking avoids the
    # ``[-top_n:]`` pitfall where ``top_n == 0`` selects *every* row, and
    # ``max`` keeps a negative ``top_n`` from slicing from the wrong end.
    best_first = similarity_scores.argsort()[::-1]
    top_indices = best_first[:max(top_n, 0)]

    recommendations = []
    for idx in top_indices:
        score = similarity_scores[idx]
        if score >= threshold:
            scholarship = filtered_df.iloc[idx][["Scholarship Name", "Provider", "Eligibility", "Amount"]].to_dict()
            scholarship["Similarity Score"] = score
            recommendations.append(scholarship)

    # Callers always read element 0, so never hand back an empty list.
    return recommendations if recommendations else no_match
if __name__ == "__main__":
    # Read the scholarship table and fit the TF-IDF pipeline on it.
    df = load_data("combined60.csv")
    pipeline, tfidf_matrix, df = train_nlp_model(df)

    # NOTE: evaluate_model is not defined in this cell; it has to exist
    # already (the earlier cell defines it) for this call to work.
    print("Evaluating model...")
    evaluate_model(df, pipeline, tfidf_matrix)

    # Show the three best matches for a sample free-text query.
    user_input = "I am a GATE-qualified female student from the obc category looking for a scholarship."
    print("\nTop Recommendations:")
    recommendations = recommend_scholarships(user_input, pipeline, tfidf_matrix, df, top_n=3)
    for idx, recommendation in enumerate(recommendations, start=1):
        headline = f"{idx}. {recommendation['Scholarship Name']} (Provider: {recommendation['Provider']}, Amount: {recommendation['Amount']})"
        print(headline)
        print(f"   Eligibility: {recommendation['Eligibility']}")




Evaluating model...
Training set size: 39
Test set size: 21
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00

Top Recommendations:
1. AICTE PG (GPAT/GATE) Scholarship (Provider: AICTE, Amount: INR 12,400 per month)
   Eligibility: GATE/GPAT qualified candidates
2. ONGC Scholarship Scheme for OBC Category Students 2025 (Provider: nan, Amount: nan)
   Eligibility: The ONGC Foundation has programmed a scholarship for OBC category students, named the 'ONGC Foundation Scholarship Scheme for OBC Category Students 2021'.  a step by this organization, which they think will provide financial assistance for an expensive education. The financial reward of this scholarship is impressive, and the number of students allowed to apply for this scheme is up to 500. Thousands of students will apply for this scholarship, but applications of only those will be accepted, who belong to the OBC community, and are pursuing any high fee course, like Engineering, Ph.D., MBBS, MBA, PGDM, etc. Used in B

  filtered_df = filtered_df[filtered_df["Eligibility"].str.contains(r"income.*?\d+\s*(lakh|lac)", case=False, regex=True)]
