In [None]:
# ml_model/train.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
import os

def train_college_prediction_model():
    """
    Train a machine learning model for college prediction based on historical UPTAC data.
    """
    # Load the dataset
    try:
        df = pd.read_csv('/content/cutoff_uptac.csv')
        print(f"Dataset loaded successfully with {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, None, None

    # Data preparation
    # We need to prepare features and create a target variable

    # First, let's clean the data
    df = df.dropna(subset=['Closing Rank'])  # Remove rows with missing closing ranks

    # Create a binary target variable 'admitted'
    # For demonstration purposes, we'll create a synthetic target
    # In a real scenario, you'd use actual admission data

    # Let's create a threshold for each category and institute to simulate admissions
    df['admitted'] = 0  # Default is not admitted

    # Create unique combinations of Institute, Program, Stream, Category
    combinations = df.groupby(['Institute', 'Program', 'Stream', 'Category'])

    # For each combination, mark students as admitted if their rank is below closing rank
    # This is a simplified simulation
    student_ranks = np.random.randint(1000, 100000, size=len(df))
    df['student_rank'] = student_ranks

    # Mark as admitted if student rank is better than (less than) closing rank
    df['admitted'] = (df['student_rank'] <= df['Closing Rank']).astype(int)

    print(f"Admissions generated: {df['admitted'].sum()} admitted out of {len(df)}")

    # Prepare features
    X = df[['Closing Rank', 'Category', 'Quota', 'Institute', 'Program', 'Stream', 'Seat Gender']]
    y = df['admitted']

    # Handle categorical features
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    categorical_features = ['Category', 'Quota', 'Institute', 'Program', 'Stream', 'Seat Gender']
    encoded_cats = encoder.fit_transform(X[categorical_features])

    # Combine with numerical features
    X_encoded = np.column_stack([X['Closing Rank'].values, encoded_cats])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)

    print(f"Train accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Save model and encoder
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/college_predictor_model.joblib')
    joblib.dump(encoder, 'models/feature_encoder.joblib')

    # Get list of unique institutes
    institutes = df['Institute'].unique().tolist()

    return model, encoder, institutes


if __name__ == "__main__":
    # Train the model
    train_college_prediction_model()


# ml_model/predict.py
import joblib
import numpy as np
import pandas as pd
from typing import List, Dict, Any

def load_model_and_encoder():
    """Load the trained model and feature encoder"""
    model = joblib.load('models/college_predictor_model.joblib')
    encoder = joblib.load('models/feature_encoder.joblib')
    return model, encoder

def predict_admission_probability(
    student_rank: int,
    category: str,
    quota: str,
    gender: str,
    institutes: List[str],
    programs: List[str] = None
) -> List[Dict[str, Any]]:
    """
    Predict admission probability for institutions based on input parameters

    Args:
        student_rank: JEE Main rank of the student
        category: Reservation category (General, OBC, SC, ST, EWS, etc.)
        quota: Quota type (UP, OS, etc.)
        gender: Gender (Male, Female)
        institutes: List of institutes to predict for
        programs: List of programs to consider (optional)

    Returns:
        List of dictionaries with institute names, programs and admission probabilities
    """
    model, encoder = load_model_and_encoder()

    results = []

    # Load dataset to get program-stream combinations for each institute
    try:
        df = pd.read_csv('cutoff_uptac.csv')
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return []

    for institute in institutes:
        # Filter data for this institute
        institute_data = df[df['Institute'] == institute]

        if len(institute_data) == 0:
            continue

        # If programs specified, filter further
        if programs:
            institute_data = institute_data[institute_data['Program'].isin(programs)]

        # Get unique program-stream combinations
        program_streams = institute_data[['Program', 'Stream']].drop_duplicates()

        institute_results = []

        for _, row in program_streams.iterrows():
            program = row['Program']
            stream = row['Stream']

            # Get relevant closing ranks for this program-stream combination
            relevant_data = institute_data[
                (institute_data['Program'] == program) &
                (institute_data['Stream'] == stream)
            ]

            # Find closest matching category, quota and gender
            matching_data = relevant_data[
                (relevant_data['Category'] == category) &
                (relevant_data['Quota'] == quota) &
                (relevant_data['Seat Gender'] == gender)
            ]

            # If no exact match, use any data for this program
            if len(matching_data) == 0:
                matching_data = relevant_data

            if len(matching_data) == 0:
                continue

            # Get average closing rank
            avg_closing_rank = matching_data['Closing Rank'].mean()

            # Create sample for prediction
            sample = pd.DataFrame([{
                'Closing Rank': avg_closing_rank,
                'Category': category,
                'Quota': quota,
                'Institute': institute,
                'Program': program,
                'Stream': stream,
                'Gender': gender
            }])

            # Extract numerical feature
            rank_value = np.array([avg_closing_rank])

            # Encode categorical features
            try:
                categorical_features = ['Category', 'Quota', 'Institute', 'Program', 'Stream', 'Gender']
                encoded_cats = encoder.transform(sample[categorical_features])

                # Combine with numerical features
                sample_encoded = np.column_stack([rank_value, encoded_cats])

                # Get probability of admission
                admission_prob = model.predict_proba(sample_encoded)[0, 1] * 100  # Convert to percentage

                # Adjust probability based on relationship between student's rank and closing rank
                if student_rank <= avg_closing_rank:
                    # Student's rank is better than or equal to closing rank
                    admission_prob = min(100, admission_prob * 1.5)  # Increase probability
                else:
                    # Student's rank is worse than closing rank
                    ratio = student_rank / avg_closing_rank
                    if ratio > 1.5:  # If rank is 50% worse
                        admission_prob = max(0, admission_prob / 2)  # Reduce probability
                    else:
                        admission_prob = max(0, admission_prob * (2 - ratio))  # Scale down

                # Round to nearest integer
                admission_prob = round(admission_prob)

                # Calculate cutoff range
                cutoff_range = {
                    'min': int(avg_closing_rank * 0.9),
                    'max': int(avg_closing_rank * 1.1)
                }

                # Add to institute results
                institute_results.append({
                    'program': program,
                    'stream': stream,
                    'probability': admission_prob,
                    'cutoffRange': cutoff_range,
                    'averageClosingRank': int(avg_closing_rank)
                })

            except Exception as e:
                print(f"Error making prediction for {institute}-{program}-{stream}: {e}")

        # Sort institute results by probability
        institute_results.sort(key=lambda x: x['probability'], reverse=True)

        if institute_results:
            # Calculate overall institute probability (average of top 3 programs)
            top_programs = institute_results[:min(3, len(institute_results))]
            overall_prob = sum(p['probability'] for p in top_programs) / len(top_programs)

            # Add institute to results
            results.append({
                'name': institute,
                'overallProbability': round(overall_prob),
                'programs': institute_results
            })

    # Sort by overall probability (highest first)
    results.sort(key=lambda x: x['overallProbability'], reverse=True)

    return results

def get_dataset_stats():
    """Get statistics about the dataset"""
    try:
        df = pd.read_csv('cutoff_uptac.csv')

        stats = {
            'total_rows': len(df),
            'institutes': df['Institute'].nunique(),
            'programs': df['Program'].nunique(),
            'streams': df['Stream'].nunique(),
            'categories': df['Category'].unique().tolist(),
            'quotas': df['Quota'].unique().tolist(),
            'rounds': df['Round'].unique().tolist(),
            'rank_range': {
                'min': df['Closing Rank'].min(),
                'max': df['Closing Rank'].max(),
                'avg': df['Closing Rank'].mean()
            }
        }

        return stats
    except Exception as e:
        print(f"Error getting dataset stats: {e}")
        return None

# Example usage
if __name__ == "__main__":
    # Example student details
    student_rank = 25000
    category = "General"
    quota = "UP"
    gender = "Male"

    # Get dataset stats
    stats = get_dataset_stats()
    if stats:
        print(f"Dataset contains {stats['institutes']} institutes and {stats['programs']} programs")

        # Get list of all institutes
        df = pd.read_csv('cutoff_uptac.csv')
        all_institutes = df['Institute'].unique().tolist()

        # Get predictions for top 5 institutes
        top_institutes = all_institutes[:5]
        predictions = predict_admission_probability(
            student_rank=student_rank,
            category=category,
            quota=quota,
            gender=gender,
            institutes=top_institutes
        )

        print(f"\nPredicted admission chances for a {category} candidate with rank {student_rank}:")
        for institute in predictions:
            print(f"\n{institute['name']}: {institute['overallProbability']}% chance")

            for program in institute['programs'][:3]:  # Show top 3 programs
                print(f"  - {program['program']} - {program['stream']}: {program['probability']}% chance")

Dataset loaded successfully with 9299 rows.
Admissions generated: 9152 admitted out of 9299
Train accuracy: 1.0000
Test accuracy: 0.9817
Dataset contains 190 institutes and 88 programs
Error making prediction for KIET GROUP OF INSTITUTIONS(KRISHNA INSTT. OF ENGG. & TECHNOLOGY),GHAZIABAD-Computer Science And Engineering(Artificial Intelligence & Machine Learning)-B.Tech: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Gender
Feature names seen at fit time, yet now missing:
- Seat Gender

Error making prediction for KIET GROUP OF INSTITUTIONS(KRISHNA INSTT. OF ENGG. & TECHNOLOGY),GHAZIABAD-Electrical & Computer Engg.-B.Tech: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Gender
Feature names seen at fit time, yet now missing:
- Seat Gender

Error making prediction for KIET GROUP OF INSTITUTIONS(KRISHNA INSTT. OF ENGG. & TECHNOLOGY),GHAZIABAD-Computer Science and Engineering-B.Tech

In [None]:
pip install joblib



In [2]:
# ml_model/train.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
import os

def train_college_prediction_model():
    """
    Train a machine learning model for college prediction based on historical UPTAC data.
    """
    # Load the dataset
    try:
        df = pd.read_csv('/content/cutoff_uptac.csv')
        print(f"Dataset loaded successfully with {len(df)} rows.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, None, None

    # Data preparation
    # We need to prepare features and create a target variable

    # First, let's clean the data
    df = df.dropna(subset=['Closing Rank'])  # Remove rows with missing closing ranks

    # Create a binary target variable 'admitted'
    # For demonstration purposes, we'll create a synthetic target
    # In a real scenario, you'd use actual admission data

    # Let's create a threshold for each category and institute to simulate admissions
    df['admitted'] = 0  # Default is not admitted

    # Create unique combinations of Institute, Program, Stream, Category
    combinations = df.groupby(['Institute', 'Program', 'Stream', 'Category'])

    # For each combination, mark students as admitted if their rank is below closing rank
    # This is a simplified simulation
    student_ranks = np.random.randint(1000, 100000, size=len(df))
    df['student_rank'] = student_ranks

    # Mark as admitted if student rank is better than (less than) closing rank
    df['admitted'] = (df['student_rank'] <= df['Closing Rank']).astype(int)

    print(f"Admissions generated: {df['admitted'].sum()} admitted out of {len(df)}")

    # Prepare features
    X = df[['Closing Rank', 'Category', 'Quota', 'Institute', 'Program', 'Stream', 'Seat Gender']]
    y = df['admitted']

    # Handle categorical features
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    categorical_features = ['Category', 'Quota', 'Institute', 'Program', 'Stream', 'Seat Gender']
    encoded_cats = encoder.fit_transform(X[categorical_features])

    # Combine with numerical features
    X_encoded = np.column_stack([X['Closing Rank'].values, encoded_cats])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)

    print(f"Train accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Save model and encoder
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/college_predictor_model.joblib')
    joblib.dump(encoder, 'models/feature_encoder.joblib')

    # Get list of unique institutes
    institutes = df['Institute'].unique().tolist()

    return model, encoder, institutes


if __name__ == "__main__":
    # Train the model
    train_college_prediction_model()


# ml_model/predict.py
import joblib
import numpy as np
import pandas as pd
from typing import List, Dict, Any

def load_model_and_encoder():
    """Load the trained model and feature encoder"""
    model = joblib.load('models/college_predictor_model.joblib')
    encoder = joblib.load('models/feature_encoder.joblib')
    return model, encoder

def predict_admission_probability(
    student_rank: int,
    category: str,
    quota: str,
    seat_gender: str,
    institutes: List[str],
    programs: List[str] = None
) -> List[Dict[str, Any]]:
    """
    Predict admission probability for institutions based on input parameters

    Args:
        student_rank: JEE Main rank of the student
        category: Reservation category (General, OBC, SC, ST, EWS, etc.)
        quota: Quota type (UP, OS, etc.)
        seat_gender: Seat Gender (Male, Female, Gender-Neutral)
        institutes: List of institutes to predict for
        programs: List of programs to consider (optional)

    Returns:
        List of dictionaries with institute names, programs and admission probabilities
    """
    model, encoder = load_model_and_encoder()

    results = []

    # Load dataset to get program-stream combinations for each institute
    try:
        df = pd.read_csv('cutoff_uptac.csv')
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return []

    for institute in institutes:
        # Filter data for this institute
        institute_data = df[df['Institute'] == institute]

        if len(institute_data) == 0:
            continue

        # If programs specified, filter further
        if programs:
            institute_data = institute_data[institute_data['Program'].isin(programs)]

        # Get unique program-stream combinations
        program_streams = institute_data[['Program', 'Stream']].drop_duplicates()

        institute_results = []

        for _, row in program_streams.iterrows():
            program = row['Program']
            stream = row['Stream']

            # Get relevant closing ranks for this program-stream combination
            relevant_data = institute_data[
                (institute_data['Program'] == program) &
                (institute_data['Stream'] == stream)
            ]

            # Find closest matching category, quota and gender
            matching_data = relevant_data[
                (relevant_data['Category'] == category) &
                (relevant_data['Quota'] == quota) &
                (relevant_data['Seat Gender'] == seat_gender)
            ]

            # If no exact match, use any data for this program
            if len(matching_data) == 0:
                matching_data = relevant_data

            if len(matching_data) == 0:
                continue

            # Get average closing rank
            avg_closing_rank = matching_data['Closing Rank'].mean()

            # Create sample for prediction
            sample = pd.DataFrame([{
                'Closing Rank': avg_closing_rank,
                'Category': category,
                'Quota': quota,
                'Institute': institute,
                'Program': program,
                'Stream': stream,
                'Seat Gender': seat_gender
            }])

            # Extract numerical feature
            rank_value = np.array([avg_closing_rank])

            # Encode categorical features
            try:
                categorical_features = ['Category', 'Quota', 'Institute', 'Program', 'Stream', 'Seat Gender']
                encoded_cats = encoder.transform(sample[categorical_features])

                # Combine with numerical features
                sample_encoded = np.column_stack([rank_value, encoded_cats])

                # Get probability of admission
                admission_prob = model.predict_proba(sample_encoded)[0, 1] * 100  # Convert to percentage

                # Adjust probability based on relationship between student's rank and closing rank
                if student_rank <= avg_closing_rank:
                    # Student's rank is better than or equal to closing rank
                    admission_prob = min(100, admission_prob * 1.5)  # Increase probability
                else:
                    # Student's rank is worse than closing rank
                    ratio = student_rank / avg_closing_rank
                    if ratio > 1.5:  # If rank is 50% worse
                        admission_prob = max(0, admission_prob / 2)  # Reduce probability
                    else:
                        admission_prob = max(0, admission_prob * (2 - ratio))  # Scale down

                # Round to nearest integer
                admission_prob = round(admission_prob)

                # Calculate cutoff range
                cutoff_range = {
                    'min': int(avg_closing_rank * 0.9),
                    'max': int(avg_closing_rank * 1.1)
                }

                # Add to institute results
                institute_results.append({
                    'program': program,
                    'stream': stream,
                    'probability': admission_prob,
                    'cutoffRange': cutoff_range,
                    'averageClosingRank': int(avg_closing_rank)
                })

            except Exception as e:
                print(f"Error making prediction for {institute}-{program}-{stream}: {e}")

        # Sort institute results by probability
        institute_results.sort(key=lambda x: x['probability'], reverse=True)

        if institute_results:
            # Calculate overall institute probability (average of top 3 programs)
            top_programs = institute_results[:min(3, len(institute_results))]
            overall_prob = sum(p['probability'] for p in top_programs) / len(top_programs)

            # Add institute to results
            results.append({
                'name': institute,
                'overallProbability': round(overall_prob),
                'programs': institute_results
            })

    # Sort by overall probability (highest first)
    results.sort(key=lambda x: x['overallProbability'], reverse=True)

    return results

def get_dataset_stats():
    """Get statistics about the dataset"""
    try:
        df = pd.read_csv('cutoff_uptac.csv')

        stats = {
            'total_rows': len(df),
            'institutes': df['Institute'].nunique(),
            'programs': df['Program'].nunique(),
            'streams': df['Stream'].nunique(),
            'categories': df['Category'].unique().tolist(),
            'quotas': df['Quota'].unique().tolist(),
            'rounds': df['Round'].unique().tolist(),
            'rank_range': {
                'min': df['Closing Rank'].min(),
                'max': df['Closing Rank'].max(),
                'avg': df['Closing Rank'].mean()
            }
        }

        return stats
    except Exception as e:
        print(f"Error getting dataset stats: {e}")
        return None

# Example usage
if __name__ == "__main__":
    # Example student details
    student_rank = int(input("Enter your JEE Main rank: "))
    category =input("Enter your reservation category: ")
    quota = input("Enter your home state: ")
    seat_gender = input("Enter your gender: ")

    # Get dataset stats
    stats = get_dataset_stats()
    if stats:
        print(f"Dataset contains {stats['institutes']} institutes and {stats['programs']} programs")

        # Get list of all institutes
        df = pd.read_csv('cutoff_uptac.csv')
        all_institutes = df['Institute'].unique().tolist()

        # Get predictions for top 5 institutes
        top_institutes = all_institutes[:5]
        predictions = predict_admission_probability(
            student_rank=student_rank,
            category=category,
            quota=quota,
            seat_gender=seat_gender,
            institutes=top_institutes
        )

        print(f"\nPredicted admission chances for a {category} candidate with rank {student_rank}:")
        for institute in predictions:
            print(f"\n{institute['name']}: {institute['overallProbability']}% chance")

            for program in institute['programs'][:3]:  # Show top 3 programs
                print(f"  - {program['program']} - {program['stream']}: {program['probability']}% chance")

Dataset loaded successfully with 9299 rows.
Admissions generated: 9147 admitted out of 9299
Train accuracy: 1.0000
Test accuracy: 0.9796
Enter your JEE Main rank: 123700
Enter your reservation category: gen
Enter your home state: all india
Enter your gender: male
Dataset contains 190 institutes and 88 programs

Predicted admission chances for a gen candidate with rank 123700:

KIET GROUP OF INSTITUTIONS(KRISHNA INSTT. OF ENGG. & TECHNOLOGY),GHAZIABAD: 100% chance
  - Computer Science And Engineering(Artificial Intelligence & Machine Learning) - B.Tech: 100% chance
  - Electrical & Computer Engg. - B.Tech: 100% chance
  - Computer Science and Engineering - B.Tech: 100% chance

RAJKIYA ENGINEERING COLLEGE, PARTAPGARH: 100% chance
  - Civil Engineering - B.Tech: 100% chance
  - Computer Science and Engineering - B.Tech: 100% chance
  - Mechanical Engineering - B.Tech: 100% chance

INSTITUTE OF ENGG. & RURAL TECHNOLOGY,ALLAHABAD: 100% chance
  - Computer Science and Engineering - B.Tech: 1