In [11]:
import pandas as pd

# Load your CSV
df = pd.read_csv("C:\\Code\\python\\sih\\back-end\\data\\pmis_sample.csv")

# Rename columns manually
df = df.rename(columns={
    "EDUCATION": "Education",
    "SKILL_1": "Skill no. 1",
    "SKILL_2": "Skill no. 2",
    "SKILL_3": "Skill no. 3",
    "INTEREST": "Interest",
    "LOCATION": "Location",
    "INTERNSHIP": "Internship"
})

# Save back to the same file (overwrite)
df.to_csv("C:\\Code\\python\\sih\\back-end\\data\\pmis_sample.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Set seed for reproducibility
np.random.seed(42)

# ============================
# 1. LOAD DATA
# ============================
df = pd.read_csv("C:\\Code\\python\\sih\\back-end\\data\\pmis_sample.csv")

# Features & Target
X = df[['Education', 'Skill no. 1', 'Skill no. 2', 'Skill no. 3', 'Interest', 'Location']]
y = df['Internship']

# ============================
# 2. PIPELINE: Preprocessing + Model
# ============================
# Define categorical columns
categorical_cols = X.columns.tolist()

# Preprocessing: Impute missing values + OneHotEncode
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore" , drop = "first"))
        ]), categorical_cols)
    ]
)

# Full pipeline = preprocessing + classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# ============================
# 3. GRID SEARCH
# ============================
param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 5],
    "classifier__min_samples_leaf": [1, 2],
    "classifier__max_features": ["sqrt", "log2"]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)
best_model = grid_search.best_estimator_

# ============================
# 4. SAVE MODEL
# ============================
print("✅ Best Hyperparameters:")
print(grid_search.best_params_)

joblib.dump(best_model, "C:\\Code\\python\\sih\\back-end\\model\\internship_model.pkl")
df.to_csv("C:\\Code\\python\\sih\\back-end\\data\\training_data.csv", index=False)

print("✅ Model (pipeline) trained and saved as 'internship_model.pkl'")
print("✅ Cleaned training data saved as 'training_data.csv'")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
✅ Best Hyperparameters:
{'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
✅ Model (pipeline) trained and saved as 'internship_model.pkl'
✅ Cleaned training data saved as 'training_data.csv'


In [15]:
import pandas as pd
import joblib
import numpy as np

# Load trained pipeline model
model = joblib.load('C:\\Code\\python\\sih\\back-end\\model\\internship_model.pkl')

# Load raw training dataset
training_data = pd.read_csv('C:\\Code\\python\\sih\\back-end\\data\\pmis_sample.csv')

def predict_internship(new_data_dict):
    """
    Predicts internships for a user and returns JSON with only internship names.
    """
    new_data = pd.DataFrame([new_data_dict])

    # RULE-BASED condition (match on Education, Skills, Location)
    condition = (
        (training_data['Education'] == new_data.iloc[0]['Education']) &
        (training_data['Skill no. 1'] == new_data.iloc[0]['Skill no. 1']) &
        (training_data['Skill no. 2'] == new_data.iloc[0]['Skill no. 2']) &
        (training_data['Skill no. 3'] == new_data.iloc[0]['Skill no. 3']) &
        (training_data['Location'] == new_data.iloc[0]['Location'])
    )
    exact_matches = training_data[condition]

    internships = []
    if not exact_matches.empty:
        internships = exact_matches['Internship'].drop_duplicates().tolist()

    # If less than 4 → use ML predictions to fill remaining
    if len(internships) < 4:
        probs = model.predict_proba(new_data)[0]
        top_idx = np.argsort(probs)[-4:][::-1]   # top 4
        ml_preds = [model.classes_[i] for i in top_idx]

        for pred in ml_preds:
            if pred not in internships:
                internships.append(pred)
            if len(internships) == 4:
                break

    # Convert list into JSON objects (for frontend cards)
    result = {"Top_4_Internships": [{"Internship": name} for name in internships]}
    return result


# =============================
# HARDCODED TEST USER
# =============================
test_user = {
    'Education': 'M.Tech',
    'Skill no. 1': 'Remote Sensing',
    'Skill no. 2': 'Biostatistics',
    'Skill no. 3': 'Power BI',
    'Interest': 'Research',
    'Location': 'Uttar Pradesh'
}

# Run prediction
print(predict_internship(test_user))



{'Top_4_Internships': [{'Internship': 'Automation QA Intern'}, {'Internship': 'Agri Data Intern'}, {'Internship': 'Solar PV Intern'}, {'Internship': 'Learning Platform Intern'}]}
