In [1]:
import pandas as pd
import numpy as np
import random

# Seed BOTH random sources used below for reproducibility.
# NumPy's global generator drives every numeric column, but the NAAC grades
# come from the stdlib `random` module (random.choices). Seeding only NumPy
# leaves the grades -- and every column derived from them -- different on
# each fresh run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of institutions to simulate
n_institutions = 200
years = [2023, 2024]

# Loop-invariant inputs, built once rather than once per simulated year.
naac_numeric_map = {"A++":95, "A+":90, "A":85, "B++":75, "B+":70, "B":60, "C":50}
naac_grade_options = list(naac_numeric_map)  # dict order: "A++" ... "C"
required_docs = 12
institutions = [f"College_{i+1}" for i in range(n_institutions)]  # mock names

all_data = []

for year in years:
    # Existing features.
    # NOTE: the order of the np.random calls is kept exactly as before so the
    # NumPy-driven columns keep the values they had under seed 42.
    faculty_count = np.random.randint(50, 500, n_institutions)
    infra_score = np.random.randint(40, 100, n_institutions)
    naac_grades = random.choices(naac_grade_options, k=n_institutions)
    naac_score = [naac_numeric_map[g] for g in naac_grades]
    uploaded_docs = np.random.randint(5, required_docs+1, n_institutions)
    doc_sufficiency = (uploaded_docs / required_docs) * 100

    # Min-max scale faculty counts to 0-100. If every count is identical the
    # range is zero; fall back to zeros instead of dividing by zero (NaN).
    faculty_range = faculty_count.max() - faculty_count.min()
    if faculty_range == 0:
        faculty_norm = np.zeros(n_institutions)
    else:
        faculty_norm = 100 * (faculty_count - faculty_count.min()) / faculty_range

    # Weighted blend: 40% normalised faculty, 30% infrastructure, 30% NAAC.
    compliance_score = 0.4*faculty_norm + 0.3*infra_score + 0.3*np.array(naac_score)

    # New features to add
    quality_index = np.random.randint(50, 100, n_institutions)
    financial_health_score = np.random.randint(30, 90, n_institutions)
    research_productivity = np.random.uniform(0.5, 5.0, n_institutions).round(2)
    outcome_indicator = np.random.uniform(60, 100, n_institutions).round(2)
    trust_risk = np.random.uniform(1, 10, n_institutions).round(2)

    # Build DataFrame for the current year; scalar values (Year,
    # Required_Docs) are broadcast to every row by pandas.
    df_year = pd.DataFrame({
        "Year": year,
        "Institution": institutions,
        "Faculty_Count": faculty_count,
        "Infra_Score": infra_score,
        "NAAC_Grade": naac_grades,
        "NAAC_Score": naac_score,
        "Uploaded_Docs": uploaded_docs,
        "Required_Docs": required_docs,
        "Doc_Sufficiency_%": doc_sufficiency.round(2),
        "Compliance_Score": compliance_score.round(2),
        "Quality_Index": quality_index,
        "Financial_Health_Score": financial_health_score,
        "Research_Productivity": research_productivity,
        "Outcome_Indicator": outcome_indicator,
        "Trust_Risk": trust_risk
    })
    all_data.append(df_year)

# Concatenate data from all years into one frame with a fresh 0..N-1 index
df = pd.concat(all_data, ignore_index=True)

# Rich-render the combined frame. `display` is injected by IPython/Jupyter and
# is not defined in a plain Python interpreter. The saved output for this cell
# shows the frame truncated with an ellipsis row rather than all rows.
display(df)

Unnamed: 0,Year,Institution,Faculty_Count,Infra_Score,NAAC_Grade,NAAC_Score,Uploaded_Docs,Required_Docs,Doc_Sufficiency_%,Compliance_Score,Quality_Index,Financial_Health_Score,Research_Productivity,Outcome_Indicator,Trust_Risk
0,2023,College_1,152,83,B,60,10,12,83.33,51.94,54,56,3.38,99.91,2.66
1,2023,College_2,485,59,A,85,9,12,75.00,82.04,71,39,4.12,90.10,1.81
2,2023,College_3,398,69,A,85,7,12,58.33,77.25,78,55,4.56,88.28,7.94
3,2023,College_4,320,50,A,85,10,12,83.33,64.57,52,48,3.28,91.14,1.61
4,2023,College_5,156,97,A,85,12,12,100.00,64.00,61,68,4.91,65.73,8.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,2024,College_196,441,45,A,85,7,12,58.33,73.80,95,61,2.19,85.71,3.56
396,2024,College_197,316,56,C,50,5,12,41.67,55.39,85,57,2.92,67.83,5.17
397,2024,College_198,263,98,C,50,12,12,100.00,63.23,89,42,2.86,95.88,4.95
398,2024,College_199,228,64,B+,70,10,12,83.33,55.90,86,60,4.32,80.81,2.97


In [2]:
# Persist the simulated dataset as CSV. The relative path resolves against the
# kernel's working directory; index=False omits the row-index column.
df.to_csv('institution_data.csv', index=False)

In [3]:
# Persist the same dataset as an Excel workbook, again without the row index.
# NOTE(review): pandas needs an Excel writer engine installed for .xlsx
# (presumably openpyxl) -- confirm it is available in the environment.
df.to_excel('institution_data.xlsx', index=False)

In [5]:
# Show the column labels of the combined frame as a quick schema check.
display(df.columns)

Index(['Year', 'Institution', 'Faculty_Count', 'Infra_Score', 'NAAC_Grade',
       'NAAC_Score', 'Uploaded_Docs', 'Required_Docs', 'Doc_Sufficiency_%',
       'Compliance_Score', 'Quality_Index', 'Financial_Health_Score',
       'Research_Productivity', 'Outcome_Indicator', 'Trust_Risk'],
      dtype='object')