In [91]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.neighbors import NearestNeighbors


In [None]:
df = pd.read_csv('data/colleges_data.csv')

In [93]:
cols_model = [
    'latest.admissions.admission_rate.overall',
    'latest.completion.completion_cohort_4yr_150nt',
    'latest.admissions.sat_scores.average.overall',
    'latest.student.demographics.race_ethnicity.white',
    'latest.student.demographics.race_ethnicity.black',
    'latest.student.demographics.race_ethnicity.hispanic',
    'latest.student.demographics.race_ethnicity.asian',
    'latest.student.demographics.median_family_income',
    'latest.student.share_firstgeneration_parents.middleschool',
    'latest.student.share_firstgeneration_parents.highschool',
    'latest.student.share_firstgeneration_parents.somecollege',
    'school.instructional_expenditure_per_fte',
    'latest.earnings.6_yrs_after_entry.median',
    'id',  # identifiant
    'school.state',
    'school.region_id',
    'school.locale',
    'school.name'
]

In [94]:
df = df[cols_model].dropna().reset_index(drop=True)

In [95]:
df.rename(columns={
    'latest.admissions.admission_rate.overall': 'Admission rate',
    'latest.completion.completion_cohort_4yr_150nt': 'Completion rate',
    'latest.admissions.sat_scores.average.overall': 'Average SAT score',
    'latest.student.demographics.race_ethnicity.white': 'Race_white',
    'latest.student.demographics.race_ethnicity.black': 'Race_black',
    'latest.student.demographics.race_ethnicity.hispanic': 'Race_hispanic',
    'latest.student.demographics.race_ethnicity.asian': 'Race_asian',
    'latest.student.demographics.median_family_income': 'Median family income',
    'latest.student.share_firstgeneration_parents.middleschool': 'Parents from middleschool',
    'latest.student.share_firstgeneration_parents.highschool': 'Parents from highschool',
    'latest.student.share_firstgeneration_parents.somecollege': 'Parents from college',
    'school.instructional_expenditure_per_fte': 'Expenditures per student',
    'latest.earnings.6_yrs_after_entry.median': 'Median earnings',
    'id': 'ID',
    'school.state': 'State',
    'school.region_id': 'Region',
    'school.locale': 'Locale',
    'school.name': 'School Name'
}, inplace=True)

In [None]:
X = df.drop(columns=['Median earnings', 'ID', 'School Name'])
y = df['Median earnings']

In [97]:
X_encoded = pd.get_dummies(X, columns=['State', 'Region', 'Locale'], drop_first=True)


In [98]:
# Entraînement du modèle
final_model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBRegressor(random_state=42, eta=0.1, max_depth=12,
                           min_child_weight=2, alpha=0.3, subsample=0.5))
])

final_model.fit(X_encoded.values, y.values)


0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [99]:
# Préparation du modèle KNN sur les mêmes données encodées pour recommandation
model_knn = NearestNeighbors(n_neighbors=10, algorithm='brute')
model_knn.fit(X_encoded.values)


0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [100]:
# --- Exemple profil étudiant ---

sat = 1300
race = 1  # 1:white, 2:black, 3:hispanic, 4:asian
fam_income = 90000
parents_edu = 2  # 1:middle, 2:highschool, 3:college


In [101]:
# Construire le vecteur étudiant (mêmes colonnes que X_encoded)
student_base = {
    'Admission rate': 0,
    'Completion rate': 0,
    'Average SAT score': sat,
    'Race_white': 0,
    'Race_black': 0,
    'Race_hispanic': 0,
    'Race_asian': 0,
    'Median family income': fam_income,
    'Parents from middleschool': 0,
    'Parents from highschool': 0,
    'Parents from college': 0,
    'Expenditures per student': X['Expenditures per student'].mean(),
}


In [102]:
# Activer la race et le niveau d’éducation des parents
race_map = {1: 'Race_white', 2: 'Race_black', 3: 'Race_hispanic', 4: 'Race_asian'}
parents_edu_map = {1: 'Parents from middleschool', 2: 'Parents from highschool', 3: 'Parents from college'}

student_base[race_map[race]] = 1
student_base[parents_edu_map[parents_edu]] = 1


In [103]:
# Pour toutes les colonnes catégorielles encodées (ex: State_AL, Region_2, Locale_3...), on met 0
for col in X_encoded.columns:
    if col not in student_base:
        student_base[col] = 0

student_vector = pd.DataFrame([student_base])[X_encoded.columns]


In [104]:
# Trouver les écoles les plus proches
distances, indices = model_knn.kneighbors(student_vector.values)
closest_indices = indices[0]


In [105]:
# Extraire les noms et IDs des écoles recommandées
recommended_names = df.iloc[closest_indices]['School Name'].values
recommended_ids = df.iloc[closest_indices]['ID'].values


In [106]:
# Calcul du revenu moyen réel des écoles recommandées
mean_earnings_recommended = df.iloc[closest_indices]['Median earnings'].mean()


In [107]:
# Prédiction du revenu pour le profil étudiant
predicted_earning = final_model.predict(student_vector.values)[0]


In [108]:

print("\nÉcoles recommandées :")
for i, idx in enumerate(closest_indices, 1):
    name = df.loc[idx, 'School Name']
    admission_rate = df.loc[idx, 'Admission rate']
    admission_pct = round(admission_rate * 100, 2)
    print(f"{i}. {name} (Taux d'admission : {admission_pct}%)")

print(f"\nRevenu moyen réel des écoles recommandées : ${round(mean_earnings_recommended, 2)}")
print(f"Revenu prédit pour cet étudiant : ${round(predicted_earning, 2)}")

pct_diff = abs(mean_earnings_recommended - predicted_earning) / mean_earnings_recommended * 100
print(f"Différence en pourcentage entre prédit et réel : {round(pct_diff, 2)} %")



Écoles recommandées :
1. University of New Hampshire at Manchester (Taux d'admission : 87.33%)
2. Augustana College (Taux d'admission : 67.65%)
3. Duquesne University (Taux d'admission : 78.85%)
4. Hope College (Taux d'admission : 75.85%)
5. Ithaca College (Taux d'admission : 69.94%)
6. Clarkson University (Taux d'admission : 77.16%)
7. University of New Hampshire-Main Campus (Taux d'admission : 86.87%)
8. College of Saint Benedict (Taux d'admission : 91.36%)
9. University of Vermont (Taux d'admission : 60.02%)
10. Emmanuel College (Taux d'admission : 80.6%)

Revenu moyen réel des écoles recommandées : $54573.1
Revenu prédit pour cet étudiant : $58213.71875
Différence en pourcentage entre prédit et réel : 6.67 %
