In [1]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 3.3 MB/s eta 0:00:03
   ------- -------------------------------- 1.6/8.7 MB 3.8 MB/s eta 0:00:02
   -------- ------------------------------- 1.8/8.7 MB 3.4 MB/s eta 0:00:03
   ---------- ----------------------------- 2.4/8.7 MB 2.7 MB/s eta 0:00:03
   ------------- -------------------------- 2.9/8.7 MB 2.6 MB/s eta 0:00:03
   -------------- ------------------------- 


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import json
import warnings
warnings.filterwarnings('ignore')

In [3]:
class AdmissionsDataPreprocessor:
    """
    Data preprocessing pipeline for Indian student admissions data
    """
    
    def __init__(self, json_file_path):
        """Initialize with JSON file path"""
        self.json_file_path = json_file_path
        self.df = None
        self.df_processed = None
        self.scalers = {}
        self.encoders = {}
        self.load_data()
    
    def load_data(self):
        """Load JSON data into pandas DataFrame"""
        print("Loading data...")
        with open(self.json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.df = pd.DataFrame(data)
        self.df_processed = self.df.copy()
        print(f"Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns")

In [6]:
def handle_missing_values(self):
        """Handle missing values in the dataset"""
        print("\n" + "="*80)
        print("STEP 1: HANDLING MISSING VALUES")
        print("="*80)
        
        # GRE scores - use median imputation for missing values
        gre_cols = ['gre_total', 'gre_verbal', 'gre_quant', 'gre_awa']
        for col in gre_cols:
            if col in self.df_processed.columns:
                missing_count = self.df_processed[col].isna().sum()
                if missing_count > 0:
                    median_val = self.df_processed[col].median()
                    self.df_processed[col].fillna(median_val, inplace=True)
                    print(f"{col}: Filled {missing_count} missing values with median ({median_val:.2f})")
        
        # Scholarship amount - fill with 0 for missing
        if 'scholarship_amount' in self.df_processed.columns:
            self.df_processed['scholarship_amount'].fillna(0, inplace=True)
            print("scholarship_amount: Filled missing values with 0")
        
        # Rankings - fill with a high number (indicating no rank) or median
        rank_cols = ['cs_rank', 'eng_rank', 'mba_rank', 'gen_rank']
        for col in rank_cols:
            if col in self.df_processed.columns:
                missing_count = self.df_processed[col].isna().sum()
                if missing_count > 0:
                    # Fill with 9999 to indicate no rank
                    self.df_processed[col].fillna(9999, inplace=True)
                    print(f"{col}: Filled {missing_count} missing values with 9999 (no rank)")
        
        print("\nMissing value handling complete!")
        return self.df_processed

AdmissionsDataPreprocessor.handle_missing_values = handle_missing_values

In [7]:
def feature_engineering(self):
        """Create new features from existing ones"""
        print("\n" + "="*80)
        print("STEP 2: FEATURE ENGINEERING")
        print("="*80)
        
        # 1. GPA strength category
        self.df_processed['gpa_category'] = pd.cut(
            self.df_processed['gpa_normalized'],
            bins=[0, 6.0, 7.5, 8.5, 10.0],
            labels=['Low', 'Medium', 'High', 'Very High']
        )
        print("✓ Created gpa_category (Low, Medium, High, Very High)")
        
        # 2. English proficiency level
        self.df_processed['english_proficiency'] = pd.cut(
            self.df_processed['english_test_normalized'],
            bins=[0, 90, 100, 110, 120],
            labels=['Basic', 'Good', 'Very Good', 'Excellent']
        )
        print("✓ Created english_proficiency (Basic, Good, Very Good, Excellent)")
        
        # 3. Total experience (work + internship)
        self.df_processed['total_experience'] = (
            self.df_processed['work_experience'] + 
            self.df_processed['internship_experience']
        )
        print("✓ Created total_experience (work + internship months)")
        
        # 4. Experience category
        self.df_processed['experience_category'] = pd.cut(
            self.df_processed['total_experience'],
            bins=[-1, 0, 12, 36, 1000],
            labels=['None', 'Junior', 'Mid', 'Senior']
        )
        print("✓ Created experience_category (None, Junior, Mid, Senior)")
        
        # 5. Has publications flag
        self.df_processed['has_publications'] = (self.df_processed['publications'] > 0).astype(int)
        print("✓ Created has_publications (0 or 1)")
        
        # 6. GRE strength (if available)
        if 'gre_total' in self.df_processed.columns:
            self.df_processed['gre_strength'] = pd.cut(
                self.df_processed['gre_total'],
                bins=[0, 300, 310, 320, 340],
                labels=['Low', 'Medium', 'High', 'Very High']
            )
            print("✓ Created gre_strength (Low, Medium, High, Very High)")
        
        # 7. Application timing (early, regular, late based on historical patterns)
        self.df_processed['is_fall_term'] = (
            self.df_processed['application_term'] == 'fall'
        ).astype(int)
        print("✓ Created is_fall_term (1 for fall, 0 for others)")
        
        # 8. University prestige proxy (based on ranking if available)
        if 'gen_rank' in self.df_processed.columns:
            self.df_processed['university_tier'] = pd.cut(
                self.df_processed['gen_rank'],
                bins=[0, 50, 100, 200, 10000],
                labels=['Top_50', 'Top_100', 'Top_200', 'Others']
            )
            print("✓ Created university_tier (Top_50, Top_100, Top_200, Others)")
        
        # 9. Academic alignment score
        self.df_processed['academic_alignment_score'] = self.df_processed['major_alignment']
        print("✓ Retained academic_alignment_score")
        
        # 10. Composite academic score
        gpa_norm = self.df_processed['gpa_normalized'] / 10.0
        english_norm = self.df_processed['english_test_normalized'] / 120.0
        
        if 'gre_total' in self.df_processed.columns:
            gre_norm = self.df_processed['gre_total'] / 340.0
            self.df_processed['composite_academic_score'] = (
                0.4 * gpa_norm + 0.3 * english_norm + 0.3 * gre_norm
            )
        else:
            self.df_processed['composite_academic_score'] = (
                0.6 * gpa_norm + 0.4 * english_norm
            )
        print("✓ Created composite_academic_score (weighted academic performance)")
        
        print("\nFeature engineering complete!")
        print(f"New feature count: {self.df_processed.shape[1] - self.df.shape[1]}")
        
        return self.df_processed

AdmissionsDataPreprocessor.feature_engineering = feature_engineering

In [8]:
def encode_categorical_variables(self):
        """Encode categorical variables"""
        print("\n" + "="*80)
        print("STEP 3: ENCODING CATEGORICAL VARIABLES")
        print("="*80)
        
        # Label encode binary and ordinal variables
        binary_cols = ['student_type', 'has_scholarship', 'application_term']
        
        for col in binary_cols:
            if col in self.df_processed.columns:
                le = LabelEncoder()
                self.df_processed[f'{col}_encoded'] = le.fit_transform(
                    self.df_processed[col].astype(str)
                )
                self.encoders[col] = le
                print(f"✓ Label encoded {col}")
        
        # One-hot encode high cardinality categorical variables
        categorical_cols = [
            'credential_standardized',
            'categorical_course_name',
            'ug_major_bucket'
        ]
        
        for col in categorical_cols:
            if col in self.df_processed.columns:
                # Get top 10 categories and group others
                top_categories = self.df_processed[col].value_counts().head(10).index
                self.df_processed[f'{col}_grouped'] = self.df_processed[col].apply(
                    lambda x: x if x in top_categories else 'Other'
                )
                
                # One-hot encode
                dummies = pd.get_dummies(
                    self.df_processed[f'{col}_grouped'],
                    prefix=col
                )
                self.df_processed = pd.concat([self.df_processed, dummies], axis=1)
                print(f"✓ One-hot encoded {col} (top 10 + Other)")
        
        print("\nCategorical encoding complete!")
        return self.df_processed

AdmissionsDataPreprocessor.encode_categorical_variables = encode_categorical_variables

In [9]:
def handle_outliers(self, method='iqr'):
        """Detect and handle outliers"""
        print("\n" + "="*80)
        print("STEP 4: OUTLIER DETECTION AND HANDLING")
        print("="*80)
        
        numerical_cols = [
            'gpa_normalized', 'english_test_normalized', 'gre_total',
            'work_experience', 'relevant_work_experience',
            'internship_experience', 'publications'
        ]
        
        outliers_summary = {}
        
        for col in numerical_cols:
            if col in self.df_processed.columns:
                Q1 = self.df_processed[col].quantile(0.25)
                Q3 = self.df_processed[col].quantile(0.75)
                IQR = Q3 - Q1
                
                lower_bound = Q1 - 3 * IQR
                upper_bound = Q3 + 3 * IQR
                
                outliers = (
                    (self.df_processed[col] < lower_bound) | 
                    (self.df_processed[col] > upper_bound)
                )
                outlier_count = outliers.sum()
                
                if outlier_count > 0:
                    # Cap outliers instead of removing
                    self.df_processed[col] = self.df_processed[col].clip(
                        lower=lower_bound,
                        upper=upper_bound
                    )
                    outliers_summary[col] = outlier_count
                    print(f"✓ {col}: Capped {outlier_count} outliers")
        
        if not outliers_summary:
            print("No significant outliers detected!")
        
        return outliers_summary

AdmissionsDataPreprocessor.handle_outliers = handle_outliers

In [10]:
def normalize_features(self):
        """Normalize numerical features"""
        print("\n" + "="*80)
        print("STEP 5: FEATURE NORMALIZATION")
        print("="*80)
        
        numerical_cols = [
            'gpa_normalized', 'english_test_normalized', 'gre_total',
            'gre_verbal', 'gre_quant', 'gre_awa',
            'work_experience', 'relevant_work_experience',
            'internship_experience', 'total_experience',
            'publications', 'composite_academic_score'
        ]
        
        cols_to_scale = [col for col in numerical_cols if col in self.df_processed.columns]
        
        scaler = StandardScaler()
        self.df_processed[cols_to_scale] = scaler.fit_transform(
            self.df_processed[cols_to_scale]
        )
        self.scalers['standard_scaler'] = scaler
        
        print(f"✓ Standardized {len(cols_to_scale)} numerical features")
        print("  Features now have mean=0 and std=1")
        
        return self.df_processed

AdmissionsDataPreprocessor.normalize_features = normalize_features

In [11]:
def create_feature_sets(self):
        """Create different feature sets for modeling"""
        print("\n" + "="*80)
        print("STEP 6: CREATING FEATURE SETS")
        print("="*80)
        
        # Basic features (academic only)
        basic_features = [
            'gpa_normalized', 'english_test_normalized',
            'gre_total', 'composite_academic_score'
        ]
        basic_features = [f for f in basic_features if f in self.df_processed.columns]
        
        # Extended features (academic + experience)
        extended_features = basic_features + [
            'work_experience', 'relevant_work_experience',
            'internship_experience', 'has_publications',
            'major_alignment'
        ]
        extended_features = [f for f in extended_features if f in self.df_processed.columns]
        
        # Full features (all engineered features)
        full_features = [col for col in self.df_processed.columns 
                        if col not in ['id', 'student_id', 'student_name',
                                      'university_name', 'course_name',
                                      'undergrad_university', 'undergrad_canonical',
                                      'admission_result', 'application_status']
                        and not col.endswith('_stripped')]
        
        feature_sets = {
            'basic': basic_features,
            'extended': extended_features,
            'full': full_features
        }
        
        print(f"✓ Basic feature set: {len(basic_features)} features")
        print(f"✓ Extended feature set: {len(extended_features)} features")
        print(f"✓ Full feature set: {len(full_features)} features")
        
        return feature_sets

AdmissionsDataPreprocessor.create_feature_sets = create_feature_sets

In [12]:
def save_processed_data(self, output_prefix='admissions_processed'):
        """Save processed data to files"""
        print("\n" + "="*80)
        print("SAVING PROCESSED DATA")
        print("="*80)
        
        # Save full processed dataset
        csv_path = f"{output_prefix}.csv"
        self.df_processed.to_csv(csv_path, index=False)
        print(f"✓ Saved processed data to: {csv_path}")
        
        # Save as JSON
        json_path = f"{output_prefix}.json"
        self.df_processed.to_json(json_path, orient='records', indent=2)
        print(f"✓ Saved processed data to: {json_path}")
        
        # Save preprocessing metadata
        metadata = {
            'original_shape': self.df.shape,
            'processed_shape': self.df_processed.shape,
            'new_features_count': self.df_processed.shape[1] - self.df.shape[1],
            'encoders': {k: v.classes_.tolist() for k, v in self.encoders.items()},
        }
        
        metadata_path = f"{output_prefix}_metadata.json"
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
        print(f"✓ Saved metadata to: {metadata_path}")
        
        return csv_path, json_path

AdmissionsDataPreprocessor.save_processed_data = save_processed_data

In [13]:
def run_full_pipeline(self):
        """Execute complete preprocessing pipeline"""
        print("\n" + "="*80)
        print("RUNNING COMPLETE DATA PREPROCESSING PIPELINE")
        print("="*80)
        
        self.handle_missing_values()
        self.feature_engineering()
        self.encode_categorical_variables()
        self.handle_outliers()
        self.normalize_features()
        feature_sets = self.create_feature_sets()
        csv_path, json_path = self.save_processed_data()
        
        print("\n" + "="*80)
        print("PREPROCESSING COMPLETE!")
        print("="*80)
        print(f"\nProcessed dataset shape: {self.df_processed.shape}")
        print(f"Files saved: {csv_path}, {json_path}")
        
        return self.df_processed, feature_sets

AdmissionsDataPreprocessor.run_full_pipeline = run_full_pipeline

In [14]:
# Initialize preprocessor
json_file = "data.json"  # Update with your file path

preprocessor = AdmissionsDataPreprocessor(json_file)

# Run full pipeline
processed_df, feature_sets = preprocessor.run_full_pipeline()

print("\n✓ Data is ready for analysis and modeling!")

Loading data...
Data loaded: 250795 rows, 51 columns

RUNNING COMPLETE DATA PREPROCESSING PIPELINE

STEP 1: HANDLING MISSING VALUES
scholarship_amount: Filled missing values with 0
cs_rank: Filled 104071 missing values with 9999 (no rank)
eng_rank: Filled 196857 missing values with 9999 (no rank)
mba_rank: Filled 243609 missing values with 9999 (no rank)
gen_rank: Filled 36590 missing values with 9999 (no rank)

Missing value handling complete!

STEP 2: FEATURE ENGINEERING
✓ Created gpa_category (Low, Medium, High, Very High)
✓ Created english_proficiency (Basic, Good, Very Good, Excellent)
✓ Created total_experience (work + internship months)
✓ Created experience_category (None, Junior, Mid, Senior)
✓ Created has_publications (0 or 1)
✓ Created gre_strength (Low, Medium, High, Very High)
✓ Created is_fall_term (1 for fall, 0 for others)
✓ Created university_tier (Top_50, Top_100, Top_200, Others)
✓ Retained academic_alignment_score
✓ Created composite_academic_score (weighted academic

In [15]:
# Display first 5 rows (default)
processed_df.head()

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,...,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
0,4c241fd2-12c7-45cb-a88f-3e45000f7fc6,1433417,Nidhi Gujar,International,"Texas A&M University, College Station",texas a&m university college station,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,...,True,False,False,False,False,False,False,False,False,False
1,a09a96c2-1a18-4ee6-8559-127e75c1974c,1433417,Nidhi Gujar,International,Brown University,brown university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,...,True,False,False,False,False,False,False,False,False,False
2,5a742e81-1a10-47eb-a0b2-a650b60889fb,1433417,Nidhi Gujar,International,New York University,new york university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,...,True,False,False,False,False,False,False,False,False,False
3,c72a197a-2440-4d19-aad1-b1cb38b6a584,1433417,Nidhi Gujar,International,Columbia University,columbia university,Public Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,...,True,False,False,False,False,False,False,False,False,False
4,13c53445-952f-4f76-bf58-75ba37ae8f01,1433417,Nidhi Gujar,International,Emory University,emory university,Global Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,...,True,False,False,False,False,False,False,False,False,False


In [16]:
# Display 10 random rows
processed_df.sample(10)

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,...,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
247010,a73dd470-8290-4406-9fca-0e6cc519f069,315822,Meet,International,Johns Hopkins University,johns hopkins university,Engineering Management,MS,Masters (Technical),Business_Management_Finance,...,False,False,True,False,False,False,False,False,False,False
195887,7482332b-371b-42f1-8f32-a4f6f3fccfc8,450406,Aparna Siripurapu,International,University of Central Florida,university of central florida,Data Analytics,MS,Masters (Technical),Data_Science_AI_Machine_Learning,...,False,False,False,False,False,False,True,False,False,False
98031,65dab3b2-79c5-4f8c-bb69-39ef8e83c9d3,720278,Harish Gayam,International,Rice University,rice university,Computer Science,Masters,Masters (Technical),Computer_Science_Software,...,False,False,False,False,False,False,True,False,False,False
110304,cffb5ef5-281f-4934-bec2-2a7ed844044b,693817,Sai Krishna,International,University of South Florida,university of south florida,Computer Science,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,True,False,False,False,False,False
102798,44237515-2395-4964-b291-02d102a4bd2f,710563,Akhilraj .T,International,"California State University, Sacramento",california state university sacramento,Computer Science,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,True,False,False,False,False,False
205099,d7977f31-e220-4ac2-a50c-2caca687a862,343167,Zubair Mohammed,International,Michigan Technological University,michigan technological university,Computer Science - Coursework,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,True,False,False,False,False,False
25534,f6644962-9301-4c65-af30-410ee1682a0d,593304,Abi Krishnan,International,The University of Texas at Arlington,the university of texas at arlington,Business Analytics,MS,Masters (Technical),Data_Science_AI_Machine_Learning,...,False,False,False,False,True,False,False,False,False,False
215938,365b11f0-36bc-4a59-90aa-6d7b8773cc34,351310,Shreyansh Nawlakha,International,Colorado State University,colorado state university,Computer Science,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,True,False,False,False,False,False
20301,1db43494-347e-405a-9058-62da2f1ecd53,1124222,Shiva Teja,International,University of New Haven,university of new haven,Computer Science,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,False,False,False,False,False,True
156511,cfd133d7-1317-454b-9fe9-f5f1aec231dd,450090,Prathamesh,International,University of Illinois Urbana-Champaign,university of illinois urbana-champaign,Information Management,MS,Masters (Technical),Computer_Science_Software,...,False,False,False,False,False,False,True,False,False,False


In [17]:
# View only specific columns
processed_df[['gpa_normalized', 'english_test_normalized', 'gre_total', 'composite_academic_score']].head(10)

Unnamed: 0,gpa_normalized,english_test_normalized,gre_total,composite_academic_score
0,-0.406996,1.209935,-0.903447,0.079538
1,-0.406996,1.209935,-0.903447,0.079538
2,-0.406996,1.209935,-0.903447,0.079538
3,-0.406996,1.209935,-0.903447,0.079538
4,-0.406996,1.209935,-0.903447,0.079538
5,-0.406996,1.209935,-0.903447,0.079538
6,-0.406996,1.209935,-0.903447,0.079538
7,-0.406996,1.209935,-0.903447,0.079538
8,0.662708,-0.390233,-0.722221,0.175358
9,0.662708,-0.390233,-0.722221,0.175358


In [18]:
# Show all columns (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
processed_df.head()

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,target_degree,application_status,admission_result,application_term,application_year,gpa,gpa_scale,gpa_normalized,gpa_missing,undergrad_major,ug_major_bucket,undergrad_university,undergrad_canonical,undergrad_canonical_stripped,undergrad_missing,toefl,ielts,english_test_normalized,english_missing,gre_total,gre_verbal,gre_quant,gre_awa,gre_missing,work_experience,relevant_work_experience,internship_experience,publications,has_scholarship,scholarship_amount,scholarship_currency,cs_rank,cs_rank_missing,eng_rank,eng_rank_missing,mba_rank,mba_rank_missing,gen_rank,gen_rank_missing,stripped_name,major_alignment,gpa_category,english_proficiency,total_experience,experience_category,has_publications,gre_strength,is_fall_term,university_tier,academic_alignment_score,composite_academic_score,student_type_encoded,has_scholarship_encoded,application_term_encoded,credential_standardized_grouped,credential_standardized_Bachelors,credential_standardized_Doctoral,credential_standardized_Graduate Certificate,credential_standardized_Masters (Professional),credential_standardized_Masters (Technical),credential_standardized_Other,categorical_course_name_grouped,categorical_course_name_Bio_Biomed_Health_LifeSci,categorical_course_name_Business_Management_Finance,categorical_course_name_Chemical_Materials_Petroleum,categorical_course_name_Civil_Construction_Env_Arch,categorical_course_name_Computer_Science_Software,categorical_course_name_Data_Science_AI_Machine_Learning,categorical_course_name_Electrical_Electronics_ECE,categorical_course_name_Humanities_Social_Design_Arts,categorical_course_name_Mechanical_Industrial_Aero,categorical_course_name_Other,ug_major_bucket_grouped,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
0,4c241fd2-12c7-45cb-a88f-3e45000f7fc6,1433417,Nidhi Gujar,International,"Texas A&M University, College Station",texas a&m university college station,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,51.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
1,a09a96c2-1a18-4ee6-8559-127e75c1974c,1433417,Nidhi Gujar,International,Brown University,brown university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,16000.0,USD,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
2,5a742e81-1a10-47eb-a0b2-a650b60889fb,1433417,Nidhi Gujar,International,New York University,new york university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,18000.0,USD,9999.0,1,9999.0,1,9999.0,1,30.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
3,c72a197a-2440-4d19-aad1-b1cb38b6a584,1433417,Nidhi Gujar,International,Columbia University,columbia university,Public Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
4,13c53445-952f-4f76-bf58-75ba37ae8f01,1433417,Nidhi Gujar,International,Emory University,emory university,Global Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,24.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False


In [19]:
# For large datasets - create scrollable display
from IPython.display import display
display(processed_df.head(20))

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,target_degree,application_status,admission_result,application_term,application_year,gpa,gpa_scale,gpa_normalized,gpa_missing,undergrad_major,ug_major_bucket,undergrad_university,undergrad_canonical,undergrad_canonical_stripped,undergrad_missing,toefl,ielts,english_test_normalized,english_missing,gre_total,gre_verbal,gre_quant,gre_awa,gre_missing,work_experience,relevant_work_experience,internship_experience,publications,has_scholarship,scholarship_amount,scholarship_currency,cs_rank,cs_rank_missing,eng_rank,eng_rank_missing,mba_rank,mba_rank_missing,gen_rank,gen_rank_missing,stripped_name,major_alignment,gpa_category,english_proficiency,total_experience,experience_category,has_publications,gre_strength,is_fall_term,university_tier,academic_alignment_score,composite_academic_score,student_type_encoded,has_scholarship_encoded,application_term_encoded,credential_standardized_grouped,credential_standardized_Bachelors,credential_standardized_Doctoral,credential_standardized_Graduate Certificate,credential_standardized_Masters (Professional),credential_standardized_Masters (Technical),credential_standardized_Other,categorical_course_name_grouped,categorical_course_name_Bio_Biomed_Health_LifeSci,categorical_course_name_Business_Management_Finance,categorical_course_name_Chemical_Materials_Petroleum,categorical_course_name_Civil_Construction_Env_Arch,categorical_course_name_Computer_Science_Software,categorical_course_name_Data_Science_AI_Machine_Learning,categorical_course_name_Electrical_Electronics_ECE,categorical_course_name_Humanities_Social_Design_Arts,categorical_course_name_Mechanical_Industrial_Aero,categorical_course_name_Other,ug_major_bucket_grouped,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
0,4c241fd2-12c7-45cb-a88f-3e45000f7fc6,1433417,Nidhi Gujar,International,"Texas A&M University, College Station",texas a&m university college station,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,51.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
1,a09a96c2-1a18-4ee6-8559-127e75c1974c,1433417,Nidhi Gujar,International,Brown University,brown university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,16000.0,USD,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
2,5a742e81-1a10-47eb-a0b2-a650b60889fb,1433417,Nidhi Gujar,International,New York University,new york university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,18000.0,USD,9999.0,1,9999.0,1,9999.0,1,30.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
3,c72a197a-2440-4d19-aad1-b1cb38b6a584,1433417,Nidhi Gujar,International,Columbia University,columbia university,Public Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
4,13c53445-952f-4f76-bf58-75ba37ae8f01,1433417,Nidhi Gujar,International,Emory University,emory university,Global Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,24.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
5,42f8c3db-5419-42b7-9e9b-556a3bc628e9,1433417,Nidhi Gujar,International,Drexel University,drexel university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,86.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
6,514e1d3f-f4a1-4f76-943a-61ddff1cc19d,1433417,Nidhi Gujar,International,The George Washington University,the george washington university,Public Health - Epidemiology,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,USD,9999.0,1,9999.0,1,9999.0,1,63.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
7,e5897164-137d-4547-8246-5a3b2a527fc0,1433417,Nidhi Gujar,International,"University of Michigan, Ann Arbor",university of michigan ann arbor,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,USD,9999.0,1,9999.0,1,9999.0,1,21.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
8,51dcdd4c-2d8f-47fe-a79c-0ce4a2322d1a,1439746,Manu Siddharth,International,University of Washington,university of washington,Information Management,MS,Masters (Technical),Computer_Science_Software,masters,7,1,fall,2026,8.53,,0.662708,0,Electronics and Communication Engineering,Electrical_Electronics_ECE,Vivekanand Education Society's Institute Of Te...,Vivekanand Education Society's Institute of Te...,vivekanand education society's institute of te...,0,95.0,7.0,-0.390233,0,-0.722221,-0.877277,-0.329267,0.066855,False,0.948815,0.948815,0.0,0.0,False,0.0,USD,7.0,0,9999.0,1,9999.0,1,46.0,0,vivekanand education society s institute of te...,0,Very High,Good,0.699657,Senior,1,Medium,1,Top_50,0,0.175358,1,0,0,Masters (Technical),False,False,False,False,True,False,Computer_Science_Software,False,False,False,False,True,False,False,False,False,False,Electrical_Electronics_ECE,False,False,False,False,False,False,True,False,False,False
9,504c7ddc-be08-47cf-8004-1426b1da7c39,1439746,Manu Siddharth,International,University of Washington,university of washington,Data Science,MS,Masters (Technical),Data_Science_AI_Machine_Learning,masters,6,0,fall,2026,8.53,,0.662708,0,Electronics and Communication Engineering,Electrical_Electronics_ECE,Vivekanand Education Society's Institute Of Te...,Vivekanand Education Society's Institute of Te...,vivekanand education society's institute of te...,0,95.0,7.0,-0.390233,0,-0.722221,-0.877277,-0.329267,0.066855,False,0.948815,0.948815,0.0,0.0,False,0.0,,7.0,0,9999.0,1,9999.0,1,46.0,0,vivekanand education society s institute of te...,0,Very High,Good,0.699657,Senior,1,Medium,1,Top_50,0,0.175358,1,0,0,Masters (Technical),False,False,False,False,True,False,Data_Science_AI_Machine_Learning,False,False,False,False,False,True,False,False,False,False,Electrical_Electronics_ECE,False,False,False,False,False,False,True,False,False,False


In [20]:
# Shape, data types, and sample values
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250795 entries, 0 to 250794
Data columns (total 93 columns):
 #   Column                                                    Non-Null Count   Dtype   
---  ------                                                    --------------   -----   
 0   id                                                        250795 non-null  object  
 1   student_id                                                250795 non-null  int64   
 2   student_name                                              250795 non-null  object  
 3   student_type                                              250795 non-null  object  
 4   university_name                                           250795 non-null  object  
 5   university_name_stripped                                  250795 non-null  object  
 6   course_name                                               250795 non-null  object  
 7   credential                                                250795 non-null  object  

In [21]:
# Cell 12: Quick Data Inspection
print("="*80)
print("PROCESSED DATA PREVIEW")
print("="*80)
print(f"\nDataset Shape: {processed_df.shape[0]} rows × {processed_df.shape[1]} columns")
print(f"\nColumn Names ({len(processed_df.columns)} total):")
print(processed_df.columns.tolist())

print("\n" + "="*80)
print("FIRST 10 ROWS:")
print("="*80)
display(processed_df.head(10))

print("\n" + "="*80)
print("BASIC STATISTICS:")
print("="*80)
display(processed_df.describe())

PROCESSED DATA PREVIEW

Dataset Shape: 250795 rows × 93 columns

Column Names (93 total):
['id', 'student_id', 'student_name', 'student_type', 'university_name', 'university_name_stripped', 'course_name', 'credential', 'credential_standardized', 'categorical_course_name', 'target_degree', 'application_status', 'admission_result', 'application_term', 'application_year', 'gpa', 'gpa_scale', 'gpa_normalized', 'gpa_missing', 'undergrad_major', 'ug_major_bucket', 'undergrad_university', 'undergrad_canonical', 'undergrad_canonical_stripped', 'undergrad_missing', 'toefl', 'ielts', 'english_test_normalized', 'english_missing', 'gre_total', 'gre_verbal', 'gre_quant', 'gre_awa', 'gre_missing', 'work_experience', 'relevant_work_experience', 'internship_experience', 'publications', 'has_scholarship', 'scholarship_amount', 'scholarship_currency', 'cs_rank', 'cs_rank_missing', 'eng_rank', 'eng_rank_missing', 'mba_rank', 'mba_rank_missing', 'gen_rank', 'gen_rank_missing', 'stripped_name', 'major_alig

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,target_degree,application_status,admission_result,application_term,application_year,gpa,gpa_scale,gpa_normalized,gpa_missing,undergrad_major,ug_major_bucket,undergrad_university,undergrad_canonical,undergrad_canonical_stripped,undergrad_missing,toefl,ielts,english_test_normalized,english_missing,gre_total,gre_verbal,gre_quant,gre_awa,gre_missing,work_experience,relevant_work_experience,internship_experience,publications,has_scholarship,scholarship_amount,scholarship_currency,cs_rank,cs_rank_missing,eng_rank,eng_rank_missing,mba_rank,mba_rank_missing,gen_rank,gen_rank_missing,stripped_name,major_alignment,gpa_category,english_proficiency,total_experience,experience_category,has_publications,gre_strength,is_fall_term,university_tier,academic_alignment_score,composite_academic_score,student_type_encoded,has_scholarship_encoded,application_term_encoded,credential_standardized_grouped,credential_standardized_Bachelors,credential_standardized_Doctoral,credential_standardized_Graduate Certificate,credential_standardized_Masters (Professional),credential_standardized_Masters (Technical),credential_standardized_Other,categorical_course_name_grouped,categorical_course_name_Bio_Biomed_Health_LifeSci,categorical_course_name_Business_Management_Finance,categorical_course_name_Chemical_Materials_Petroleum,categorical_course_name_Civil_Construction_Env_Arch,categorical_course_name_Computer_Science_Software,categorical_course_name_Data_Science_AI_Machine_Learning,categorical_course_name_Electrical_Electronics_ECE,categorical_course_name_Humanities_Social_Design_Arts,categorical_course_name_Mechanical_Industrial_Aero,categorical_course_name_Other,ug_major_bucket_grouped,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
0,4c241fd2-12c7-45cb-a88f-3e45000f7fc6,1433417,Nidhi Gujar,International,"Texas A&M University, College Station",texas a&m university college station,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,51.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
1,a09a96c2-1a18-4ee6-8559-127e75c1974c,1433417,Nidhi Gujar,International,Brown University,brown university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,16000.0,USD,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
2,5a742e81-1a10-47eb-a0b2-a650b60889fb,1433417,Nidhi Gujar,International,New York University,new york university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,18000.0,USD,9999.0,1,9999.0,1,9999.0,1,30.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
3,c72a197a-2440-4d19-aad1-b1cb38b6a584,1433417,Nidhi Gujar,International,Columbia University,columbia university,Public Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
4,13c53445-952f-4f76-bf58-75ba37ae8f01,1433417,Nidhi Gujar,International,Emory University,emory university,Global Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,24.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
5,42f8c3db-5419-42b7-9e9b-556a3bc628e9,1433417,Nidhi Gujar,International,Drexel University,drexel university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,86.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
6,514e1d3f-f4a1-4f76-943a-61ddff1cc19d,1433417,Nidhi Gujar,International,The George Washington University,the george washington university,Public Health - Epidemiology,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,USD,9999.0,1,9999.0,1,9999.0,1,63.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
7,e5897164-137d-4547-8246-5a3b2a527fc0,1433417,Nidhi Gujar,International,"University of Michigan, Ann Arbor",university of michigan ann arbor,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,USD,9999.0,1,9999.0,1,9999.0,1,21.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
8,51dcdd4c-2d8f-47fe-a79c-0ce4a2322d1a,1439746,Manu Siddharth,International,University of Washington,university of washington,Information Management,MS,Masters (Technical),Computer_Science_Software,masters,7,1,fall,2026,8.53,,0.662708,0,Electronics and Communication Engineering,Electrical_Electronics_ECE,Vivekanand Education Society's Institute Of Te...,Vivekanand Education Society's Institute of Te...,vivekanand education society's institute of te...,0,95.0,7.0,-0.390233,0,-0.722221,-0.877277,-0.329267,0.066855,False,0.948815,0.948815,0.0,0.0,False,0.0,USD,7.0,0,9999.0,1,9999.0,1,46.0,0,vivekanand education society s institute of te...,0,Very High,Good,0.699657,Senior,1,Medium,1,Top_50,0,0.175358,1,0,0,Masters (Technical),False,False,False,False,True,False,Computer_Science_Software,False,False,False,False,True,False,False,False,False,False,Electrical_Electronics_ECE,False,False,False,False,False,False,True,False,False,False
9,504c7ddc-be08-47cf-8004-1426b1da7c39,1439746,Manu Siddharth,International,University of Washington,university of washington,Data Science,MS,Masters (Technical),Data_Science_AI_Machine_Learning,masters,6,0,fall,2026,8.53,,0.662708,0,Electronics and Communication Engineering,Electrical_Electronics_ECE,Vivekanand Education Society's Institute Of Te...,Vivekanand Education Society's Institute of Te...,vivekanand education society's institute of te...,0,95.0,7.0,-0.390233,0,-0.722221,-0.877277,-0.329267,0.066855,False,0.948815,0.948815,0.0,0.0,False,0.0,,7.0,0,9999.0,1,9999.0,1,46.0,0,vivekanand education society s institute of te...,0,Very High,Good,0.699657,Senior,1,Medium,1,Top_50,0,0.175358,1,0,0,Masters (Technical),False,False,False,False,True,False,Data_Science_AI_Machine_Learning,False,False,False,False,False,True,False,False,False,False,Electrical_Electronics_ECE,False,False,False,False,False,False,True,False,False,False



BASIC STATISTICS:


Unnamed: 0,student_id,application_status,admission_result,application_year,gpa_scale,gpa_normalized,gpa_missing,undergrad_missing,toefl,ielts,english_test_normalized,english_missing,gre_total,gre_verbal,gre_quant,gre_awa,work_experience,relevant_work_experience,internship_experience,publications,scholarship_amount,cs_rank,cs_rank_missing,eng_rank,eng_rank_missing,mba_rank,mba_rank_missing,gen_rank,gen_rank_missing,major_alignment,total_experience,has_publications,is_fall_term,academic_alignment_score,composite_academic_score,student_type_encoded,has_scholarship_encoded,application_term_encoded
count,250795.0,250795.0,250795.0,250795.0,49162.0,250795.0,250795.0,250795.0,129693.0,102219.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0,250795.0
mean,675385.0,6.657665,0.657665,2021.137567,100.0,4.931968e-16,0.039658,0.051927,99.83656,7.172223,-3.662711e-16,0.0,1.853114e-15,1.972787e-15,-1.624648e-15,5.240215e-16,-8.703472000000001e-17,-8.703472000000001e-17,0.0,0.0,74.947327,4187.440627,0.414964,7859.685656,0.784932,9713.872888,0.971347,1537.789386,0.145896,0.700389,-2.901157e-17,0.17665,0.83768,0.700389,-1.167716e-15,0.988995,0.006647,0.345553
std,360811.3,0.474492,0.474492,2.550151,0.0,1.000002,0.195155,0.22188,17.844224,2.797946,1.000002,0.0,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,0.0,0.0,1185.449512,4894.615043,0.492717,4087.035868,0.41087,1660.14008,0.166829,3497.618577,0.353003,0.458089,1.000002,0.381373,0.368745,0.458089,1.000002,0.104326,0.081257,0.805395
min,482.0,6.0,0.0,2011.0,100.0,-4.83121,0.0,0.0,0.0,0.0,-3.270535,0.0,-6.068375,-23.86293,-21.68516,-5.149077,-0.7199318,-0.7199318,0.0,0.0,-3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,-0.6504484,0.0,0.0,0.0,-10.59527,0.0,0.0,0.0
25%,379943.0,6.0,0.0,2019.0,100.0,-0.6562477,0.0,0.0,96.0,6.5,-0.3902331,0.0,-0.9034465,-0.401712,-0.8733661,-0.6782781,-0.7199318,-0.7199318,0.0,0.0,0.0,51.0,0.0,9999.0,1.0,9999.0,1.0,51.0,0.0,0.0,-0.6504484,0.0,1.0,0.0,-0.5902743,1.0,0.0,0.0
50%,564849.0,7.0,1.0,2021.0,100.0,0.1122778,0.0,0.0,103.0,7.0,0.1431562,0.0,0.002681223,-0.08466859,0.2148324,0.06685505,-0.632103,-0.632103,0.0,0.0,0.0,120.0,0.0,9999.0,1.0,9999.0,1.0,84.0,0.0,1.0,-0.508332,0.0,1.0,1.0,0.03673359,1.0,0.0,0.0
75%,929094.0,7.0,1.0,2023.0,100.0,0.7354065,0.0,0.0,109.0,7.5,0.5698676,0.0,0.8181962,0.5494183,0.7589316,0.8119882,0.5096715,0.5096715,0.0,0.0,0.0,9999.0,1.0,9999.0,1.0,9999.0,1.0,152.0,0.0,1.0,0.4509537,0.0,1.0,1.0,0.6848859,1.0,0.0,0.0
max,1596061.0,7.0,1.0,2027.0,100.0,2.189374,1.0,1.0,120.0,119.0,2.063358,0.0,2.721064,3.085766,1.439056,3.792521,4.198481,4.198481,0.0,0.0,110000.0,9999.0,1.0,9999.0,1.0,9999.0,1.0,9999.0,1.0,1.0,105.6526,1.0,1.0,1.0,2.80252,1.0,1.0,4.0


In [22]:
# See how many NaN values in each column
nan_counts = processed_df.isnull().sum()
print("NaN counts per column:")
print(nan_counts[nan_counts > 0])  # Only show columns with NaN

NaN counts per column:
target_degree              104
gpa                       9581
gpa_scale               201633
undergrad_university     11053
toefl                   121102
ielts                   148576
scholarship_currency    220574
gpa_category               513
experience_category          4
gre_strength                14
dtype: int64


In [23]:
# Complete missing value summary
missing_summary = pd.DataFrame({
    'Column': processed_df.columns,
    'NaN_Count': processed_df.isnull().sum().values,
    'NaN_Percentage': (processed_df.isnull().sum().values / len(processed_df) * 100).round(2)
})
missing_summary = missing_summary[missing_summary['NaN_Count'] > 0].sort_values('NaN_Count', ascending=False)
print(missing_summary)

                  Column  NaN_Count  NaN_Percentage
40  scholarship_currency     220574           87.95
16             gpa_scale     201633           80.40
26                 ielts     148576           59.24
25                 toefl     121102           48.29
21  undergrad_university      11053            4.41
15                   gpa       9581            3.82
51          gpa_category        513            0.20
10         target_degree        104            0.04
56          gre_strength         14            0.01
54   experience_category          4            0.00


In [25]:
# Cell: Handle Remaining NaN Values After Preprocessing

print("="*80)
print("FIXING REMAINING NaN VALUES")
print("="*80)

# 1. HIGH NaN COLUMNS (mostly missing by design)

# scholarship_currency: Fill with 'None'
if 'scholarship_currency' in processed_df.columns:
    processed_df['scholarship_currency'].fillna('None', inplace=True)
    print("✓ Fixed scholarship_currency: Filled with 'None'")

# gpa_scale: Fill with most common scale
if 'gpa_scale' in processed_df.columns:
    most_common_scale = processed_df['gpa_scale'].mode()[0] if not processed_df['gpa_scale'].mode().empty else '10'
    processed_df['gpa_scale'].fillna(most_common_scale, inplace=True)
    print(f"✓ Fixed gpa_scale: Filled with most common scale '{most_common_scale}'")

# English tests (IELTS/TOEFL)
if 'ielts' in processed_df.columns:
    processed_df['ielts'].fillna(0, inplace=True)
    print("✓ Fixed ielts: Filled with 0 (student took TOEFL instead)")

if 'toefl' in processed_df.columns:
    processed_df['toefl'].fillna(0, inplace=True)
    print("✓ Fixed toefl: Filled with 0 (student took IELTS instead)")

# 2. MEDIUM NaN COLUMNS

# undergrad_university: Fill with 'Unknown'
if 'undergrad_university' in processed_df.columns:
    processed_df['undergrad_university'].fillna('Unknown', inplace=True)
    print("✓ Fixed undergrad_university: Filled with 'Unknown'")

# gpa: Convert to numeric first, then fill with median
if 'gpa' in processed_df.columns:
    # Convert to numeric (handles string values)
    processed_df['gpa'] = pd.to_numeric(processed_df['gpa'], errors='coerce')
    median_gpa = processed_df['gpa'].median()
    processed_df['gpa'].fillna(median_gpa, inplace=True)
    print(f"✓ Fixed gpa: Converted to numeric and filled with median ({median_gpa:.2f})")

# 3. LOW NaN COLUMNS (categorical from pd.cut)

# gpa_category: Fill with 'Unknown'
if 'gpa_category' in processed_df.columns:
    processed_df['gpa_category'] = processed_df['gpa_category'].astype(str)
    processed_df['gpa_category'].replace('nan', 'Unknown', inplace=True)
    processed_df['gpa_category'].fillna('Unknown', inplace=True)
    print("✓ Fixed gpa_category: Filled with 'Unknown'")

# gre_strength: Fill with 'Unknown'
if 'gre_strength' in processed_df.columns:
    processed_df['gre_strength'] = processed_df['gre_strength'].astype(str)
    processed_df['gre_strength'].replace('nan', 'Unknown', inplace=True)
    processed_df['gre_strength'].fillna('Unknown', inplace=True)
    print("✓ Fixed gre_strength: Filled with 'Unknown'")

# experience_category: Fill with 'Unknown'
if 'experience_category' in processed_df.columns:
    processed_df['experience_category'] = processed_df['experience_category'].astype(str)
    processed_df['experience_category'].replace('nan', 'Unknown', inplace=True)
    processed_df['experience_category'].fillna('Unknown', inplace=True)
    print("✓ Fixed experience_category: Filled with 'Unknown'")

# english_proficiency: Fill with 'Unknown'
if 'english_proficiency' in processed_df.columns:
    processed_df['english_proficiency'] = processed_df['english_proficiency'].astype(str)
    processed_df['english_proficiency'].replace('nan', 'Unknown', inplace=True)
    processed_df['english_proficiency'].fillna('Unknown', inplace=True)
    print("✓ Fixed english_proficiency: Filled with 'Unknown'")

# university_tier: Fill with 'Unknown'
if 'university_tier' in processed_df.columns:
    processed_df['university_tier'] = processed_df['university_tier'].astype(str)
    processed_df['university_tier'].replace('nan', 'Unknown', inplace=True)
    processed_df['university_tier'].fillna('Unknown', inplace=True)
    print("✓ Fixed university_tier: Filled with 'Unknown'")

# target_degree: Fill with mode
if 'target_degree' in processed_df.columns:
    most_common_degree = processed_df['target_degree'].mode()[0] if not processed_df['target_degree'].mode().empty else 'Unknown'
    processed_df['target_degree'].fillna(most_common_degree, inplace=True)
    print(f"✓ Fixed target_degree: Filled with mode '{most_common_degree}'")

print("\n" + "="*80)
print("VERIFICATION: Checking for remaining NaN values...")
print("="*80)

remaining_nan = processed_df.isnull().sum()
columns_with_nan = remaining_nan[remaining_nan > 0]

if len(columns_with_nan) > 0:
    print(f"\n⚠️  Still have NaN in {len(columns_with_nan)} columns:")
    for col, count in columns_with_nan.items():
        percentage = (count / len(processed_df)) * 100
        print(f"  • {col}: {count} NaN ({percentage:.2f}%)")
else:
    print("\n✅ SUCCESS! No NaN values remaining - Data is completely clean!")

total_nan = processed_df.isnull().sum().sum()
print(f"\n📊 Total NaN values now: {total_nan}")

FIXING REMAINING NaN VALUES
✓ Fixed scholarship_currency: Filled with 'None'
✓ Fixed gpa_scale: Filled with most common scale '100.0'
✓ Fixed ielts: Filled with 0 (student took TOEFL instead)
✓ Fixed toefl: Filled with 0 (student took IELTS instead)
✓ Fixed undergrad_university: Filled with 'Unknown'
✓ Fixed gpa: Converted to numeric and filled with median (8.41)
✓ Fixed gpa_category: Filled with 'Unknown'
✓ Fixed gre_strength: Filled with 'Unknown'
✓ Fixed experience_category: Filled with 'Unknown'
✓ Fixed english_proficiency: Filled with 'Unknown'
✓ Fixed university_tier: Filled with 'Unknown'
✓ Fixed target_degree: Filled with mode 'masters'

VERIFICATION: Checking for remaining NaN values...

✅ SUCCESS! No NaN values remaining - Data is completely clean!

📊 Total NaN values now: 0


In [26]:
# See how many NaN values in each column
nan_counts = processed_df.isnull().sum()
print("NaN counts per column:")
print(nan_counts[nan_counts > 0])  # Only show columns with NaN

NaN counts per column:
Series([], dtype: int64)


In [27]:
processed_df.head()

Unnamed: 0,id,student_id,student_name,student_type,university_name,university_name_stripped,course_name,credential,credential_standardized,categorical_course_name,target_degree,application_status,admission_result,application_term,application_year,gpa,gpa_scale,gpa_normalized,gpa_missing,undergrad_major,ug_major_bucket,undergrad_university,undergrad_canonical,undergrad_canonical_stripped,undergrad_missing,toefl,ielts,english_test_normalized,english_missing,gre_total,gre_verbal,gre_quant,gre_awa,gre_missing,work_experience,relevant_work_experience,internship_experience,publications,has_scholarship,scholarship_amount,scholarship_currency,cs_rank,cs_rank_missing,eng_rank,eng_rank_missing,mba_rank,mba_rank_missing,gen_rank,gen_rank_missing,stripped_name,major_alignment,gpa_category,english_proficiency,total_experience,experience_category,has_publications,gre_strength,is_fall_term,university_tier,academic_alignment_score,composite_academic_score,student_type_encoded,has_scholarship_encoded,application_term_encoded,credential_standardized_grouped,credential_standardized_Bachelors,credential_standardized_Doctoral,credential_standardized_Graduate Certificate,credential_standardized_Masters (Professional),credential_standardized_Masters (Technical),credential_standardized_Other,categorical_course_name_grouped,categorical_course_name_Bio_Biomed_Health_LifeSci,categorical_course_name_Business_Management_Finance,categorical_course_name_Chemical_Materials_Petroleum,categorical_course_name_Civil_Construction_Env_Arch,categorical_course_name_Computer_Science_Software,categorical_course_name_Data_Science_AI_Machine_Learning,categorical_course_name_Electrical_Electronics_ECE,categorical_course_name_Humanities_Social_Design_Arts,categorical_course_name_Mechanical_Industrial_Aero,categorical_course_name_Other,ug_major_bucket_grouped,ug_major_bucket_Bio_Biomed_Health_LifeSci,ug_major_bucket_Business_Management_Finance,ug_major_bucket_Chemical_Materials_Petroleum,ug_major_bucket_Civil_Construction_Env_Arch,ug_major_bucket_Computer_Science_Software,ug_major_bucket_Data_Science_AI_Machine_Learning,ug_major_bucket_Electrical_Electronics_ECE,ug_major_bucket_Humanities_Social_Design_Arts,ug_major_bucket_Mechanical_Industrial_Aero,ug_major_bucket_Other
0,4c241fd2-12c7-45cb-a88f-3e45000f7fc6,1433417,Nidhi Gujar,International,"Texas A&M University, College Station",texas a&m university college station,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,51.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_100,1,0.079538,1,0,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
1,a09a96c2-1a18-4ee6-8559-127e75c1974c,1433417,Nidhi Gujar,International,Brown University,brown university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,16000.0,USD,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
2,5a742e81-1a10-47eb-a0b2-a650b60889fb,1433417,Nidhi Gujar,International,New York University,new york university,Public Health,Masters,Masters (Technical),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,True,18000.0,USD,9999.0,1,9999.0,1,9999.0,1,30.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,1,0,Masters (Technical),False,False,False,False,True,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
3,c72a197a-2440-4d19-aad1-b1cb38b6a584,1433417,Nidhi Gujar,International,Columbia University,columbia university,Public Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,6,0,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,13.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False
4,13c53445-952f-4f76-bf58-75ba37ae8f01,1433417,Nidhi Gujar,International,Emory University,emory university,Global Health,MPH,Masters (Professional),Bio_Biomed_Health_LifeSci,masters,7,1,fall,2025,75.0,100.0,-0.406996,0,Dental Science,Bio_Biomed_Health_LifeSci,Yashwantrao Chavan Dental College,"Yashwantrao Chavan Dental College, Ahmednagar",yashwantrao chavan dental college ahmednagar,0,112.0,6.5,1.209935,0,-0.903447,-0.084669,-1.281441,0.066855,True,0.246185,0.246185,0.0,0.0,False,0.0,,9999.0,1,9999.0,1,9999.0,1,24.0,0,yashwantrao chavan dental college,1,Medium,Excellent,0.912832,Senior,1,Low,1,Top_50,1,0.079538,1,0,0,Masters (Professional),False,False,False,True,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False,Bio_Biomed_Health_LifeSci,True,False,False,False,False,False,False,False,False,False


In [30]:
# Cell: Test 2 - Feature Engineering Validation

print("="*80)
print("TEST 2: FEATURE ENGINEERING VALIDATION")
print("="*80)

# 1. Check if new features were created
original_columns = [
    'gpa_normalized', 'english_test_normalized', 'work_experience', 
    'internship_experience', 'publications'
]

new_features = [
    'gpa_category', 'english_proficiency', 'total_experience',
    'experience_category', 'has_publications', 'composite_academic_score'
]

print("\n1. Checking if engineered features exist:")
for feature in new_features:
    if feature in processed_df.columns:
        print(f"   ✅ {feature} - EXISTS")
    else:
        print(f"   ❌ {feature} - MISSING")

# 2. Validate composite_academic_score
if 'composite_academic_score' in processed_df.columns:
    print("\n2. Composite Academic Score Statistics:")
    print(f"   Min: {processed_df['composite_academic_score'].min():.4f}")
    print(f"   Max: {processed_df['composite_academic_score'].max():.4f}")
    print(f"   Mean: {processed_df['composite_academic_score'].mean():.4f}")
    print(f"   Std: {processed_df['composite_academic_score'].std():.4f}")
    
    # After normalization, mean should be ~0 and std ~1
    if abs(processed_df['composite_academic_score'].mean()) < 0.1 and abs(processed_df['composite_academic_score'].std() - 1) < 0.1:
        print("   ✅ PASS: Score is properly normalized")
    else:
        print("   ⚠️  WARNING: Score may not be properly normalized")

# 3. Check categorical distributions
print("\n3. Categorical Feature Distributions:")
categorical_features = ['gpa_category', 'english_proficiency', 'experience_category']

for feature in categorical_features:
    if feature in processed_df.columns:
        print(f"\n   {feature}:")
        print(processed_df[feature].value_counts())

TEST 2: FEATURE ENGINEERING VALIDATION

1. Checking if engineered features exist:
   ✅ gpa_category - EXISTS
   ✅ english_proficiency - EXISTS
   ✅ total_experience - EXISTS
   ✅ experience_category - EXISTS
   ✅ has_publications - EXISTS
   ✅ composite_academic_score - EXISTS

2. Composite Academic Score Statistics:
   Min: -10.5953
   Max: 2.8025
   Mean: -0.0000
   Std: 1.0000
   ✅ PASS: Score is properly normalized

3. Categorical Feature Distributions:

   gpa_category:
gpa_category
High         99449
Medium       75149
Very High    68103
Low           7581
Unknown        513
Name: count, dtype: int64

   english_proficiency:
english_proficiency
Very Good    104918
Good          68535
Excellent     39983
Basic         37359
Name: count, dtype: int64

   experience_category:
experience_category
None       116452
Mid         59326
Senior      50419
Junior      24594
Unknown         4
Name: count, dtype: int64
