In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from typing import Union

In [2]:
# Execute the Classification notebook from this session before the cells below.
# NOTE(review): `classify`, called later in this notebook, is not defined in the
# cells shown here and presumably comes from this run — confirm.
%run Classification.ipynb

['sl_no', 'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
['Previous qualification (grade)', 'Admission grade', 'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)']
['StudentID', 'StudyTimeWeekly', 'GPA']


In [3]:
# CSV locations of the source datasets, keyed by a "<domain>_<n>_df" label.
datasets = dict(
    # education
    education_1_df='Datasets/campus-placement-prediction.csv',
    education_2_df='Datasets/predict-dropout-or-academic-success.csv',
    education_3_df='Datasets/student-performance-dataset.csv',
    # finance
    finance_1_df='Datasets/bank-customer-churn-prediction.csv',
    finance_2_df='Datasets/financial-risk-for-loan-approval.csv',
    finance_3_df='Datasets/loan-approval-classification-dataset.csv',
    # health
    health_1_df='Datasets/fetal-health-classifiation.csv',
    health_2_df='Datasets/heart-disease-health-indicators-dataset.csv',
    health_3_df='Datasets/patient-treatment-classification.csv',
)

# "_anomaly" variants of the CSVs; only the three education datasets are listed.
anomalous_datasets = dict(
    a_education_1_df='Datasets/campus-placement-prediction_anomaly.csv',
    a_education_2_df='Datasets/predict-dropout-or-academic-success_anomaly.csv',
    a_education_3_df='Datasets/student-performance-dataset_anomaly.csv',
)

In [4]:
def cell_generation(dataframe: pd.DataFrame, column: str, filename: str) -> pd.DataFrame:
    """Fill the missing entries of ``column`` with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows where ``column`` is
    present, using every other column of ``dataframe`` as a feature, and its
    predictions are written into the rows where ``column`` is missing.

    Args:
        dataframe: Frame to impute. It is modified in place.
        column: Name of the column whose missing entries are filled.
        filename: CSV path the updated frame is written to (index omitted).
            Nothing is written when ``column`` has no missing values.

    Returns:
        The same ``dataframe`` object that was passed in.

    Raises:
        ValueError: If ``column`` has missing values but no observed ones, so
            there is nothing to fit the model on.
    """
    target = dataframe[column]
    # Positional boolean masks: unlike index labels, they stay unambiguous
    # when the frame's index contains duplicate labels.
    missing = target.isna().to_numpy()

    if not missing.any():
        return dataframe

    observed = ~missing
    if not observed.any():
        raise ValueError(
            f"Cannot impute column {column!r}: it has no observed values to train on."
        )

    features = dataframe.drop(columns=[column])

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(features[observed], target[observed])

    # Assign by position so that only the rows that were actually missing are
    # touched; a label-based lookup would also hit observed rows that happen to
    # share an index label with a missing one.
    dataframe.loc[missing, column] = model.predict(features[missing])

    dataframe.to_csv(filename, index=False)

    return dataframe

In [5]:
def generate_cell_values() -> None:
    """Impute and classify every CSV listed in ``anomalous_datasets``.

    Each file is loaded, every column with more than 50 distinct values is
    passed through ``cell_generation`` (which fills missing entries and
    rewrites the CSV when it filled anything), and the resulting frame is
    handed to ``classify`` together with its path.

    NOTE(review): ``classify`` is not defined in the cells shown here; it is
    presumably provided by the Classification notebook — confirm.
    """
    for csv_path in anomalous_datasets.values():
        frame = pd.read_csv(csv_path)

        # The candidate columns are fixed from the freshly loaded frame,
        # before any imputation happens.
        high_cardinality = []
        for name in frame.columns:
            if frame[name].nunique() > 50:
                high_cardinality.append(name)

        for name in high_cardinality:
            frame = cell_generation(frame, name, csv_path)

        classify(frame, csv_path)