In [16]:
import pandas as pd

# Outer join of the source workbooks on TumoID, followed by numeric encoding
# of the categorical socioeconomic columns.


def encode_categories(column, mapping):
    """Translate the labels of `column` to numbers using `mapping`.

    Values that are not keys of `mapping` (including NaN and values that are
    already numeric) are passed through unchanged, then the whole column is
    converted to a numeric dtype explicitly instead of relying on the implicit
    downcasting done by `Series.replace`.

    Raises:
        ValueError: if a label that is neither in `mapping` nor numeric
            remains, so unexpected spellings surface here instead of leaking
            into later computations as strings.
    """
    translated = column.map(lambda value: mapping.get(value, value))
    return pd.to_numeric(translated)


# List of file locations; the first workbook is the base table of the join
files = ['Demographic data.xlsx', 'Priority Districts.xlsx', 'Closed Students.xlsx']

# Read the first file
df = pd.read_excel(files[0])

# Loop over the rest of the files and merge
for file in files[1:]:
    df_other = pd.read_excel(file)
    df = pd.merge(df, df_other, on='TumoID', how='outer')

# Encode the 'FamilyIncome' brackets as ordinal scores from 3 (lowest bracket)
# down to 0 (highest bracket). Missing incomes stay NaN at this point.
income_scores = {
    'inférieur à 800': 3,
    'de 801 à 1300': 2,
    'de 1301 à 2500': 1,
    'supérieur à 2500': 0,
}
df['FamilyIncome'] = encode_categories(df['FamilyIncome'], income_scores)

# Fill in missing priority flags. The outer join leaves NaN for students that
# are absent from one of the workbooks.
# NOTE(review): treating a missing flag as 'NON' is an assumption; it was
# already applied to 'PrioritySchool' and is applied to 'PriorityDistrict' too
# so both flags are handled the same way and no NaN reaches the summed score.
for flag_column in ('PrioritySchool', 'PriorityDistrict'):
    df[flag_column] = df[flag_column].fillna('NON')

# Report the columns that still contain missing values. The bare expression
# was never displayed because it was not the last statement of the cell.
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

# Encode the 'OUI'/'NON' flags as 1/0
yes_no_scores = {'OUI': 1, 'NON': 0}
df['PrioritySchool'] = encode_categories(df['PrioritySchool'], yes_no_scores)
df['PriorityDistrict'] = encode_categories(df['PriorityDistrict'], yes_no_scores)

df.head()


Unnamed: 0,TumoID,ApplicationDate,RegistrationDate,AgeAtregistration,Sex,PostalCode,PrioritySchool,AccountClosingDate,TumoStatus,FamilyIncome,PriorityDistrict,TerminationType,TerminationReason
0,220627000000.0,2022-06-04,2022-06-27,12.0,Homme,75011.0,0,NaT,Actif,,0.0,,
1,200919000000.0,2020-06-20,2020-09-19,17.0,Femme,92000.0,0,2022-09-15,Alumni,,0.0,,
2,180726000000.0,2018-09-23,2018-09-23,16.0,Homme,92800.0,0,2020-07-21,Alumni,,0.0,,
3,230311000000.0,2022-10-05,2023-03-11,15.0,Femme,75010.0,1,NaT,Actif,2.0,0.0,,
4,231108000000.0,2023-06-13,2023-11-08,12.0,Homme,75018.0,1,NaT,Actif,2.0,0.0,,


In [17]:
# Estimating Missing Family Incomes:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Impute missing 'FamilyIncome' values with a decision tree trained on the
# rows where the income is known, then derive the total socioeconomic score.

# Prepare the features and target
feature_columns = ['PrioritySchool', 'PriorityDistrict']
income_missing = df['FamilyIncome'].isnull()
data_with_income = df[~income_missing]
data_without_income = df[income_missing]

features_with_income = data_with_income[feature_columns]
target_with_income = data_with_income['FamilyIncome']

# Split the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(
    features_with_income, target_with_income, test_size=0.2, random_state=42
)

# Train the decision tree. The seed is fixed because the tree permutes the
# features randomly at each split, so an unseeded fit may break ties
# differently from one run to the next.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(features_train, target_train)

# Use the held-out split (previously created but never used) to report how
# reliable the imputation model is before its predictions are written back.
holdout_accuracy = tree.score(features_test, target_test)
print(f"Hold-out accuracy of the income model: {holdout_accuracy:.3f}")

# Fill the missing 'FamilyIncome' entries with the model's predictions.
# The guard keeps the cell re-runnable: once every income is filled (or if
# none was missing) there is nothing to predict, and predict() raises on an
# empty frame.
if not data_without_income.empty:
    features_without_income = data_without_income[feature_columns]
    predictions = tree.predict(features_without_income)
    df.loc[data_without_income.index, 'FamilyIncome'] = predictions

# Add Column for total Socioeconomic Score
df['SocioeconomicScore'] = df['PrioritySchool'] + df['PriorityDistrict'] + df['FamilyIncome']

# Save the merged DataFrame to a new Excel file
df.to_excel('Socioeconomic Aggregation.xlsx', index=False)

# Last expression of the cell so the preview is actually displayed
df.head()
