In [41]:
import pandas as pd

# Build a single table from all workbooks (outer join on 'TumoID') and turn the
# categorical answers into numeric codes.

# Workbooks to combine; the first entry seeds the merged frame.
files = ['Demographic data.xlsx', 'Priority Districts.xlsx']

df = pd.read_excel(files[0])

# Outer-join each remaining workbook so that a 'TumoID' found in only one
# of the files still ends up in the result.
for file in files[1:]:
    df_other = pd.read_excel(file)
    df = df.merge(df_other, how='outer', on='TumoID')

# A blank 'PrioritySchool' is treated as 'NON' before encoding.
df['PrioritySchool'] = df['PrioritySchool'].fillna('NON')

# Numeric codes per column. The four income brackets become 1-4 in ascending
# order; for the two priority flags note the direction: 'OUI' -> 0, 'NON' -> 1.
income_codes = {
    'inférieur à 800': 1,
    'de 801 à 1300': 2,
    'de 1301 à 2500': 3,
    'supérieur à 2500': 4,
}
yes_no_codes = {'OUI': 0, 'NON': 1}
column_codes = {
    'FamilyIncome': income_codes,
    'PrioritySchool': yes_no_codes,
    'PriorityDistrict': yes_no_codes,
}
for column, codes in column_codes.items():
    df[column] = df[column].replace(codes)

# Last expression of the cell: preview the first rows.
df.head()


Unnamed: 0,TumoID,DOB,Sex,Address,PostalCode,City,Etablissement scolaire fréquenté,PrioritySchool,FamilyIncome,PriorityDistrict
0,220627000000.0,2009-08-03,Homme,154 Av. Ledru Rollin,75011,Paris,87 rue Léon Frot,1,,1.0
1,200919000000.0,2003-07-24,Femme,1 Rue Joseph Terneau,92000,Nanterre,AGORA | LYCEE | 92800 PUTEAUX,1,,1.0
2,180726000000.0,2002-01-28,Homme,12 Rue Paul Bert,92800,Puteaux,Agora | Lycée | 92800 Puteaux,1,,1.0
3,230311000000.0,2008-02-02,Femme,33 Bd de la Chapelle,75010,Paris,AIMÉ CÉSAIRE | COLLEGE | 75018 PARIS,0,2.0,1.0
4,231108000000.0,2011-05-26,Homme,9 Rue d'Aubervilliers,75018,Paris,AIMÉ CÉSAIRE | COLLEGE | 75018 PARIS,0,2.0,1.0


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Impute missing 'FamilyIncome' values with a decision tree trained on the rows
# where the income is known, using the two encoded priority flags as predictors,
# then export the completed table to Excel.
#
# NOTE(review): only 'PrioritySchool' is blank-filled in the merge cell; if
# 'PriorityDistrict' still holds NaN, scikit-learn releases older than 1.3
# reject NaN in tree inputs - confirm the installed version or fill the column.

feature_columns = ['PrioritySchool', 'PriorityDistrict']

# Split the table by whether the target is known.
data_with_income = df.dropna(subset=['FamilyIncome'])
data_without_income = df[df['FamilyIncome'].isnull()]

features_with_income = data_with_income[feature_columns]
target_with_income = data_with_income['FamilyIncome']

# Hold out 20% of the labelled rows so the model can be sanity-checked.
features_train, features_test, target_train, target_test = train_test_split(
    features_with_income, target_with_income, test_size=0.2, random_state=42
)

# Seed the tree as well as the split: the tree breaks ties between equally good
# splits at random, so without a seed the imputed values can differ between runs.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(features_train, target_train)

# Report accuracy on the held-out rows before trusting the imputation.
holdout_accuracy = tree.score(features_test, target_test)
print(f"Hold-out accuracy of the income model: {holdout_accuracy:.3f}")

# Predict the missing incomes. After this cell has run once, `df` has no missing
# incomes left; predict() raises on an empty input, so skip it in that case to
# keep the cell safe to re-run.
features_without_income = data_without_income[feature_columns]
if features_without_income.empty:
    predictions = target_with_income.iloc[:0].to_numpy()
else:
    predictions = tree.predict(features_without_income)
    # Fill the missing 'FamilyIncome' entries with the predictions.
    df.loc[data_without_income.index, 'FamilyIncome'] = predictions

# Save the merged and imputed DataFrame to a new Excel file.
df.to_excel('demographic_agg.xlsx', index=False)


In [43]:
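# NOTE(review): this whole cell is a disabled draft of the imputation cell above.
# It predicts on `features_test` and then assigns those predictions to the rows
# with missing 'FamilyIncome', so its length assertion only passes when the two
# row counts happen to match.
# from sklearn.model_selection import train_test_split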
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.preprocessing import LabelEncoder

# # Prepare the features and target
# data = df[['PrioritySchool', 'PriorityDistrict', 'FamilyIncome']]

# # data.isnull().sum() 
# data = data.dropna()
# data.isnull().sum() 

# features = data[['PrioritySchool', 'PriorityDistrict']]
# target = data['FamilyIncome']

# # target.head()

# # Split the data into training and test sets
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# # Train the decision tree
# tree = DecisionTreeClassifier()
# tree.fit(features_train, target_train)

# # Now you can use the trained model to make predictions
# predictions = tree.predict(features_test)

# # Get the indices of the rows with missing 'FamilyIncome'
# missing_indices = df[df['FamilyIncome'].isnull()].index

# # Make sure the number of missing indices matches the number of predictions
# assert len(missing_indices) == len(predictions)

# # Fill the missing 'FamilyIncome' entries with the predictions
# df.loc[missing_indices, 'FamilyIncome'] = predictions




In [None]:
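# Drop columns (disabled). NOTE(review): `df_transformed` is not defined in the
# cells visible here, and 'AttendingSince' is repeated in the list below.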
# columns_to_drop = ['ApplicationDate','RegistrationDate', 'AccountClosingDate', 
#                    'TumoStatus', 'OrientationInformationMeetingDate', 'Status', 'AttendingSince',
#                    'AttendingSince', 'AttendingSince', 'AttendingSince',
#                    'AttendingSince', 'AttendingSince', 'AttendingSince']
# df_transformed = df_transformed.drop(columns=columns_to_drop)