In [3]:
# Report which openpyxl release is installed in this kernel.
import openpyxl

installed_version = openpyxl.__version__
print(installed_version)

3.1.3


In [7]:
# !pip install --upgrade bottleneck
import pandas as pd

# Opt in to the future pandas behavior in which `replace` does not silently
# downcast object columns; the explicit `infer_objects` calls below perform
# the dtype conversion instead.
pd.set_option('future.no_silent_downcasting', True)

# Outer join of dataframes on TumoID.
# List of file locations; the first entry is the base table of the merge.
files = ['Demographic data.xlsx', 'Priority Districts.xlsx', 'Closed Students.xlsx']

# Read the first file
df = pd.read_excel(files[0])

# Loop over the rest of the files and merge. The outer join keeps every
# TumoID that appears in any workbook, so columns coming from a workbook
# that lacks a given TumoID are NaN for that row.
for file in files[1:]:
    df_other = pd.read_excel(file)
    df = pd.merge(df, df_other, on='TumoID', how='outer')

# Encode the 'FamilyIncome' brackets as ordinal codes:
# 3 = lowest bracket ('inférieur à 800') ... 0 = highest ('supérieur à 2500').
df['FamilyIncome'] = df['FamilyIncome'].replace({'inférieur à 800': 3, 'de 801 à 1300': 2, 'de 1301 à 2500': 1, 'supérieur à 2500': 0})
df['FamilyIncome'] = df['FamilyIncome'].infer_objects(copy=False)

# Fill in missing Priority School data: rows without a value are treated as 'NON'.
df['PrioritySchool'] = df['PrioritySchool'].fillna('NON')

# NOTE(review): only 'PrioritySchool' is filled here. 'FamilyIncome' and
# 'PriorityDistrict' may still contain NaN after the outer join -- confirm with
# `df.isnull().sum()` in its own cell (a bare expression in the middle of a
# cell is evaluated but never displayed).

# Encode 'PrioritySchool' as 1 ('OUI') / 0 ('NON')
df['PrioritySchool'] = df['PrioritySchool'].replace({'OUI': 1, 'NON': 0})
df['PrioritySchool'] = df['PrioritySchool'].infer_objects(copy=False)

# Encode 'PriorityDistrict' as 1 ('OUI') / 0 ('NON')
df['PriorityDistrict'] = df['PriorityDistrict'].replace({'OUI': 1, 'NON': 0})
df['PriorityDistrict'] = df['PriorityDistrict'].infer_objects(copy=False)

# Save the merged DataFrame to a new Excel file.
# (The original line ended with a stray `?`, which is a syntax error and
# prevented the file from being written.)
df.to_excel('Sociodemographic.xlsx', index=False)

In [11]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# A student is counted as disadvantaged when at least one criterion holds:
# priority-school flag set, priority-district flag set, or income code 3
# (the lowest income bracket in the encoding applied earlier).
in_priority_school = df['PrioritySchool'] == 1
in_priority_district = df['PriorityDistrict'] == 1
in_lowest_income_bracket = df['FamilyIncome'] == 3
filtered_df = df.loc[in_priority_school | in_priority_district | in_lowest_income_bracket]

# Tally the matching rows and report the total
num_students = len(filtered_df)
print(f"Number of students from disadvantaged backgrounds: {num_students}")

# Visualize the data
# sns.countplot(data=filtered_df, x='FamilyIncome')
# plt.title('Number of Students from Disadvantaged Backgrounds')
# plt.show()


Number of students from disadvantaged backgrounds: 590


In [None]:
# Estimating missing family incomes:
# fit a decision tree on the rows where 'FamilyIncome' is known, using the two
# priority flags as features, then fill the missing rows with its predictions.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

feature_columns = ['PrioritySchool', 'PriorityDistrict']

# Prepare the features and target: rows with a known income are used to fit
# the model, rows with a missing income are the ones to be filled.
data_with_income = df.dropna(subset=['FamilyIncome'])
data_without_income = df[df['FamilyIncome'].isnull()]

features_with_income = data_with_income[feature_columns]
target_with_income = data_with_income['FamilyIncome']

# Split the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(features_with_income, target_with_income, test_size=0.2, random_state=42)

# Train the decision tree. The seed makes the fit reproducible: scikit-learn
# permutes features at each split, so ties between equally good splits are
# otherwise broken at random.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(features_train, target_train)

# Report accuracy on the held-out split so the quality of the imputation is visible.
print(f"Held-out accuracy of the income classifier: {tree.score(features_test, target_test):.3f}")

# Use the trained model to predict the missing 'FamilyIncome' values.
features_without_income = data_without_income[feature_columns]
if features_without_income.empty:
    # Nothing to impute (e.g. the cell is being re-run after a previous fill).
    # `predict` raises on a zero-row input, so skip it and keep `predictions`
    # defined as an empty array with the same dtype as the class labels.
    predictions = tree.classes_[:0]
else:
    predictions = tree.predict(features_without_income)
    # Fill the missing 'FamilyIncome' entries with the predictions
    df.loc[data_without_income.index, 'FamilyIncome'] = predictions

# Add column for total socioeconomic score: the sum of the two priority flags
# and the income code.
# NOTE(review): the result is NaN for any row where one of the three inputs is
# still NaN -- confirm 'PriorityDistrict' has no missing values.
df['SocioeconomicScore'] = df['PrioritySchool'] + df['PriorityDistrict'] + df['FamilyIncome']

# Save the merged DataFrame to a new Excel file
df.to_excel('Socioeconomic Aggregation.xlsx', index=False)
