# Task 3



## Data Loading & Initial Exploration

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from there below.
from google.colab import drive
drive.mount('/content/drive')

### Import Libraries & Load Data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the raw CSV from the mounted Drive folder into `data`.
data = pd.read_csv('/content/drive/MyDrive/General_Education_School.csv')

In [None]:
# Work on an independent copy of `data` that skips its first row (position 0);
# `data` itself is left untouched.
# NOTE(review): presumably the first row is a non-data/description row — confirm against the CSV.
df = data.iloc[1:].copy()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Print column names, non-null counts and dtypes of the working frame.
df.info()

In [None]:
# Summary statistics for the working frame's columns.
df.describe()

### Check Dataset Structure


In [None]:
# Report the number of distinct DBN identifiers and distinct school names
# separately (each under its own label so the two counts can be compared),
# followed by how many rows fall into each school type.
print("Number of schools (unique DBN):", df['DBN'].nunique())
print("Number of unique school names:", df['School Name'].nunique())
print("\nSchool Type Distribution:")
print(df['School Type'].value_counts())

## Data Cleaning & Preprocessing

In [None]:
def convert_percent(x):
    """Convert a raw response-rate cell to a float, or None when unusable.

    A string containing a percent sign is read as a percentage and scaled to a
    fraction ("85%" -> 0.85). Any other value is converted with ``float`` as-is
    ("0.5" -> 0.5, 85 -> 85.0). Leading/trailing whitespace around the value or
    the percent sign is ignored.

    Parameters
    ----------
    x : object
        Raw cell value: a string, a number, or a missing marker.

    Returns
    -------
    float or None
        The parsed number; None for None/NaN, the literal 'NA', and anything
        that cannot be parsed as a number.
    """
    if not isinstance(x, str):
        try:
            if pd.isna(x):
                return None
            return float(x)
        except (TypeError, ValueError):
            # Non-scalar or non-numeric objects (lists, dicts, ...) are treated
            # as unparseable rather than crashing the whole column conversion.
            return None

    text = x.strip()
    if text == 'NA':
        return None
    try:
        if '%' in text:
            # Strip the sign first, then any space that separated it from the digits.
            return float(text.strip('%').strip()) / 100
        return float(text)
    except ValueError:
        return None

# Convert the three response-rate columns from their raw values to floats
# (unparseable or missing cells become missing values).
response_cols = ['Parent Response Rate', 'Teacher Response Rate', 'Student Response Rate']
for col in response_cols:
    df[col] = df[col].apply(convert_percent)

# Early Childhood / Elementary schools are assumed to have no student survey,
# so blank out their Student Response Rate. A real missing value (np.nan) is
# used rather than the string 'NaN': a string would turn the column into
# object dtype and would not be counted by isna()/isnull() in the
# missing-value checks that follow.
early_childhood_mask = df['School Type'].str.contains('Early Childhood School|Elementary School', case=False, na=False)
df.loc[early_childhood_mask, 'Student Response Rate'] = np.nan


### Check for Duplicates & Missing Data

In [None]:
# Keep only the first occurrence of each fully identical row, then report the
# number of missing values left in every column.
df = df[~df.duplicated(keep='first')]
print("\nMissing Values:")
print(df.isna().sum())

In [None]:
# Show which schools (identifier and type) have no student response rate.
student_rate_missing = df['Student Response Rate'].isnull()
missing_rows = df[student_rate_missing]
print("\n Student Response Rate:")
print(missing_rows.loc[:, ['DBN', 'School Type']])

In [None]:
# Fill missing student response rates of rows typed exactly 'Middle School'
# with the mean of that same group, then re-check the missing-value counts.
middle_mask = df['School Type'] == 'Middle School'
middle_rates = df.loc[middle_mask, 'Student Response Rate']
mean_response = middle_rates.mean()
df.loc[middle_mask, 'Student Response Rate'] = middle_rates.fillna(mean_response)
print(df.isnull().sum())

## Exploratory Data Analysis (EDA)

In [None]:
# Columns that must be numeric for the statistics and plots below.
numeric_cols = [
    'Parent Response Rate',
    'Teacher Response Rate',
    'Student Response Rate',
    'Total Safety and Respect Score',
    'Total Communication Score',
    'Total Engagement Score',
    'Total Academic Expectations Score',
]
# Coerce every listed column to a numeric dtype; values that cannot be parsed become NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# Mean of each response-rate column.
response_columns = ['Parent Response Rate', 'Teacher Response Rate', 'Student Response Rate']
print("\nAverage Response Rates:")
# mean() already ignores NaN by default (skipna=True), so the Student Response
# Rate column needs no special-casing: one call covers all three columns.
avg_rates = df[response_columns].mean()
print(avg_rates)


In [None]:
# Use seaborn's whitegrid theme for the plots.
sns.set(style="whitegrid")
variables = ['Total Safety and Respect Score', 'Total Communication Score', 'Total Engagement Score']

# One row of three boxplots (one per score), each with its mean marked by a
# dashed red line and shown in the legend.
fig, axes = plt.subplots(1, 3, figsize=(10, 6))
fig.suptitle('Distribution of School Performance Scores', fontsize=16, y=1.05)
for ax, var in zip(axes, variables):
    # Coerce to numeric so unparseable entries become NaN instead of raising.
    numeric_data = pd.to_numeric(df[var], errors='coerce')
    sns.boxplot(x=numeric_data, color='green', ax=ax)
    ax.set_title(var)
    # NaN-aware mean of the coerced values.
    mean_value = np.nanmean(numeric_data)
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    ax.legend()
fig.tight_layout()
plt.show()

## Correlation & Deep Dive Analysis

In [None]:
# Pairwise correlation between the three response rates, drawn as an
# annotated heatmap.
corr_matrix = df[['Parent Response Rate', 'Teacher Response Rate', 'Student Response Rate']].corr()

plt.figure(figsize=(6, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Between Response Rates")
plt.show()

In [None]:
# Pairwise correlation between the parent response rate and the four survey
# scores, drawn as an annotated heatmap.
corr_matrix = df[[
    'Parent Response Rate',
    'Total Safety and Respect Score',
    'Total Communication Score',
    'Total Engagement Score',
    'Total Academic Expectations Score',
]].corr()
plt.figure(figsize=(6, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Between Parent Response Rate and Survey Scores")
plt.show()

## Deep Dive Analysis: Grouped Comparisons

### Group by School Type

In [None]:
# Replace the separate communication and engagement scores with their average.
# Guarded so the cell can be re-run: once the combined column exists and the
# source columns are gone, running this again is a no-op instead of a KeyError.
source_cols = ['Total Communication Score', 'Total Engagement Score']
combined_col = 'Total Communication and Engagement Score'
if combined_col not in df.columns:
    # A row with a missing value in either source score yields a missing average.
    df[combined_col] = (df[source_cols[0]] + df[source_cols[1]]) / 2
df = df.drop(columns=source_cols, errors='ignore')

In [None]:
# Numeric columns used by the grouped summaries, now including the combined
# communication/engagement score.
numeric_cols = [
    'Parent Response Rate',
    'Teacher Response Rate',
    'Student Response Rate',
    'Total Safety and Respect Score',
    'Total Communication and Engagement Score',
    'Total Academic Expectations Score',
]
# Coerce every listed column to a numeric dtype; values that cannot be parsed become NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# Mean of every numeric column per school type, printed with two decimals.
by_school_type = df.groupby('School Type')
grouped_df = by_school_type[numeric_cols].mean()
print("\nResponse Rates by School Type:")
print(grouped_df.to_string(float_format="%.2f", index=True))

### Group by School Type & Borough

In [None]:
# Letter code -> borough name.
borough_mapping = {
    'M': 'Manhattan',
    'K': 'Brooklyn',
    'Q': 'Queens',
    'X': 'Bronx',
    'R': 'Staten Island',
}
# The character at index 2 of each DBN is looked up in the mapping; codes not
# present in the mapping give a missing Borough.
borough_code = df['DBN'].str.get(2)
df['Borough'] = borough_code.map(borough_mapping)
# Mean of each numeric column per (borough, school type), rounded to 2 decimals.
borough_type_groups = df.groupby(['Borough', 'School Type'])
analysis = borough_type_groups[numeric_cols].mean().round(2)

def display_pretty_table(data, title):
    """Display `data` as a styled table captioned with `title`.

    The table gets a 'Blues' background gradient, centered cells with a thin
    black border, and values formatted to two decimals. Relies on the
    notebook-provided `display` function; nothing is returned.
    """
    cell_css = {'text-align': 'center', 'border': '1px solid black'}
    styled = data.style.background_gradient(cmap='Blues')
    styled = styled.set_caption(title)
    styled = styled.set_properties(**cell_css)
    styled = styled.format(precision=2)
    display(styled)

# Response-rate table (also carries the safety and communication/engagement scores)
table_cols = [
    'Parent Response Rate',
    'Teacher Response Rate',
    'Student Response Rate',
    'Total Safety and Respect Score',
    'Total Communication and Engagement Score',
]
response_table = analysis.loc[:, table_cols]
display_pretty_table(response_table, 'analyst')