In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Step 1: Load the Dataset ---
# NOTE: This script assumes you are using the 500-entry file we generated.
# If your file is named differently, please change the filename below.
try:
    df = pd.read_excel('Bangladeshi_Student_Data.xlsx')
except FileNotFoundError:
    print("Error: Make sure 'Bangladeshi_Student_Data.xlsx' is in the same folder as the script.")
    exit()

# --- Step 2: Clean Data ---
# Drop rows with any missing values to prepare for calculations and plotting.
df_cleaned = df.dropna().copy() # Using .copy() to avoid SettingWithCopyWarning
# Convert Age to integer for cleaner plotting
df_cleaned['Age'] = df_cleaned['Age'].astype(int)


# --- Step 3: MODIFIED - Prepare Random Sample for Plotting ---

# Take a random sample of 30 students from the cleaned data for plotting
if len(df_cleaned) >= 30:
    plot_sample = df_cleaned.sample(n=30, random_state=42)
else:
    # Handle cases where there are fewer than 30 clean rows
    print(f"Warning: Fewer than 30 clean rows available. Using all {len(df_cleaned)} rows for plotting.")
    plot_sample = df_cleaned

# Sort the sample data for better visualization in each respective plot
marks_plot_data = plot_sample.sort_values(by='Exam Marks')
age_plot_data = plot_sample.sort_values(by='Age')


# --- Step 4: Perform Statistical Analysis (on the entire cleaned dataset) ---
print("--- NumPy Analysis ---")

# Extract 'Age' and 'Marks' from the cleaned data
ages_cleaned = df_cleaned['Age'].values
marks_cleaned = df_cleaned['Exam Marks'].values

# Calculate statistics
age_mean = np.mean(ages_cleaned)
age_median = np.median(ages_cleaned)
age_std = np.std(ages_cleaned)

marks_mean = np.mean(marks_cleaned)
marks_median = np.median(marks_cleaned)
marks_std = np.std(marks_cleaned)

print("\n--- Age Statistics (from cleaned data) ---")
print(f"Mean Age: {age_mean:.2f}")
print(f"Median Age: {age_median:.2f}")
print(f"Standard Deviation of Age: {age_std:.2f}")

print("\n--- Marks Statistics (from cleaned data) ---")
print(f"Mean Marks: {marks_mean:.2f}")
print(f"Median Marks: {marks_median:.2f}")
print(f"Standard Deviation of Marks: {marks_std:.2f}")

print("\n\n--- Pandas Analysis ---")
print("\nOriginal DataFrame with missing values (first 5 rows):")
print(df.head()) # Using .head() to avoid printing all 500 rows

print("\nAverage (Mean) values calculated by Pandas (ignores missing values):")
average_values = df[['Age', 'Exam Marks']].mean()
print(average_values)


# --- Step 5: Display Bar Charts ---
print("\n\n--- Generating Focused Bar Charts ---")

# --- MODIFIED: Bar Chart for 30 Random Student Ages ---
plt.figure(num='Ages of 30 Random Students', figsize=(15, 7))
plt.bar(age_plot_data['Name'], age_plot_data['Age'], color='skyblue')
plt.xlabel("Student Names")
plt.ylabel("Age")
plt.title("Ages of 30 Randomly Selected Students")
plt.xticks(rotation=45, ha='right')
# Add text labels on top of each age bar
for i, (name, value) in enumerate(zip(age_plot_data['Name'], age_plot_data['Age'])):
    plt.text(i, value + 0.1, str(value), ha='center', va='bottom')
plt.tight_layout()


# --- MODIFIED: Bar Chart for 30 Random Student Marks ---
plt.figure(num='Marks of 30 Random Students', figsize=(15, 7))
plt.bar(marks_plot_data['Name'], marks_plot_data['Exam Marks'], color='lightgreen')
plt.xlabel("Student Names")
plt.ylabel("Exam Marks")
plt.title("Exam Marks of 30 Randomly Selected Students")
plt.xticks(rotation=45, ha='right')
# Add text labels on top of each mark bar
for i, (name, value) in enumerate(zip(marks_plot_data['Name'], marks_plot_data['Exam Marks'])):
    plt.text(i, value + 1, str(int(value)), ha='center', va='bottom')
plt.tight_layout()


Error: Make sure 'Bangladeshi_Student_Data.xlsx' is in the same folder as the script.


NameError: name 'df' is not defined

In [None]:


# --- UNCHANGED: Summary Statistics Charts ---
# These plots remain the same as they summarize the entire dataset.
age_stats_labels = ['Mean', 'Median', 'Standard Deviation']
age_stats_values = [age_mean, age_median, age_std]
plt.figure(num='Age Statistics Chart', figsize=(8, 5))
plt.bar(age_stats_labels, age_stats_values, color=['#ff9999','#66b3ff','#99ff99'])
plt.title("Summary Statistics for All Student Ages")
for i, value in enumerate(age_stats_values):
    plt.text(i, value + 0.05, f"{value:.2f}", ha='center')
plt.tight_layout()

marks_stats_labels = ['Mean', 'Median', 'Standard Deviation']
marks_stats_values = [marks_mean, marks_median, marks_std]
plt.figure(num='Marks Statistics Chart', figsize=(8, 5))
plt.bar(marks_stats_labels, marks_stats_values, color=['#ffcc99','#c2c2f0','#ffb3e6'])
plt.title("Summary Statistics for All Student Marks")
for i, value in enumerate(marks_stats_values):
    plt.text(i, value + 0.5, f"{value:.2f}", ha='center')
plt.tight_layout()


# Show all figures
plt.show()

print("\nAnalysis complete. The first two charts now show a random sample of 30 students.")
