# Data Project Rubric:

# Data Analysis with Python (Pandas & Matplotlib)

## 1. Project Overview
   
         ● Objective: Which city has the highest number of accepted students?
   
         ● Data Source: Describe where the data comes from and how it might answer the question.
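              The data comes from the Kaggle dataset at https://www.kaggle.com/datasets/zeeshier/student-admission-records. Each record includes the student's city and admission status, so the accepted students can be counted per city.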

In [None]:
# Get started
import pandas as pd
import matplotlib.pyplot as plt



## 2. Data Collection and Loading  

In [None]:
# Load Data: read the raw student admission records from the CSV file with Pandas.
csv_path = 'student_admission_record_dirty.csv'
orig_student_info = pd.read_csv(csv_path)
print(orig_student_info)

In [None]:
# Swap every space in the column names for an underscore.
# This makes the column names easier to read.
orig_student_info.columns = [
    column_name.replace(' ', '_') for column_name in orig_student_info.columns
]
print(orig_student_info)

In [None]:
# Initial Check: report the column names, dtypes, and non-null counts, then
# display the first few rows of the dataset.
# info() prints its report directly, so it runs first; head() stays last so the
# notebook still renders it as the cell's output.
orig_student_info.info()
orig_student_info.head()

In [None]:
# Keep only the columns that matter for the objective and drop the rest.
# Name, age, and gender are not needed to answer it.
unneeded_columns = ['Name', 'Age', 'Gender']
new_student_info = orig_student_info.drop(columns=unneeded_columns)
new_student_info

## 3. Data Cleaning and Preparation  

In [None]:
# Handle missing values: start by finding out how many there are,
# counting the NaN entries in each column.
column_nan_count = new_student_info.isna().sum()
print("NaN count per column:", column_nan_count, sep="\n")

In [None]:
# 'Admission_Test_Score' and 'High_School_Percentage' are just 'FYI' columns
# that don't affect the calculations being done, so each NaN in them is
# replaced with that column's mean (average).
for score_column in ('Admission_Test_Score', 'High_School_Percentage'):
    column_mean = new_student_info[score_column].mean()
    new_student_info[score_column] = new_student_info[score_column].fillna(column_mean)

# Display floats with two decimal places from here on.
pd.options.display.float_format = '{:.2f}'.format
print('\nResult :\n', new_student_info)

In [None]:
# Fill missing 'City' values with 'Unknown' rather than discarding those rows.
new_student_info['City'] = new_student_info['City'].fillna('Unknown')

# Remove any record that has a missing 'Admission_Status', since there's no way
# to guess at those. dropna() is limited to that column so a NaN in any other
# column cannot silently remove additional rows.
# Original note: this leaves 147 records.
new_student_info = new_student_info.dropna(subset=['Admission_Status'])

# To show all columns and rows when printing, uncomment these display options:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Print the cleaned DataFrame
print(new_student_info)


In [None]:
# Feature Engineering: the number of 'Accepted' vs 'Rejected' students per city.

# Count the records for each (City, Admission_Status) pair, in long format.
city_admissions = (
    new_student_info
    .groupby(['City', 'Admission_Status'])
    .size()
    .reset_index(name='Count')
)

# Reshape to one row per city and one column per admission status.
# A city with no records for some status has no entry in the long table, which
# pivot() turns into NaN (and the column into floats). Fill those gaps with 0
# and cast back to int so the counts are whole numbers when printed.
city_admissions_pivot = (
    city_admissions
    .pivot(index='City', columns='Admission_Status', values='Count')
    .fillna(0)
    .astype(int)
)

# Render the table as text and insert a blank line after its first line so the
# column header stands apart from the rest of the table.
pivot_table_string = city_admissions_pivot.to_string()
modified_pivot_table_string = pivot_table_string.replace('\n', '\n\n', 1)

# Print the modified string
print(modified_pivot_table_string)





## 4. Exploratory Data Analysis (EDA)

In [None]:
# Descriptive Statistics: describe() summary of new_student_info.
summary_stats = new_student_info.describe()
summary_stats

In [None]:
# Data Visualizations: bar chart of the counts held in 'city_admissions_pivot',
# one group of bars per city.
ax = city_admissions_pivot.plot.bar(figsize=(10, 6))

ax.set_title('Student Admissions by City')
ax.set_xlabel('City')
ax.set_ylabel('Number of Students')
# Rotate the x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Admission Status')

# Adjust the layout to prevent labels from overlapping
ax.figure.tight_layout()
plt.show()