In [1]:
# Add libraries
import pandas as pd
import os

In [2]:
# Files to load
school_data_to_load = os.path.join("Resources/schools_complete.csv")
student_data_to_load = os.path.join("Resources/students_complete.csv")

In [3]:
# Read the school and student data file and store it in a Pandas DataFrame.
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

In [4]:
school_data_df.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [5]:
student_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [6]:
# Determine shape of the school dataset
school_data_df.shape

(15, 5)

In [7]:
# Determine shape of the student dataset
student_data_df.shape

(39170, 7)

In [8]:
# Determine if there are any missing values in the school data.
school_data_df.count()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [9]:
# Determine if there are any missing values in the school data.
student_data_df.count()


Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

#### Data Cleaning

In [10]:
# Determine if there are any missing values in the school data.
school_data_df.isnull()

Unnamed: 0,School ID,school_name,type,size,budget
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [11]:
# Determine if there are any missing values in the student data.
student_data_df.isnull().sum()

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [12]:
# Determine data types for the school DataFrame.
school_data_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [13]:
# Determine data types for the student DataFrame.

student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [14]:
# Determine data types for the student DataFrame.
student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In the first glance we see that the student_data_df has some anomalies in the student_name column - prefixes and suffixes. 

In [15]:
# Put the student names in a list.
student_names = student_data_df["student_name"].tolist()

In [24]:
# Split the student name and determine the length of the split name.
cnt = 0
for name in student_names:
    print(name.split(), len(name.split()))
    cnt = cnt+1
    if cnt>10:
        break
# Remove cnt if you want to see the entire list     

['Paul', 'Bradley'] 2
['Victor', 'Smith'] 2
['Kevin', 'Rodriguez'] 2
['Dr.', 'Richard', 'Scott'] 3
['Bonnie', 'Ray'] 2
['Bryan', 'Miranda'] 2
['Sheena', 'Carter'] 2
['Nicole', 'Baker'] 2
['Michael', 'Roth'] 2
['Matthew', 'Greene'] 2
['Andrew', 'Alexander'] 2


We see that there are several prefixes like Dr., Miss and suffixes like MD, DDS etc to some of the names. We need to isolate them to remove them from the student_name list. 

In [17]:
# Create a new list and use it for the for loop to iterate through the list.
students_to_fix = []
# Use an if statement to check the length of the name.
# If the name is greater than or equal to "3", add the name to the list.
for name in student_names:
    if len(name.split())>=3:
        students_to_fix.append(name)
        
# Get the length of the students whose names are greater than or equal to "3".
print(len(students_to_fix))

1531


In [18]:
# Add the prefixes less than or equal to 4 to a new list.
prefixes=[]
for name in students_to_fix:
    if len(name.split()[0])<=4:
        prefixes.append(name.split()[0])
 #prints unique items in the list
print(set(prefixes))

{'Mary', 'Miss', 'Lisa', 'Omar', 'Lynn', 'Troy', 'Adam', 'Carl', 'Luke', 'Todd', 'Dr.', 'Mark', 'Joe', 'Tina', 'Mr.', 'Gina', 'Emma', 'Tara', 'Gail', 'Tony', 'Greg', 'Seth', 'Mrs.', 'Ms.', 'Kim', 'Paul', 'Sara', 'Dale', 'Dawn', 'Gary', 'Kyle', 'Cody', 'Ian', 'Anna', 'Ryan', 'Kara', 'Jon', 'Marc', 'Jill', 'Lori', 'Chad', 'Dana', 'Ruth', 'Kari', 'Eric', 'Amy', 'Jodi', 'John', 'Mike', 'Erin', 'Erik', 'Cory', 'Jose', 'Leah', 'Noah', 'Anne', 'Toni', 'Sean', 'Judy'}


In [19]:
# Add the suffixes less than or equal to 3 to a new list.
suffixes=[]
for name in students_to_fix:
    if len(name.split()[-1])<=3:
        suffixes.append(name.split()[-1])
 #prints unique items in the list
print(set(suffixes))

{'IV', 'Kim', 'Roy', 'Lee', 'Cox', 'Li', 'III', 'DVM', 'MD', 'PhD', 'Jr.', 'DDS', 'Day', 'II', 'V'}


We can now create a list of all the prefixes and suffixes: "Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD" to remove from the student_name list in student_data_df. We will not replace Jr., II, III, IV, and V as they are family titles. 

In [20]:
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [21]:
# Iterate through the "prefixes_suffixes" list and replace them with an empty space, "" when it appears in the student's name.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, " ")

  student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, " ")


In [22]:
student_data_df.head(20)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
7,7,Nicole Baker,F,12th,Huang High School,96,69
8,8,Michael Roth,M,10th,Huang High School,95,87
9,9,Matthew Greene,M,10th,Huang High School,96,84


In [None]:
# Put the cleaned students' names in another list.
student_names = student_data_df['student_name'].tolist()
student_names[0:20]
# Remove [0:20] to see the entire list

Check to see how many names have a split() length of 3 or more.

In [None]:
# Create a new list and use it for the for loop to iterate through the list.
students_fixed = []

# Use an if statement to check the length of the name.

# If the name is greater than or equal to 3, add the name to the list.
for name in student_names:
    if len(name.split())>=3:
        students_fixed.append(name)

In [None]:
# Get the length of the students' names that are greater than or equal to 3.
print(len(students_fixed))

### Cleaned DataSet

In [None]:
# Combine the data into a single dataset.
school_data_complete_df = pd.merge(student_data_df, school_data_df, on = 'school_name')

In [None]:
school_data_complete_df.head()

### Analysis


In [None]:
# Get the total number of students.
student_count = school_data_complete_df['Student ID'].count()
student_count

In [None]:
# Calculate the total number of schools.
school_count = len(school_data_complete_df['school_name'].unique())
school_count

In [None]:
# Calculate the total budget.
total_budget = school_data_df['budget'].sum()
total_budget

In [None]:
# Calculate the average reading score.
average_reading_score = school_data_complete_df['reading_score'].mean()
average_reading_score

In [None]:
# Calculate the average math score.
average_math_score = school_data_complete_df['math_score'].mean()
average_math_score

In [None]:
# Get all the students who are passing math in a new DataFrame.
passing_math = school_data_complete_df[school_data_complete_df['math_score']>=70]
passing_math_count = passing_math['student_name'].count()
passing_math_count

To get the overall passing percentage, we need to get all the students who passed both math and reading and divide by the total number of students.

In [None]:
# Get all the students who are passing reading in a new DataFrame.
passing_reading = school_data_complete_df[school_data_complete_df['reading_score']>=70]
passing_reading_count = passing_reading['student_name'].count()
passing_reading_count

In [None]:
# Calculate the percent that passed math.
pass_math_percentage = passing_math_count/float(student_count)*100
pass_math_percentage

In [None]:
# Calculate the percent that passed reading.
pass_reading_percentage = passing_reading_count/float(student_count)*100
pass_reading_percentage

In [None]:
passing_math_reading = school_data_complete_df[(school_data_complete_df['math_score']>=70) & (school_data_complete_df['reading_score']>=70)]


In [None]:
passing_math_reading_count = passing_math_reading.student_name.count()
passing_math_reading_count

In [None]:
overall_passing_percentage = passing_math_reading_count/float(student_count)*100
overall_passing_percentage

Adding analysis - summary statistics for the following into a new dataframe

Total number of schools in the column "Total Schools"\
Total number of students in the column "Total Students"\
Total budget in the column "Total Budget" \
Average reading score in the column "Average Reading Score" \
Average math score in the column "Average Math Score" \
Percentage of students passing reading in the column "% Passing Reading" \
Percentage of students passing math in the column "% Passing Math" \
Overall passing percentage in the column "% Overall Passing"


In [None]:
# Adding a list of values with keys to create a new DataFrame.
district_summary_df = pd.DataFrame([{"Total Schools": school_count,
          "Total Students": student_count,
          "Total Budget": total_budget,
          "Average Math Score": average_math_score,
          "Average Reading Score": average_reading_score,
          "% Passing Math": pass_math_percentage,
         "% Passing Reading": pass_reading_percentage,
        "% Overall Passing": overall_passing_percentage}])
district_summary_df
                                

### Formatting DataFrame
To clean up the district_summary_df DataFrame, we will format dollar amounts to two decimal places, and format the grade averages to one decimal place and percentages to the nearest whole number percent.

In [None]:
# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df['Total Students'] = district_summary_df['Total Students'].map("{:,}".format)
district_summary_df['Total Students'] 

In [None]:
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a "$".
district_summary_df['Total Budget'] = district_summary_df['Total Budget'].map("${:,.2f}".format)
district_summary_df['Total Budget']

In [None]:
#The "Average Reading Score" column will be formatted to one decimal place.
district_summary_df['Average Reading Score'] = district_summary_df['Average Reading Score'].map("{:.1f}".format)
district_summary_df['Average Reading Score']

The "Average Math Score" column will be formatted to one decimal place.
The "% Passing Reading" column will be formatted to the nearest whole number percentage.
The "% Passing Math" column will be formatted to the nearest whole number percentage
The "% Overall Passing" column will be formatted to the nearest whole number percentage.

In [None]:
#The "Average Math Score" column will be formatted to one decimal place.
district_summary_df['Average Math Score'] = district_summary_df['Average Math Score'].map("{:.1f}".format)
district_summary_df['Average Math Score']

In [None]:
district_summary_df["% Passing Reading"] = pd.to_numeric(district_summary_df["% Passing Reading"])

# The "% Passing Reading" column will be formatted to the nearest whole number percentage. 
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}%".format)
district_summary_df['% Passing Reading']

In [None]:
district_summary_df["% Passing Math"] = pd.to_numeric(district_summary_df["% Passing Math"])

# The "% Passing Math" column will be formatted to the nearest whole number percentage
district_summary_df['% Passing Math'] = district_summary_df['% Passing Math'].map("{:.0f}%".format)
district_summary_df['% Passing Math']

In [None]:
district_summary_df['% Overall Passing'] = pd.to_numeric(district_summary_df['% Overall Passing'])

# The "% Overall Passing" column will be formatted to the nearest whole number percentage.
district_summary_df['% Overall Passing'] = district_summary_df['% Overall Passing'].map("{:.0f}%".format)
district_summary_df['% Overall Passing']

In [None]:
district_summary_df