STAT 4185: Advanced Data Manipulation & Analysis with Python

# Week 6: Charting with Matplotlib and Seaborn In-Class Activity

## Part 1: Setup (~5 minutes)
Import the necessary libraries and load the dataset.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the colleges CSV, using its first column as the DataFrame index.
csv_path = 'colleges_dataset_cleaned.csv'
df = pd.read_csv(csv_path, index_col=0)

# Preview the first few rows to confirm the data loaded.
df.head()

In [None]:
# Display the column labels of df; the plotting cells below select columns by these names.
df.columns

## Part 2: Guided Examples (~30 minutes)

### Scatter plots

In [None]:
# Scatter plots

# --- Worked example: acceptance rate vs. overall score ---
x = df['acceptance_rate']
y = df['overall_score']

# Find the line of best fit (degree-1 least-squares fit: y = a*x + b).
a, b = np.polyfit(x, y, 1)

# figsize is useful for resizing the graph if you need it.
plt.figure(figsize=(10, 10))

# Plot the data and the line of best fit.
plt.scatter(x, y)
plt.plot(x, a*x + b, color='r')

plt.title('Acceptance Rate vs. College Score')
plt.xlabel('Acceptance Rate')
plt.ylabel('College Score')
plt.show()
# There appears to be a negative correlation, indicating that "better" colleges may tend to have lower acceptance rates.


# Let's repeat this for all appropriate fields to put in a scatter plot against college score.
relCols = [
    'Est_full_price_22_23',
    'Est_price_with_avg_grant',
    'percent_of_student_who_get_grants',
    'graduation_rate',
    'early_career_earnings',
    'avg_price_for_low_income_students',
    'undergrad_enrollment',
    'percent_of_students_with_need_who_get_grants',
    'percent_of_need_met',
    'percent_of_students_who_get_merit_grants',
    'avg_merit_grant',
    'avg_time_to_a_degree_years',
    'median_student_debt',
    'percent_earning_more_than_a_high_school_grad',
    'median_sat',
    'median_act',
]

# Maps each column name to the human-readable label used in titles and axis labels.
# This dictionary is reused by the histogram cells further down.
colDict = {
    'college_names': 'College Name',
    'overall_score': 'College Score',
    'acceptance_rate': 'Acceptance Rate',
    'Est_full_price_22_23': 'Full Price of Attendance 2022-2023',
    'Est_price_with_avg_grant': 'Estimated Average Price After Grants',
    'percent_of_student_who_get_grants': 'Percent of Students with Grants',
    'graduation_rate': 'Graduation Rate',
    'early_career_earnings': 'Average Early Career Earnings',
    'avg_price_for_low_income_students': 'Average Price for Low-Income Students',
    'sat_act_required': 'ACT/SAT Required',
    'undergrad_enrollment': 'Undergraduate Enrollment Size',
    'percent_of_students_with_need_who_get_grants': 'Percent of Students with Need that Receive Grants',
    'percent_of_need_met': 'Percent of Need Met',
    'percent_of_students_who_get_merit_grants': 'Percent of Students who Get Merit Grants',
    'avg_merit_grant': 'Average Merit Grant',
    'avg_time_to_a_degree_years': 'Average Time to Earn Degree (Years)',
    'median_student_debt': 'Median Student debt',
    'percent_earning_more_than_a_high_school_grad': 'Percentage of Students who Earn More than a Typical High School Student',
    'Town': 'Town',
    'State': 'State',
    'median_sat': 'Median SAT',
    'median_act': 'Median ACT',
}


def plot_score_scatter(col_name, score_col='overall_score'):
    """Scatter one column of ``df`` against the score column, with a best-fit line.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to put on the x-axis. Must be a key of ``colDict``.
    score_col : str, optional
        Column of ``df`` to put on the y-axis (default ``'overall_score'``).
        Must also be a key of ``colDict``.
    """
    # Fit only on rows where both values are present: np.polyfit does not
    # ignore missing values, so a single NaN would spoil the fit.
    pairs = df[[col_name, score_col]].dropna()
    xs = pairs[col_name]
    ys = pairs[score_col]
    slope, intercept = np.polyfit(xs, ys, 1)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(xs, ys)
    ax.plot(xs, slope * xs + intercept, color='r')
    ax.set_title(colDict[col_name] + ' vs. ' + colDict[score_col])
    ax.set_xlabel(colDict[col_name])
    ax.set_ylabel(colDict[score_col])
    plt.show()


for colName in relCols:
    plot_score_scatter(colName)

# Take some time to interpret these graphs. Clearly explain one example from each category: no/little correlation, strong positive correlation, strong negative correlation.

### Histograms

In [None]:
# Histograms

def plot_histogram(col_name, title=None, xlabel=None, bins=50):
    """Draw a frequency histogram of one column of ``df``.

    Parameters
    ----------
    col_name : str
        Column of ``df`` to plot.
    title : str, optional
        Plot title. Defaults to the column's ``colDict`` label + ' Histogram'.
    xlabel : str, optional
        X-axis label. Defaults to the column's ``colDict`` label.
    bins : int, optional
        Number of histogram bins (default 50). Change this when 50 bins is a
        poor fit for the column's range of values.
    """
    # Fall back to the raw column name if no display label is registered.
    label = colDict.get(col_name, col_name)

    fig, ax = plt.subplots()
    ax.hist(df[col_name], bins=bins)
    ax.set_title(title if title is not None else label + ' Histogram')
    ax.set_xlabel(xlabel if xlabel is not None else label)
    ax.set_ylabel('Frequency')
    plt.show()


plot_histogram('overall_score', title="College Scores Histogram", xlabel='College Score')
# We can use this to check for biases in the assignment of overall scores to colleges.
# Just by eyeballing the data, it appears to be a roughly normal distribution around 63
# It may be skewed slightly left, indicating that the algorithm tends to give higher scores.

plot_histogram('acceptance_rate', title="College Selectivity Histogram", xlabel='Acceptance Rate')
# This data is clearly skewed left, which indicates that most colleges are not selective.

plot_histogram(
    'percent_earning_more_than_a_high_school_grad',
    title="Earnings Benefit Histogram",
    xlabel='Percent of Graduates Earning More than a Typical High School Grad',
)
# This data is clearly skewed left, which indicates that most colleges tend to produce
# graduates that earn more than a high-school graduate would.
# If college students earned the same amount as high-school graduates, we would expect
# a normal distribution centered around 50%.

# Let's repeat this process for all columns for which a histogram makes sense.
relCols = [
    'Est_full_price_22_23',
    'Est_price_with_avg_grant',
    'percent_of_student_who_get_grants',
    'graduation_rate',
    'early_career_earnings',
    'avg_price_for_low_income_students',
    'undergrad_enrollment',
    'percent_of_students_with_need_who_get_grants',
    'percent_of_need_met',
    'percent_of_students_who_get_merit_grants',
    'avg_merit_grant',
    'avg_time_to_a_degree_years',
    'median_student_debt',
    'percent_earning_more_than_a_high_school_grad',
    'median_sat',
    'median_act',
]

for colName in relCols:
    plot_histogram(colName)

# Take some time to interpret a few of these graphs. Some of them are not perfect, since we are using 50 bins for every example. This is not ideal for every case.
# Try to make new graphs for the ones that don't make sense because of the number of bins we are using.
# (Pass a different bins= value to plot_histogram to do this.)

In [None]:
# We can repeat all of these with cumulative histograms.
# Each bar shows the count of colleges at or below that value, so the title and
# y-axis are labelled as cumulative to distinguish them from the plain histograms.
for colName in relCols:
    label = colDict[colName]
    plt.hist(df[colName], bins=50, cumulative=True)
    plt.title(label + ' Cumulative Histogram')
    plt.xlabel(label)
    plt.ylabel('Cumulative Frequency')
    plt.show()

# Try to see if there's anything that is easier to notice in a traditional vs. cumulative histogram. What kind of observations can you make from the graphs below?

### Pie Charts

In [None]:
# Pie charts

# Count the schools whose sat_act_required value contains 'No' / 'Yes'.
# Descriptive names are used so these integer counts do not overwrite the
# x / y Series defined in the scatter-plot cell.
requirement = df.sat_act_required
not_required_count = requirement.str.contains('No').sum()
required_count = requirement.str.contains('Yes').sum()

data = [not_required_count, required_count]
labels = ['No', 'Yes']

# Pull the second slice ('Yes') slightly out of the pie.
explode = [0, 0.1]

# Use the first two colors of Seaborn's 'pastel' palette.
colors = sns.color_palette('pastel')[0:2]

# Plot the data; autopct prints each slice's share as a whole-number percent.
plt.pie(data, labels=labels, explode=explode, colors=colors, autopct='%.0f%%')
plt.title('Percent of Schools that Require ACT/SAT')
plt.show()

### Bar Charts

In [None]:
# Bar charts

# Count the schools whose sat_act_required value contains 'No' / 'Yes'.
# Descriptive names are used so these integer counts do not overwrite the
# x / y Series defined in the scatter-plot cell.
requirement = df.sat_act_required
not_required_count = requirement.str.contains('No').sum()
required_count = requirement.str.contains('Yes').sum()

data = [not_required_count, required_count]
labels = ['No', 'Yes']

# Plot the data: one bar per answer, bar height = number of colleges.
plt.bar(labels, data)
plt.title('Number of Colleges that Require ACT/SAT')
plt.ylabel('Number of Colleges')
plt.show()

### Stack Plots

In [None]:
# Stacked plot application
# Work with the first ten rows of df.
# NOTE(review): the name "top10" assumes df is already ordered by rank -- confirm.
top10df = df.head(10)

names = top10df['college_names']
net_price = top10df['Est_price_with_avg_grant']
# The grant amount is the gap between the full price and the price after grants.
avg_grant = top10df['Est_full_price_22_23'] - net_price

series_labels = ['Average Cost After Grants', 'Average Grant Amount']
series_colors = ['orange', 'b']

# stackplot accepts labels= directly, so the legend needs no placeholder lines.
plt.stackplot(names, net_price, avg_grant, labels=series_labels, colors=series_colors)
plt.title('Stack Plot')
plt.ylabel('Estimated Full Price of Attendance')
plt.legend()
plt.xticks(rotation='vertical')
plt.show()

# A stacked bar plot looks better in my opinion.
# The grant bars start where the net-price bars end (bottom=net_price), so the
# total bar height is the full price.
plt.bar(names, net_price, color='orange', label='Average Cost After Grants')
plt.bar(names, avg_grant, bottom=net_price, color='b', label='Average Grant Amount')
plt.title('Stacked Bar Plot')
plt.ylabel('Estimated Full Price of Attendance')
plt.legend()
plt.xticks(rotation='vertical')
plt.show()

## Part 3: Explore on Your Own (~15 minutes)

Spend the rest of class manipulating the data and trying to find anything interesting.

Alternatively, we can take suggestions from the class and try to implement them together.

In [None]:
# A simple example.... college selectivity by state.
# Select the acceptance-rate column *before* averaging: calling .mean() on the
# whole grouped frame also tries to average the text columns (e.g.
# sat_act_required), which raises a TypeError in pandas >= 2.0.
states = (
    df.groupby('State')['acceptance_rate']
    .mean()
    .sort_values(ascending=True)
)

fig, ax = plt.subplots(figsize=(15, 5))
states.plot(kind='bar', ax=ax)
ax.set_title('Average College Acceptance Rate by State')
ax.set_ylabel('Acceptance Rates (Percent)')
plt.show()