In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline 
#to visualize the plots directly in the notebook

In [None]:
# Import the datasets using pandas.
# The paths are relative: each CSV is read from the "../input" folder.
data_degree = pd.read_csv("../input/degrees-that-pay-back.csv")
salaries_by_college = pd.read_csv("../input/salaries-by-college-type.csv")
salaries_by_region = pd.read_csv("../input/salaries-by-region.csv")

***Degree data***

Let's first analyze the degree data. The dataset is sorted by undergraduate major, and each major is associated with several pieces of information about how the salary evolves over the working career of the graduates.

We will first take a quick look at the dataframe to verify the kind of data we are dealing with and to check for missing information and errors/outliers. Pandas offers many easy options to complete all these tasks quickly.

In [None]:
data_degree.head()

In [None]:
data_degree.tail()

In [None]:
# Check for missing data and data type 
data_degree.info()

There is no missing data, and the dataset is fairly small: only 50 rows and 8 columns. We can notice that many columns are labeled as "object" even though they contain numbers; this happens because of the dollar sign ($) before the numeric value and the commas used as thousands separators. Let's write a quick function to deal with this problem.

In [None]:
# Salary columns that are passed to convert_column to be turned into numeric values.
columns_to_modify = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]

In [None]:
def convert_column(data, columns):
    """Strip currency formatting from the given columns and make them numeric.

    Every '$' and ',' is removed from the values of each listed column and the
    result is converted with ``pd.to_numeric`` (which picks the numeric dtype).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to convert. It is NOT modified.
    columns : iterable of str
        Names of the columns to clean and convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which the listed columns are numeric.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.
    ValueError
        Propagated from ``pd.to_numeric`` when a cleaned value is not a number.
    """
    # Work on a copy so the caller's frame keeps its raw values and the cell
    # calling this function stays safe to re-run.
    converted = data.copy()
    for column in columns:
        # Raw string: '\$' inside a normal literal is an invalid escape sequence.
        cleaned = converted[column].replace(r'[$,]', '', regex=True)
        converted[column] = pd.to_numeric(cleaned)
    return converted

In [None]:
data_degree = convert_column(data_degree, columns_to_modify)

In [None]:
# New column: mid-career median salary minus starting median salary, row by row.
data_degree['Salary difference Mid Start'] = data_degree['Mid-Career Median Salary'].sub(
    data_degree['Starting Median Salary']
)

In [None]:
data_degree.head()

The data is now all numeric, except for the first column that contains the different specializations. It is now possible to plot the data and gain some insights.

In [None]:
data_degree.describe()

In [None]:
# Histograms of the three columns that follow the first (non-numeric) column.

plt.figure(figsize = (12,6))
plt.suptitle('Histograms distributions of the data', size=15, weight='bold')
# Select the columns at positions 1-3 explicitly; enumerating from 1 gives the
# subplot slot directly.
for i, feature in enumerate(data_degree.columns[1:4], start=1):
    plt.subplot(1,3,i)
    data_degree[feature].plot(kind='hist', color='green', alpha=0.3)
    plt.xlabel(feature, weight='bold')
# tight_layout has to be called (with parentheses) after the axes exist to have
# any effect; the rect leaves room at the top for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])

In [None]:
plt.figure(figsize=(18,9))

# The four mid-career percentile bands, listed once so that both panels are
# guaranteed to use the same columns in the same order.
percentile_columns = ['Mid-Career 10th Percentile Salary', 'Mid-Career 25th Percentile Salary',
                      'Mid-Career 75th Percentile Salary', 'Mid-Career 90th Percentile Salary']

# Left panel: overlaid histograms, one per percentile band.
plt.subplot(1,2,1)
plt.title('Mid-Career salary distribution by percentile band', size=15, weight='bold')

for percentile_column in percentile_columns:
    data_degree[percentile_column].plot(kind='hist', alpha=0.5)

plt.legend()

# Right panel: one box per percentile band. The column order must match the
# labels below (the 25th-percentile box previously showed 10th-percentile data
# because that column was listed twice).
plt.subplot(1,2,2)
percentile_values = np.array(data_degree.loc[:, percentile_columns])
plt.boxplot(percentile_values, labels=['10th Percentile','25th Percentile','75th Percentile','90th Percentile'], showmeans= True)
plt.ylabel('US$', weight='bold')
plt.title('Mid-Career salary distribution by percentile band', size=15, weight='bold')

The boxplot clearly shows how strongly the salary that one can expect at mid-career depends on the percentile band. It is also possible to observe a certain degree of variability within the same percentile class; here the data is not divided by field of study.

In [None]:
data_degree.head()

In [None]:
def sort_and_reindex(data, column, ascending=False):
    """Return ``data`` ordered by ``column`` with a fresh 0..n-1 index.

    The input frame is left untouched. The previous index is not discarded:
    ``reset_index`` keeps it as an ordinary column of the returned frame.
    Sorting is descending unless ``ascending=True`` is passed.
    """
    ordered = data.sort_values(by=column, ascending=ascending)
    return ordered.reset_index()

In [None]:
# Sorting by starting median salary by major

sorted_start_salary = sort_and_reindex(data_degree, 'Starting Median Salary')
sorted_start_salary.head()

In [None]:
def scatterplot_with_percentile_line(data, column, index, show_percentile=False):
    """Draw a dot plot of ``data[column]`` on the current matplotlib axes.

    Each row of ``data`` becomes one red dot: the x position is the value in
    ``column`` and the y position is the row's index value. The y ticks are
    labelled with the values of the ``index`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot; its index supplies the y positions.
    column : str
        Name of the column plotted on the x axis (also used as the title).
    index : str
        Name of the column whose values label the y ticks.
    show_percentile : bool, default False
        When True, add dashed vertical lines at the 25th, 75th and 90th
        percentiles and at the mean of ``column``, together with a legend.

    Returns
    -------
    None
        The function only draws on the current axes.
    """
    x = data[column]
    y = data.index

    plt.scatter(x, y, color='red')
    if show_percentile:
        plt.axvline(x.quantile(q=0.25), ls='--', color='yellow', label='25th percentile')
        plt.axvline(x.mean(), ls='--', color='orange', label='average')
        plt.axvline(x.quantile(q=0.75), ls='--', color='red', label='75th percentile')
        plt.axvline(x.quantile(q=0.9), ls='--', color='purple', label='90th percentile')
        # Only the reference lines carry labels, so a legend is drawn only here;
        # calling plt.legend() without any labelled artist triggers a warning.
        plt.legend()

    plt.xlabel('US$', weight='bold')
    plt.yticks(y, data[index])
    plt.title('{}'.format(column), size=12, weight='bold')

In [None]:
plt.figure(figsize=(8,9.5))
scatterplot_with_percentile_line(sorted_start_salary, 'Starting Median Salary', index='Undergraduate Major', show_percentile=True)

In [None]:
# Sorting the data by the median mid-career salary
sorted_mid_career_salary = sort_and_reindex(data_degree, 'Mid-Career Median Salary')
sorted_mid_career_salary.head()

In [None]:
plt.figure(figsize=(8,9.5))
scatterplot_with_percentile_line(sorted_mid_career_salary, 'Mid-Career Median Salary', index='Undergraduate Major', show_percentile=True)

In [None]:
# Plot to compare the evolution of the salary from the starting to mid-career one
plt.figure(figsize=(24,12))

# Left panel: majors ordered by starting salary.
# Right panel: majors ordered by mid-career salary.
for panel_number, sorted_salary in enumerate([sorted_start_salary, sorted_mid_career_salary], start=1):
    plt.subplot(1,2,panel_number)
    y_positions = sorted_salary.index

    plt.scatter(sorted_salary['Starting Median Salary'], y_positions, color='orange', label='Starting Salary')
    plt.scatter(sorted_salary['Mid-Career Median Salary'], y_positions, color='red', label='Mid-Career Salary')

    plt.xlabel('US$', weight='bold')
    plt.yticks(y_positions, sorted_salary['Undergraduate Major'])
    plt.legend()

# tight_layout has to be called (with parentheses) to take effect.
plt.tight_layout()

In [None]:
sorted_difference_Mid_Start = sort_and_reindex(data_degree, 'Salary difference Mid Start')
sorted_percentage_change = sort_and_reindex(data_degree, 'Percent change from Starting to Mid-Career Salary')

In [None]:
plt.figure(figsize=(24,10))

plt.subplot(1,2,1)
scatterplot_with_percentile_line(sorted_difference_Mid_Start, 'Salary difference Mid Start', index='Undergraduate Major')

plt.subplot(1,2,2)
scatterplot_with_percentile_line(sorted_percentage_change, 'Percent change from Starting to Mid-Career Salary', index='Undergraduate Major')
plt.xlabel('%')

The percentage is misleading: looking solely at this information, all the engineering professions seem to have worse future prospects than the other specializations. However, as we have seen in the previous graphs, engineering specializations are the ones that grant higher median salaries at mid-career.

The plot on the left highlights the bright future that awaits future graduates in Math, Physics and Economics: while their initial salaries are below the 50th percentile of the distribution, through experience they can raise their salaries by 90-100% of the initial value.

In [None]:
# Plot the different percentile classes by major on the same graph
plt.figure(figsize=(8,12))

# (column, legend label, colour) for every series drawn on the shared axes.
percentile_bands = [
    ('Mid-Career 10th Percentile Salary', '10 Percentile', 'y'),
    ('Mid-Career 25th Percentile Salary', '25 Percentile', 'orange'),
    ('Mid-Career Median Salary', 'Median', 'red'),
    ('Mid-Career 75th Percentile Salary', '75 Percentile', 'purple'),
    ('Mid-Career 90th Percentile Salary', '90 Percentile', 'blue'),
]
# The frame is sorted by mid-career median salary, so the row index doubles as
# the vertical position of each major.
y = sorted_mid_career_salary.index

for band_column, band_label, band_color in percentile_bands:
    plt.scatter(sorted_mid_career_salary[band_column], y, label=band_label, color=band_color)

plt.yticks(y, sorted_mid_career_salary['Undergraduate Major'])
plt.xlabel('US$', weight='bold')
plt.title('Mid Career salary breakdown by major', weight='bold')
plt.legend()
plt.grid(alpha=0.5)
# tight_layout has to be called (with parentheses) to take effect.
plt.tight_layout()

In [None]:
sorted_10_percentile = sort_and_reindex(data_degree, 'Mid-Career 10th Percentile Salary')
sorted_25_percentile = sort_and_reindex(data_degree, 'Mid-Career 25th Percentile Salary')
sorted_75_percentile = sort_and_reindex(data_degree, 'Mid-Career 75th Percentile Salary')
sorted_90_percentile = sort_and_reindex(data_degree, 'Mid-Career 90th Percentile Salary')

In [None]:
plt.figure(figsize=(24,30))

# One panel per percentile band; each panel uses the frame sorted by that band.
percentile_panels = [
    (sorted_10_percentile, 'Mid-Career 10th Percentile Salary'),
    (sorted_25_percentile, 'Mid-Career 25th Percentile Salary'),
    (sorted_75_percentile, 'Mid-Career 75th Percentile Salary'),
    (sorted_90_percentile, 'Mid-Career 90th Percentile Salary'),
]

for panel_number, (sorted_frame, percentile_column) in enumerate(percentile_panels, start=1):
    plt.subplot(2,2,panel_number)
    scatterplot_with_percentile_line(sorted_frame, percentile_column,index='Undergraduate Major')
    plt.grid(alpha=.5)

**Analysis by college**

In [None]:
salaries_by_college = convert_column(salaries_by_college, columns_to_modify)
salaries_by_region = convert_column(salaries_by_region, columns_to_modify)

In [None]:
# Check how many school type the dataframe has
print('There are {} different school types in the salaries by college dataframe'.format(salaries_by_college['School Type'].nunique()))
print('School types: {}'.format(salaries_by_college['School Type'].unique()))

In [None]:
salaries_by_college.head()

In [None]:
college_columns = salaries_by_college.columns

# Columns at positions 2-7 are kept for every school type; the plots further
# down read the salary columns from these frames.
salary_columns = college_columns[2:8]
college_school_type = salaries_by_college['School Type']

Engineering = salaries_by_college.loc[college_school_type == 'Engineering', salary_columns]
Party = salaries_by_college.loc[college_school_type == 'Party', salary_columns]
Liberal_arts = salaries_by_college.loc[college_school_type == 'Liberal Arts', salary_columns]
State = salaries_by_college.loc[college_school_type == 'State', salary_columns]
Ivy_league = salaries_by_college.loc[college_school_type == 'Ivy League', salary_columns]

In [None]:
plt.figure(figsize=(14, 8))

# Frames and tick labels, in matching order, shared by both panels.
school_type_frames = [Engineering, Party, Liberal_arts, State, Ivy_league]
school_type_labels = ['Engineering', 'Party', 'Liberal Arts', 'State', 'Ivy League']

# (salary column, panel title) for the left and the right panel.
salary_panels = [
    ('Starting Median Salary', 'Starting Median Salary by school type'),
    ('Mid-Career Median Salary', 'Mid- Career Median Salary by school type'),
]

for panel_number, (salary_column, panel_title) in enumerate(salary_panels, start=1):
    plt.subplot(1,2,panel_number)
    plt.title(panel_title, weight='bold')
    plt.boxplot([frame[salary_column] for frame in school_type_frames],
                labels = school_type_labels,
                showmeans=True)
    # Same y range on both panels so the two distributions can be compared by eye.
    plt.ylim(30000,140000)
    plt.ylabel('US$', weight='bold')

# tight_layout has to be called (with parentheses) to take effect.
plt.tight_layout()

In [None]:
salaries_by_college.head()

In [None]:
def college_boxplot(column):
    """Draw one box per school type for ``column`` on the current axes.

    Reads the module-level frames ``Engineering``, ``Party``, ``Liberal_arts``,
    ``State`` and ``Ivy_league``. The axes title is set to ``column``, the mean
    of every group is marked, and the y axis is labelled in US$.
    """
    plt.title(column, weight='bold')

    school_frames = (Engineering, Party, Liberal_arts, State, Ivy_league)
    samples = [frame[column] for frame in school_frames]
    plt.boxplot(samples,
                labels = ['Engineering', 'Party', 'Liberal Arts', 'State', 'Ivy League'],
                showmeans=True)

    plt.ylabel('US$', weight='bold')

In [None]:
plt.figure(figsize=(14, 8))
plt.suptitle('Salary boxplot by school type and percentile class', size=12, weight='bold')

# One panel per mid-career percentile band.
for panel_number, percentile in enumerate(['10th', '25th', '75th', '90th'], start=1):
    plt.subplot(2,2,panel_number)
    college_boxplot('Mid-Career {} Percentile Salary'.format(percentile))
    # college_boxplot titles the axes with the full column name; replace it
    # with a shorter title.
    plt.title('Mid-Career {} percentile'.format(percentile), weight='bold')
    # Same y range on every panel so the bands can be compared by eye.
    plt.ylim(30000,300000)

# tight_layout has to be called (with parentheses) to take effect; the rect
# leaves room at the top for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])

In [None]:
# Let's check the top colleges for starting salary and look at the school type they belong to

college_start_sorted = sort_and_reindex(salaries_by_college, 'Starting Median Salary')
college_mid_sorted = sort_and_reindex(salaries_by_college, 'Mid-Career Median Salary')

Colleges are now sorted by starting salary. Engineering colleges occupy 8 of the first 10 positions in the ranking; only Princeton and Harvard are non-engineering universities that make it into the list.

Let's now look at the best and worst college for each school type and see how they fare.

In [None]:
for school_type in ['Engineering', 'Party', 'Liberal Arts', 'State', 'Ivy League']:
    # Rows of this school type, still in the order of college_start_sorted;
    # the index value + 1 is printed as the overall ranking position.
    of_type = college_start_sorted[college_start_sorted['School Type'] == school_type]

    # First row of the subset = best-placed college of this type.
    position_best = of_type.index[0]
    name_best = of_type['School Name'].iloc[0]
    starting_salary = of_type['Starting Median Salary'].iloc[0]
    print('Top {} college: {} \n\tOverall starting salary position: {}\n\tStarting salary: {} US$'.format(school_type,name_best, position_best+1, starting_salary))

    # Last row of the subset = worst-placed college of this type.
    position_worst = of_type.index[-1]
    name_worst = of_type['School Name'].iloc[-1]
    starting_salary = of_type['Starting Median Salary'].iloc[-1]
    print('Worst {} college: {} \n\tOverall starting salary position: {}\n\tStarting salary: {} US$\n\n'.format(school_type,name_worst, position_worst+1, starting_salary))


In [None]:
for school_type in ['Engineering', 'Party', 'Liberal Arts', 'State', 'Ivy League']:
    # Rows of this school type, still in the order of college_mid_sorted;
    # the index value + 1 is printed as the overall ranking position.
    of_type = college_mid_sorted[college_mid_sorted['School Type'] == school_type]

    # First row of the subset = best-placed college of this type.
    position_best = of_type.index[0]
    name_best = of_type['School Name'].iloc[0]
    mid_salary = of_type['Mid-Career Median Salary'].iloc[0]
    print('Top {} college: {} \n\tOverall mid-career salary position: {}\n\tMid-career salary: {} US$'.format(school_type,name_best, position_best+1, mid_salary))

    # Last row of the subset = worst-placed college of this type.
    position_worst = of_type.index[-1]
    name_worst = of_type['School Name'].iloc[-1]
    mid_salary = of_type['Mid-Career Median Salary'].iloc[-1]
    print('Worst {} college: {} \n\tOverall mid-career salary position: {}\n\tMid-career salary {} US$\n\n'.format(school_type,name_worst, position_worst+1, mid_salary))

In [None]:
# Adding the region information to the college dataframe

salaries_by_college.info()

In [None]:
salaries_by_region.info()

In [None]:
df = salaries_by_college

In [None]:
# Sorted copy of the college data. sort_values without inplace returns a new
# frame, so salaries_by_college itself keeps its row order and re-running this
# cell has no side effect on it.
df_school_type = salaries_by_college.sort_values(by='School Name')

In [None]:
# Keep only the first two columns (school name and region) for the merge, sorted by school name.
# A new frame is built instead of aliasing salaries_by_region and dropping
# columns in place: the in-place drop also removed the salary columns from
# salaries_by_region, which the region analysis further down still reads.
df_region = salaries_by_region.iloc[:, :2].sort_values(by='School Name')

In [None]:
df_school_region = df_school_type.merge(df_region, on='School Name')

In [None]:
df_school_region.Region.unique()

**Analysis by Region**

In [None]:
# Which region has the most profitable colleges?

# One frame per value of the 'Region' column.
region_of_school = salaries_by_region['Region']

North_east = salaries_by_region[region_of_school == 'Northeastern']
South = salaries_by_region[region_of_school == 'Southern']
West = salaries_by_region[region_of_school == 'Western']
Mid_west = salaries_by_region[region_of_school == 'Midwestern']
Cali = salaries_by_region[region_of_school == 'California']

In [None]:
plt.figure(figsize=(16,8))

# Frames and tick labels, in matching order, shared by both panels.
region_frames = [North_east, South, West, Mid_west, Cali]
region_labels = ['North-East','South','West','Mid-West','Cali']

# (salary column, panel title) for the left and the right panel.
region_panels = [
    ('Starting Median Salary', 'Median starting salary by region'),
    ('Mid-Career Median Salary', 'Median Mid-Career salary by region'),
]

for panel_number, (salary_column, panel_title) in enumerate(region_panels, start=1):
    plt.subplot(1,2,panel_number)
    plt.boxplot([frame[salary_column] for frame in region_frames],
                labels=region_labels, showmeans=True)
    plt.title(panel_title)
    plt.ylabel('US$', rotation='vertical')

In [None]:
import seaborn as sns

In [None]:
# Creating a bar-plot of the starting median salary of the North-East colleges.
# NOTE(review): the original call ended in an empty subscript `[]`, which is a
# SyntaxError; the intended selection is unknown, so the whole column is
# plotted here - confirm the intent.

sns.barplot(x=North_east['Starting Median Salary'])

In [None]:
pl