In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

In [None]:
# Import the three datasets using pandas.
# The paths are relative to the notebook's working directory (a sibling "input" folder).
data_degree = pd.read_csv("../input/degrees-that-pay-back.csv")  # salaries by major
salaries_by_college = pd.read_csv("../input/salaries-by-college-type.csv")  # salaries by school type
salaries_by_region = pd.read_csv("../input/salaries-by-region.csv")  # salaries by region

Three datasets are available:
* Salaries by major
* Salaries by school type
* Salaries by region

***Degree data***

Let's first analyze the degree data. The dataset contains information about undergraduate majors: for each major we have the evolution of the median salary over the working career. We also have the influence of having good grades on the mid-career salary.

We will first take a quick look at the dataframe to verify the kind of data we are dealing with and to check for missing information and errors/outliers.

In [None]:
# Preview the first rows to see which columns exist and how their values are formatted
data_degree.head()

In [None]:
# Column dtypes and non-null counts: used to spot missing values and mis-typed columns
data_degree.info()

From the info command we can see that many numeric columns are stored as object (strings): the "$" and "," characters need to be removed so that pandas can read the data as numeric.

We'll have to do the same for the other dataframes as well, so let's write a quick function.

In [None]:
def convert_column(data, columns):
    """Strip '$' and ',' from the given columns and convert them to numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to clean. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in, so the result can be
        reassigned or chained.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.
    ValueError
        If a cleaned value still cannot be parsed as a number.
    """
    for column in columns:
        # Raw string on purpose: '\$' in a plain literal is an invalid escape
        # sequence. One character class removes both symbols in a single pass.
        cleaned = data[column].replace({r'[\$,]': ''}, regex=True)
        data[column] = pd.to_numeric(cleaned)
    return data

# Salary columns that are stored as formatted strings and must be converted to numbers
columns_to_modify = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]

In [None]:
# Clean the salary columns of the degree data.
# convert_column writes the converted columns back into the frame it receives and returns that frame.
data_degree = convert_column(data_degree, columns_to_modify)

In [None]:
# Preview the frame again to check the salary columns after the conversion
data_degree.head()

**Let's start the analysis!**

I came up with a number of questions to ask of the data:
* Does the salary increase along the working career? 
* What's the influence of good grades over the salary?
* Which degree is best on the short-term? And on the mid-term?

Some histograms can help us answer the first question: let's see if there is a clear shift toward higher salaries in the distribution of mid-career salaries versus the starting ones.

In [None]:
# Overlaid histograms: starting vs. mid-career median salary
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Histograms distributions of the data', size=12, weight='bold')

# One translucent histogram per salary column, drawn on the same axes
for salary_column, shade in (('Starting Median Salary', 'green'),
                             ('Mid-Career Median Salary', 'red')):
    data_degree[salary_column].plot.hist(ax=ax, color=shade, alpha=0.3)

ax.set_xlabel('US$', weight='bold')
ax.legend()

The graph is clear: mid-career salaries are shifted toward higher values compared to starting salaries. We expected this, and the data now support our hypothesis.

We can also observe that the mid-career distribution has two main blocks separated from each other: some majors must be paying significantly more than others. We'll get to this a little later in the analysis; for now let's try to understand the influence of good grades on the final salaries.

In [None]:
# Mid-career salary spread across the 10th/25th/75th/90th percentile columns,
# shown as overlaid histograms (left) and as boxplots (right).
percentile_columns = ['Mid-Career 10th Percentile Salary', 'Mid-Career 25th Percentile Salary',
                      'Mid-Career 75th Percentile Salary', 'Mid-Career 90th Percentile Salary']
percentile_labels = ['10th Percentile', '25th Percentile', '75th Percentile', '90th Percentile']

plt.figure(figsize=(18,9))

plt.subplot(1,2,1)
plt.title('Mid-Career salary distribution by percentile band', size=15, weight='bold')

for percentile_column in percentile_columns:
    data_degree[percentile_column].plot(kind='hist', alpha=0.5)

plt.xlabel('US$', weight='bold')
plt.legend()

plt.subplot(1,2,2)
# Fix: the 10th-percentile column used to be selected twice, so the box labelled
# "25th Percentile" showed 10th-percentile data. The values are also kept in a
# named variable rather than `_`, which IPython uses for the last cell output.
percentile_values = np.array(data_degree.loc[:, percentile_columns])
plt.boxplot(percentile_values, showmeans=True)
# Boxes are drawn at positions 1..N by default; labelling the ticks directly
# avoids the `labels=` keyword, which newer matplotlib releases deprecate.
plt.xticks(range(1, len(percentile_labels) + 1), percentile_labels)
plt.ylabel('US$', weight='bold')
plt.title('Mid-Career salary distribution by percentile band', size=15, weight='bold')

Having good grades truly makes a difference, as the histograms and the boxplots show. Personally I prefer the boxplot: the graph is much clearer, and a few lines contain a lot of information.

Both the median and the average of the top-percentile salary distributions are much higher than those of the 10th and 25th percentiles. There is more than $100k of difference between the 90th percentile class and the 10th one; that's quite a difference.
Getting good grades pays, literally.

So far, we have discovered that salaries tend to increase along the career and that good grades are highly beneficial. Let's now dig into the different majors and find out which one pays the most.

In [None]:
def scatterplot_with_percentile_line(data, column, index, show_percentile=False):
    """Draw a scatterplot of one salary column on the current matplotlib axes.

    Every row of ``data`` becomes one red point: x is the row's value in
    ``column`` and y is the row's index value. The y ticks are labelled with
    the row's value in ``index``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot. Its index is used as the y coordinate.
    column : str
        Column plotted on the x axis; also used as the plot title.
    index : str
        Column whose values label the y ticks.
    show_percentile : bool, default False
        When True, add dashed vertical lines at the 25th, 75th and 90th
        percentiles and at the mean of ``column``, together with a legend.

    Returns
    -------
    None
        The plot is drawn as a side effect on the current axes.
    """
    x = data[column]
    y = data.index

    plt.scatter(x, y, color='red')
    if show_percentile:
        plt.axvline(x.quantile(q=0.25), ls='--', color='yellow', label='25th percentile')
        plt.axvline(x.mean(), ls='--', color='orange', label='average')
        plt.axvline(x.quantile(q=0.75), ls='--', color='red', label='75th percentile')
        plt.axvline(x.quantile(q=0.9), ls='--', color='purple', label='90th percentile')
        # The reference lines are the only labelled artists, so the legend is
        # built only here; calling it with nothing labelled makes matplotlib warn.
        plt.legend()

    plt.xlabel('US$', weight='bold')
    plt.yticks(y, data[index])
    plt.title('{}'.format(column), size=12, weight='bold')

In [None]:
def sort_and_reindex(data, column, ascending=False):
    """Return a sorted copy of ``data`` with a fresh 0..n-1 index.

    Rows are ordered by ``column`` (descending unless ``ascending`` is True).
    The previous index is not dropped: it is kept as a regular column of the
    result. The input frame itself is left untouched.
    """
    ordered = data.sort_values(by=column, ascending=ascending)
    return ordered.reset_index()

In [None]:
# Sorting by starting median salary by major
sorted_start_salary = sort_and_reindex(data_degree, 'Starting Median Salary')
# A bare `.head()` in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is rendered), so show the preview explicitly.
display(sorted_start_salary.head())

plt.figure(figsize=(8,9.5))
scatterplot_with_percentile_line(sorted_start_salary, 'Starting Median Salary', index='Undergraduate Major', show_percentile=True)

In [None]:
# Sorting the data by the median mid-career salary
sorted_mid_career_salary = sort_and_reindex(data_degree, 'Mid-Career Median Salary')
# A bare `.head()` in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is rendered), so show the preview explicitly.
display(sorted_mid_career_salary.head())

plt.figure(figsize=(8,9.5))
scatterplot_with_percentile_line(sorted_mid_career_salary, 'Mid-Career Median Salary', index='Undergraduate Major', show_percentile=True)

In [None]:
# Plot to compare the evolution of the salary from the starting to mid-career one.
# Two panels share the same encoding (orange = starting, red = mid-career);
# they differ only in how the majors are ordered along the y axis.
plt.figure(figsize=(24,12))

plt.subplot(1,2,1)
# Left panel: majors ordered by starting salary
x_mid_1 = sorted_start_salary['Mid-Career Median Salary']
y_1 = sorted_start_salary.index  # index of the sorted frame, used as the y coordinate
x_1 = sorted_start_salary['Starting Median Salary']

plt.scatter(x_1, y_1, color='orange', label='Starting Salary')
plt.scatter(x_mid_1, y_1, color='red', label='Mid-Career Salary')

plt.xlabel('US$', weight='bold')
plt.yticks(y_1, sorted_start_salary['Undergraduate Major'])
plt.title('Majors sorted by starting salary', size=12, weight='bold')
plt.legend()

#######################
plt.subplot(1,2,2)
# Right panel: majors ordered by mid-career salary
x_mid_2 = sorted_mid_career_salary['Mid-Career Median Salary']
y_2 = sorted_mid_career_salary.index  # index of the sorted frame, used as the y coordinate
x_2 = sorted_mid_career_salary['Starting Median Salary']

plt.scatter(x_2, y_2, color='orange', label='Starting Salary')
plt.scatter(x_mid_2, y_2, color='red', label='Mid-Career Salary')

plt.xlabel('US$', weight='bold')
plt.yticks(y_2, sorted_mid_career_salary['Undergraduate Major'])
plt.title('Majors sorted by mid-career salary', size=12, weight='bold')
plt.legend()

# Fix: `plt.tight_layout` without parentheses only referenced the function and
# never ran it, so the layout was never adjusted.
plt.tight_layout()