In [1]:
import pandas as pd
# Load the salaries-by-major CSV; the path is relative to the working directory.
df = pd.read_csv('salaries_by_college_major.csv')

In [2]:
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


In [3]:
# (rows, columns) of the raw frame.
df.shape

(51, 6)

In [4]:
# List the column labels.
df.columns

Index(['Undergraduate Major', 'Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
       'Mid-Career 90th Percentile Salary', 'Group'],
      dtype='object')

In [5]:
# Element-wise missing-value mask (True where a cell is NaN).
# NOTE(review): a full boolean frame is hard to scan for the few missing
# cells; `df.isna().sum()` would summarise per column — consider switching.
df.isna()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [6]:
# Keep only the rows with no missing values, then inspect the end of the result.
clean_df = df.dropna(axis=0, how='any')
clean_df.tail(5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
45,Political Science,40800.0,78200.0,41200.0,168000.0,HASS
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS


In [7]:

# Row label of the major with the highest starting median salary.
# Only the final expression of a cell is displayed, so this is the single
# value the cell reports; use clean_df.loc[<label>] to see the full record.
clean_df['Starting Median Salary'].idxmax()


43

In [8]:
# Major with the highest mid-career median salary.
# The row label comes from idxmax() rather than a hard-coded number, so the
# lookup stays correct if the data or its row order changes.
df['Undergraduate Major'].loc[df['Mid-Career Median Salary'].idxmax()]

'Chemical Engineering'

In [9]:
# Lowest starting salary: print the amount, then show the major it belongs to.
print(df['Starting Median Salary'].min())
df.loc[df['Starting Median Salary'].idxmin(), 'Undergraduate Major']

34000.0


'Spanish'

In [10]:
# Full record of the major with the lowest mid-career median salary.
df.loc[df['Mid-Career Median Salary'].idxmin(), :]

Undergraduate Major                  Education
Starting Median Salary                 34900.0
Mid-Career Median Salary               52000.0
Mid-Career 10th Percentile Salary      29300.0
Mid-Career 90th Percentile Salary     102000.0
Group                                     HASS
Name: 18, dtype: object

In [11]:
# Spread = gap between the 90th and 10th percentile mid-career salaries.
spread_col = df['Mid-Career 90th Percentile Salary'] - df['Mid-Career 10th Percentile Salary']

# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell safe to re-run.
if 'Spread' not in df.columns:
    df.insert(1, 'Spread', spread_col)

# Majors with the smallest spread come first.
low_risk = df.sort_values('Spread')
low_risk[['Undergraduate Major', 'Spread']].head()


Unnamed: 0,Undergraduate Major,Spread
40,Nursing,50700.0
43,Physician Assistant,57600.0
41,Nutrition,65300.0
49,Spanish,65400.0
27,Health Care Administration,66400.0


In [12]:
# Majors with the highest potential:
# rank by the 90th percentile mid-career salary, largest first.
highest_potential = df.sort_values(by='Mid-Career 90th Percentile Salary', ascending=False)
print("This is the highest Potential")
highest_potential.loc[:, ['Undergraduate Major', 'Mid-Career 90th Percentile Salary']].head(5)



This is the highest Potential


Unnamed: 0,Undergraduate Major,Mid-Career 90th Percentile Salary
17,Economics,210000.0
22,Finance,195000.0
8,Chemical Engineering,194000.0
37,Math,183000.0
44,Physics,178000.0


In [13]:
# Majors with the greatest spread in salaries:
# rank by the Spread column, widest gap first.
highest_spread = df.sort_values(by='Spread', ascending=False)
print("This is the Greatest Spread in Salaries")
highest_spread.loc[:, ['Undergraduate Major', 'Spread']].head(5)

This is the Greatest Spread in Salaries


Unnamed: 0,Undergraduate Major,Spread
17,Economics,159400.0
22,Finance,147800.0
37,Math,137800.0
36,Marketing,132900.0
42,Philosophy,132500.0


In [14]:
# Majors with the Highest Mid-Career Median Salary
# Stored under its own name so the spread ranking in `highest_spread` is not
# overwritten by a ranking on a different column.

highest_median = df.sort_values('Mid-Career Median Salary', ascending=False)
print("This is the Highest Mid-Career Median Salary")
highest_median[['Undergraduate Major', 'Mid-Career Median Salary']].head()

This is the Greatest Spread in Salaries


Unnamed: 0,Undergraduate Major,Mid-Career Median Salary
8,Chemical Engineering,107000.0
12,Computer Engineering,105000.0
19,Electrical Engineering,103000.0
1,Aerospace Engineering,101000.0
17,Economics,98600.0


In [24]:
# Count the non-null entries in every column for each major group.
df.groupby(by='Group').count()


Unnamed: 0_level_0,Undergraduate Major,Spread,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business,12,12,12,12,12,12
HASS,22,22,22,22,22,22
STEM,16,16,16,16,16,16


In [30]:
# Per-group mean of every numeric column, using only rows without missing values.
clean_df2 = df.dropna(axis=0, how='any')
clean_df2.groupby(by='Group').mean(numeric_only=True)


Unnamed: 0_level_0,Spread,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Business,103958.333333,44633.333333,75083.333333,43566.666667,147525.0
HASS,95218.181818,37186.363636,62968.181818,34145.454545,129363.636364
STEM,101600.0,53862.5,90812.5,56025.0,157625.0


In [20]:
# Keep only the columns whose dtype is not numeric, then print them as plain text.
non_numeric_cols = clean_df2.select_dtypes(exclude=['number'])
print(non_numeric_cols)

                     Undergraduate Major     Group
0                             Accounting  Business
1                  Aerospace Engineering      STEM
2                            Agriculture  Business
3                           Anthropology      HASS
4                           Architecture  Business
5                            Art History      HASS
6                                Biology      STEM
7                    Business Management  Business
8                   Chemical Engineering      STEM
9                              Chemistry      STEM
10                     Civil Engineering      STEM
11                        Communications      HASS
12                  Computer Engineering      STEM
13                      Computer Science      STEM
14                          Construction  Business
15                      Criminal Justice      HASS
16                                 Drama      HASS
17                             Economics  Business
18                             