In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import json
import plotnine as p9
import os
import plotly.express as px
import plotly.graph_objects as go





In [None]:
# Load the raw majors table from the dataset's CSV file.
csv_path = '../input/bachelor-degree-majors-by-age-sex-and-state/Bachelor_Degree_Majors.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

That's a good sign! Our dataframe does not have any null or missing values.

## Data Wrangling

Key issues with the data:

1. **Dtype inconsistency**: The dtypes of the numerical/quantitative variables are not appropriate and need to be changed.
2. **Unnecessary data values**: The ' 25 and older' value is not needed, since this group is already subdivided into the remaining 'Age Group' values, i.e. '25 to 39', '40 to 64' and '65 and Older'.
3. **Formatting**: The ',' characters within the quantitative variables must be removed.

### Removing data-values which are of no use:

In [None]:
# Drop the aggregate rows so that only the individual Male/Female rows and the
# three non-overlapping age bands remain ('Total' is presumably Male + Female;
# '25 and older' is the sum of the three finer age groups).
df = df[df['Sex'] != 'Total']

# .copy() gives df_cleaned its own data: the cleaning step further down assigns
# into its columns, which would otherwise raise SettingWithCopyWarning because
# a boolean-filtered frame may still be tied to `df`.
df_cleaned = df[df['Age Group'] != '25 and older'].copy()

df_cleaned




In [None]:
# Show the filtered frame (no 'Total' sex rows, no '25 and older' age rows).
df_cleaned

### Correcting Formatting error

In [None]:
def variable_restorer(val):
    """
    Convert a comma-formatted count such as '1,234,567' to an int.

    Takes in a string (all commas are removed before conversion) or a value
    that is already numeric. Numeric input is passed straight to int(), so
    re-running the cleaning cell on columns that were already converted does
    not fail with "'int' object has no attribute 'replace'".

    Returns an int.
    Raises ValueError if the value cannot be interpreted as an integer.
    """
    if isinstance(val, str):
        val = val.replace(',', '')

    return int(val)

# Every column from position 3 onward is treated as a count column
# (assumes the first three columns are the State / Sex / Age Group keys).
numeric_cols = df.iloc[0:, 3:]  # Variables

# df_cleaned was produced by boolean filtering; take an explicit copy before
# assigning into its columns so pandas does not emit SettingWithCopyWarning
# and the write is guaranteed to land on df_cleaned itself.
df_cleaned = df_cleaned.copy()

for col in numeric_cols.columns:
    df_cleaned[col] = df_cleaned[col].apply(variable_restorer)


In [None]:
# Preview the first five rows after the count columns were converted.
df_cleaned.head(n=5)

### GroupBy to get a better look at the data.

In [None]:
# Sum the count columns for every (State, Sex, Age Group) combination.
group_keys = ['State', 'Sex', 'Age Group']
grouped_df = df_cleaned.groupby(by=group_keys).sum()
grouped_df

### The difference between 'Science and Engineering' and 'Science and Engineering Related Fields' doesn't seem too significant for our purposes. They essentially tell us the same thing. Let's merge the values together into a new column : STEM .

In [None]:
# Fold the two science/engineering columns into a single STEM column, drop the
# originals, and turn the (State, Sex, Age Group) index back into columns.
se_cols = ['Science and Engineering', 'Science and Engineering Related Fields']
grouped_df = (
    grouped_df
    .assign(STEM=grouped_df[se_cols[0]] + grouped_df[se_cols[1]])
    .drop(columns=se_cols)
    .reset_index()
)
grouped_df


### What is the proportion of Male to Female Bachelor's degree holders per state ?

In [None]:
# One small bar chart per state (3 per row), bars split by sex.
facet_options = dict(
    facet_col="State",
    facet_col_wrap=3,
    facet_row_spacing=0.01,
    facet_col_spacing=0.04,
    height=5000,
    width=1000,
)
fig = px.bar(
    grouped_df,
    x='Age Group',
    y="Bachelor's Degree Holders",
    color='Sex',
    **facet_options,
)

# Keep only the text after "=" in each facet title.
fig.for_each_annotation(lambda ann: ann.update(text=ann.text.split("=")[-1]))

# Hide the y tick labels on every facet.
fig.update_yaxes(showticklabels=False)


#### It seems there are more women than men with a Bachelor's degree in almost every state. What's less surprising is that most Bachelor's degree holders fall in the '40 to 64' age group.

### How many Bachelor's Degree Holders are there per state ?

##### We can go about visualizing this in two ways:
1. Plotly choropleth heatmap
2. Basic Scatterplot

#### Choropleth heatmap

In [None]:
# Lookup from full state name (plus the District of Columbia) to its
# two-letter abbreviation; the abbreviations are what the choropleth's
# 'USA-states' location mode is given as location IDs.
code = {'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'}

# Target dataframe for the map: Has state name, state ID and total number of Bachelor's Degree holders in each state.
#
# Totals are aggregated by the State column and the ID is looked up by state
# name, so the three columns stay aligned even if a state has a different
# number of (sex, age group) rows or the rows are not in the order of `code`.
graduates_by_state = (
    grouped_df
    .groupby('State', as_index=False)["Bachelor's Degree Holders"]
    .sum()
)
graduates_by_state.insert(1, 'ID', graduates_by_state['State'].map(code))

# Fail loudly instead of plotting a map with silently missing states.
unmapped_states = graduates_by_state.loc[graduates_by_state['ID'].isna(), 'State'].tolist()
if unmapped_states:
    raise ValueError(f"No state code found for: {unmapped_states}")


fig0 = px.choropleth(graduates_by_state, locations = 'ID',
                              color = "Bachelor's Degree Holders",
                              hover_name = 'State',
                              locationmode = 'USA-states',
                              color_continuous_scale = "Aggrnyl",
                              scope = 'usa')

# Overlay the two-letter state code as a text label on each state.
fig0.add_scattergeo(
                locations = graduates_by_state['ID'],
                locationmode = 'USA-states',
                text = graduates_by_state['ID'],
                mode ='text'
                )
fig0.show()






#### Looks like the state of California has a disproportionately high number of individuals with a Bachelor's Degree. This doesn't come as a surprise considering:
1. The state has the largest population of all states.
2. The state has over 200 colleges.

### Let's stop here to try and get a better look at the total number of individuals state-wide , regardless of age, with degrees in each major.

In [None]:
def col_creator(df, col, rows_per_group=3):
    """
    Collapse one count column into per-group totals, split by sex.

    The rows of `df` where 'Sex' is 'Female' (and, separately, 'Male') are
    taken in their existing order and summed in consecutive chunks of
    `rows_per_group` rows. With the default of 3 this adds up the three age
    groups of each state, assuming each state contributes exactly that many
    consecutive rows per sex.

    Parameters
    ----------
    df : DataFrame with a 'Sex' column and the column named by `col`.
    col : name of the count column to aggregate.
    rows_per_group : number of consecutive rows summed into one output row.

    Returns
    -------
    DataFrame with columns 'Females in <col>' and 'Males in <col>' and a
    0..N-1 integer index (one row per chunk).
    """

    def _chunk_totals(sex, label):
        # .values discards the original index so chunks are formed purely by
        # row position within this sex.
        counts = pd.DataFrame({label + ' ' + col: df.loc[df['Sex'] == sex, col].values})
        return counts.groupby(counts.index // rows_per_group).sum()

    frames = [_chunk_totals('Female', 'Females in'), _chunk_totals('Male', 'Males in')]

    return pd.concat(frames, axis=1)
            

# Build a Females/Males column pair for every count column (everything after
# the three key columns) and place the pairs side by side.
frames_2 = [col_creator(grouped_df, elem) for elem in grouped_df.iloc[0:, 3:].columns]
df_last = pd.concat(frames_2, axis=1)

# Replace the positional row numbers with the state names.
df_last.index = list(grouped_df['State'].unique())

# Append a 'Total' row holding each column's sum over all states.
df_last.loc['Total'] = [df_last[col_name].sum() for col_name in df_last.columns]

df_last

### Scatterplot

In [None]:
# One marker per state: x is the state code, y and colour are the number of
# Bachelor's degree holders, and the full state name shows on hover.
holders_per_state = graduates_by_state["Bachelor's Degree Holders"]

fig = go.Figure(data=go.Scatter(
    x=graduates_by_state['ID'],
    y=holders_per_state,
    mode='markers',
    text=graduates_by_state['State'],
    marker=dict(
        size=12,
        # Colour by the plotted value itself. The previous random
        # 500-element colour array had no relation to the data and made the
        # cell non-reproducible.
        color=holders_per_state,
        colorscale='peach',
        showscale=False,
        line=dict(width=2, color='white'),
    ),
))

fig.update_layout(
    title="Bachelor's Degree Holders in the USA",
    xaxis_title='State',
    yaxis_title="Bachelor's Degree Holders",
)
fig.show()

### The state of California has the highest number of Bachelor's Degree holders while the state of Wyoming has the lowest.

### Which gender group has more Bachelor's Degrees ?

##### Simple Violin plots can shed some more light on this question.

In [None]:
# One violin per sex showing the distribution of degree-holder counts across
# all (state, age group) rows, with an embedded box plot and mean line.
Sex = ['Male','Female']

fig0 = go.Figure()

for sex in Sex:
    rows_for_sex = grouped_df['Sex'] == sex
    violin = go.Violin(
        x=grouped_df.loc[rows_for_sex, 'Sex'],
        y=grouped_df.loc[rows_for_sex, "Bachelor's Degree Holders"],
        name=sex,
        box_visible=True,
        meanline_visible=True,
    )
    fig0.add_trace(violin)

fig0.show()
    
    

#### The female gender group has a  higher median, confirming the fact that there are more women who have a Bachelor's Degree than there are men. 

In [None]:
# Male degree-holder totals per state, with the 'Total' row last.
df_last.loc[:, "Males in Bachelor's Degree Holders"]

### Disparity between the proportions of Men and Women across Majors:
##### 1. What proportion of the total population of women have  Bachelor's degrees in a STEM field ? 
##### 2. What proportion of the total population of men have  Bachelor's degrees in Arts and Humanities ?
##### 3. Is there a significant amount of men who have Bachelor's Degrees in Education ?


#### We can collectively answer these questions at once via a ****Population Pyramid****

In [None]:

# Population pyramid: men are drawn on the positive x side, women are mirrored
# onto the negative side by negating their counts. For each major a faint bar
# shows all degree holders of that sex and a solid bar shows the holders in
# that major, labelled with its share of the per-major sum.

Majors = np.array(["Business","Education","Arts, Humanites and Others", "STEM"])

# Read the grand totals by row label rather than by a hard-coded row number,
# so the figure stays correct if the number of states in df_last changes.
totals_row = df_last.loc['Total']

# df_last columns alternate Females/Males: positions 0 and 1 are the overall
# degree-holder counts, positions 2 onward are the per-major counts in the
# same order as `Majors`.
Men_Bachelors_total = np.array([totals_row.iloc[1]] * len(Majors))
Women_Bachelors_total = np.array([-totals_row.iloc[0]] * len(Majors))

Men_totals = totals_row.iloc[3::2].to_numpy()
Women_totals = -totals_row.iloc[2::2].to_numpy()

# Share of each major within the per-major sum, rounded to whole percent.
# Women's values and their sum are both negative, so the ratio is positive.
men_sum = Men_totals.sum()
women_sum = Women_totals.sum()

Percentage_male = pd.Series(Men_totals).apply(lambda x : round((x*100)/ men_sum) )

Percentage_female = pd.Series(Women_totals).apply(lambda x : round((x*100)/ women_sum) )

# Font shared by the percentage labels on both sides of the pyramid.
percent_label_font = dict(
    family=" Droid Sans,Standard TT",
    size=25,
    color="black")


fig1 = go.Figure( layout =  go.Layout(yaxis = go.layout.YAxis(title = 'Majors'),
                                      title = "Proportion of Bachelors Degree holders per Major",
                                      xaxis = go.layout.XAxis(
                                              range=[-50000000,50000000],
                                              tickvals = list(range(-50000000,75000000,25000000)),
                                              # Tick labels are shown as positive counts on both sides.
                                              ticktext = [50000000,25000000,0,25000000,50000000],
                                              title = 'Count'),
                                      barmode= 'overlay',
                                      bargap = 0.1) )

# Men: faint background bar with the overall number of male degree holders.
fig1.add_trace(go.Bar(y=Majors,
                      x= Men_Bachelors_total,
                      text = 'x',
                      name = 'Men',
                      hovertemplate  = 'Total Graduates: %{x:.4sf}',
                      showlegend = False,
                      orientation = 'h',
                      marker = dict(
                          color= 'rgba(46,183,18,0.5)',
                      )))

# Men: solid bar per major, labelled with its percentage.
fig1.add_trace(go.Bar(y=Majors,
                      x= Men_totals,
                      name = 'Men',
                      text = Percentage_male.apply(lambda x: '{:.0f}%'.format(x)),
                      textposition = 'outside',
                      hovertemplate = '%{y} graduates: %{x:.4sf}',
                      textfont = percent_label_font,
                      orientation = 'h',
                      marker = dict(line=dict(color='black', width=0.5),
                                    color='rgba(43,151,21,0.9)',
                      )))

# Women: faint background bar; x is negative, so the positive count is passed
# as text for the hover label.
fig1.add_trace(go.Bar(y=Majors,
                      x= Women_Bachelors_total,
                      name = 'Women',
                      text = -1*Women_Bachelors_total,
                      hovertemplate =
                          'Total Graduates: %{text:.4sf}',
                      showlegend = False,
                      hoverinfo = 'text + y',
                      orientation = 'h',
                      marker = dict(
                          color='rgba(250,255,78,0.4)',
                      )))

# Women: solid bar per major; customdata carries the positive count for hover.
fig1.add_trace(go.Bar(y=Majors,
                      x= Women_totals,
                      customdata = -1*Women_totals,
                      name = 'Women',
                      text = Percentage_female.apply(lambda x: '{:.0f}%'.format(x)),
                      textposition = 'outside',
                      hovertemplate = '%{y} graduates : %{customdata:.4sf}',
                      textfont = percent_label_font,
                      orientation = 'h',
                      marker = dict(opacity = 1,
                                    line = dict(color = 'black', width = 0.5),
                                    color='rgba(235,242,36,0.5)',
                      )))

fig1.update_xaxes(title_font = dict(size = 20))

fig1.update_yaxes(title_font = dict(size = 20))

fig1.show()

# Feedback request: Thanks for reading ! Please don't hesitate to point out anything that might have been done better . I'm always up for some good ol' constructive criticism ! Have a great day!