In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading Data

In [None]:
# Load the majors table from the Kaggle input directory and preview its first ten rows.
train_df = pd.read_csv(
    "/kaggle/input/bachelor-degree-majors-by-age-sex-and-state/Bachelor_Degree_Majors.csv"
)
train_df.head(n=10)

### Summary of Data

In [None]:
# Row count first, then a per-column overview: dtype (column 0), number of
# nulls, number of distinct values, and the values of the rows labelled 0, 1, 2.
print(len(train_df))
summary = pd.DataFrame(train_df.dtypes).assign(
    null=train_df.isnull().sum(),
    unique=train_df.nunique(),
    first=train_df.loc[0],
    second=train_df.loc[1],
    third=train_df.loc[2],
)
summary

From here, we can make the following observations:
1. There are a total of 612 rows, and there are no null values in the data.
2. Every feature has the datatype `object`, but several features actually hold numbers, which we need to convert to numeric values.
3. Every value in Bachelor's Degree Holders is unique, which makes sense.
4. Every value in Science and Engineering is unique as well.
5. The data is split into age groups, and there are 4 age groups in total.


In [None]:
# Distinct values present in the Sex feature
train_df["Sex"].unique()

### Convert the object values to integer values

In [None]:
def convert_str_to_num(Series):
    """Convert one comma-grouped number string (e.g. "1,234,567") to an int.

    Despite the parameter name, this receives a single cell value (it is used
    with ``Series.apply``), not a whole pandas Series.

    Parameters
    ----------
    Series : str or integer
        A string of digits that may contain "," thousands separators, or a
        value that is already an integer (Python ``int`` or NumPy integer).

    Returns
    -------
    int
        The parsed integer.

    Raises
    ------
    ValueError
        If the string does not spell an integer once the commas are removed.
    AttributeError
        If the value is neither a string nor an integer.
    """
    import numbers

    # Values that are already integers pass straight through, so applying the
    # conversion to an already-converted column (e.g. re-running the cell)
    # no longer fails on the missing ``.split`` attribute.
    if isinstance(Series, numbers.Integral):
        return int(Series)
    return int(Series.replace(",", ""))

In [None]:
# Every column from the fourth one onwards is passed, value by value, through
# convert_str_to_num and written back over the original column.
cols = list(train_df.columns)
for col in cols[3:]:
    train_df[col] = train_df[col].map(convert_str_to_num)

## Univariate Analysis

In [None]:
# One frame per Sex value, each restricted to the "25 and older" age group.
in_25_and_older = train_df["Age Group"] == "25 and older"
total_df, Male_df, Female_df = (
    train_df[(train_df["Sex"] == sex_label) & in_25_and_older]
    for sex_label in ("Total", "Male", "Female")
)

Removing the rows that have "Total" as the value of the Sex feature, because we already have the Male and Female rows.

In [None]:
# One histogram per column (all columns except the first), laid out row by
# row on a 4x2 grid of axes.
fg, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 10))
fg.tight_layout()
for idx, col in enumerate(cols[1:]):
    row, column = divmod(idx, 2)
    sns.histplot(data=train_df, x=col, color="Green", ax=ax[row, column])

From these plots we can see that Sex and Age Group are uniformly distributed in the data, while the other features are not: the other features are right-skewed.

In [None]:
def write_percentage(train_df, ax):
    """Label every bar of ``ax`` with its height followed by a "%" sign.

    For each patch in ``ax.patches`` the text is placed at the patch's x
    position, 100 data units above its top, with one decimal place.

    NOTE(review): the label is the raw bar height with a "%" suffix, and
    ``train_df`` is not used - confirm whether a share of the total was meant.
    """
    for bar in ax.patches:
        bar_top = bar.get_height()
        label = '{:1.1f}%'.format(bar_top)
        ax.text(x=bar.get_x(), y=bar_top + 100, s=label, fontsize=10)

In [None]:
# Display the column labels of train_df.
train_df.columns

### State-wise distribution

In [None]:
# One bar chart per column from the fourth one onwards, stacked vertically,
# with State on the x axis and total_df as the data.
fg, ax = plt.subplots(nrows=6, ncols=1, figsize=(20, 30))
for idx, col in enumerate(cols[3:]):
    panel = ax[idx]
    sns.barplot(data=total_df, x="State", y=col, palette="Reds", ax=panel)
    # Make this panel the current axes so plt.xticks rotates its labels.
    plt.sca(panel)
    plt.xticks(rotation=90)
    fg.tight_layout(pad=3.0)
plt.show()

It is clear that California has the highest total number of degree holders. A few other states also do well in particular fields, such as Education: there, Texas has the highest number of degree holders, closely followed by Florida, New York and California. All in all, California performs really well across the board, with the highest number of degree holders in most fields.

It could also simply be that California and Texas are large states with bigger populations than the other states, so it is no surprise that they have more degree holders.

One feature that would have been very useful is the number of degree holders relative to the population of each state.

### Degree holders 

In [None]:
# One bar chart per column from the fourth one onwards on a 2x3 grid, with Sex
# on the x axis, using only the rows whose Sex is not "Total".
fg, ax = plt.subplots(nrows=2, ncols=3, figsize=(30, 15))
people_df = train_df[train_df["Sex"] != "Total"]
for idx, col in enumerate(cols[3:]):
    row, column = divmod(idx, 3)
    sns.barplot(data=people_df, x="Sex", y=col, palette="Greens", ax=ax[row, column])

From here we can see that females have higher numbers in most of the categories, except for **Science and Engineering** and **Business**.

# Closer Look

In [None]:
# One bar chart per column from the fourth one onwards on a 2x3 grid, with
# Age Group on the x axis and the bars split by Sex.
fg, ax = plt.subplots(2, 3, figsize=(30, 15))
# Keep the rows whose Sex is not "Total" and whose Age Group is not "25 and older".
data = train_df[(train_df["Sex"] != "Total") & (train_df["Age Group"] != "25 and older")]
for idx, col in enumerate(cols[3:]):
    # Plot the column of the current iteration. Previously y was hard-coded to
    # "Bachelor's Degree Holders", so all six panels showed the same chart.
    sns.barplot(y=col, x="Age Group", hue="Sex", data=data, palette="Reds", ax=ax[idx // 3, idx % 3])

From here we can see that people in the 40 to 64 age group have the highest count, for both males and females, while people in the 65 and older age group have the lowest count.

### Looking forward to your comments. I would be more than happy to get feedback!
### Have a great day! :)