Exploring 2016 FCC New Coders Survey Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from __future__ import division
import pylab
plt.style.use('fivethirtyeight')
%matplotlib inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [None]:
# Path to the survey CSV, given relative to the notebook's working directory.
data_file = "../input/2016-FCC-New-Coders-Survey-Data.csv"

In [None]:
# Read the whole survey into a single DataFrame; the cells below all query `df`.
df = pd.read_csv(filepath_or_buffer=data_file)

# Age Distribution of learners
1. Skew: positively skewed (long right tail) - most learners are in their early twenties to early thirties.
2. Unimodal
3. Outliers: There are some senior citizens who are taking up online courses

In [None]:
# Histogram of the Age column, split into 75 bins.
hist = df['Age'].plot.hist(bins=75)

In [None]:
# Box plot of the same Age column (the result is stored under the name `hist`).
hist = df['Age'].plot.box()

## Distribution of Males and Females

In [None]:
# Side-by-side box plots of age: females on the left panel, males on the right.
# sharey=True puts both panels on one age scale so the boxes can be compared
# directly; with independent axes each panel was scaled to its own data.
fig, ax = plt.subplots(ncols=2, sharey=True)
box = df.Age[df.Gender == 'male'].plot.box(ax=ax[1])
box = df.Age[df.Gender == 'female'].plot.box(ax=ax[0])

# Use panel titles rather than legends: a labels-only legend() call attaches
# the text to whichever line of the box plot happens to have been drawn first.
ax[1].set_title("Distribution of Male Ages")
ax[0].set_title("Distribution of Female Ages")

# Comparing Distribution of Males and Females

- The distribution is slightly shifted to the right for females
- There are far fewer females than males in each age group

In [None]:
fig, ax = plt.subplots()

# Pick out the ages for each gender, then overlay one density curve per group
# on the shared axes.
is_male = df['Gender'] == 'male'
is_female = df['Gender'] == 'female'
df.loc[is_male, 'Age'].plot.kde(ax=ax)
plot = df.loc[is_female, 'Age'].plot.kde(ax=ax)

# Labels are matched to the curves in drawing order: male first, then female.
legend = ax.legend(['Male', 'Female'])

In [None]:
fig, ax = plt.subplots()

# Use one set of bin edges for both genders. Passing bins=75 to each call lets
# every subset derive its own edges from its own age range, so the two sets of
# bars would have different widths and could not be compared bar for bar.
age_bins = np.linspace(df.Age.min(), df.Age.max(), 76)  # 76 edges -> 75 bins

df[df.Gender == 'male'].Age.plot.hist(bins=age_bins, ax=ax, alpha=0.5)
df[df.Gender == 'female'].Age.plot.hist(bins=age_bins, ax=ax, alpha=0.8)

# Labels follow drawing order: male first, then female.
legend = ax.legend(['Male', 'Female'])

## Representation of Different Genders

- Almost five times as many males as females
- Very small representation of other Genders

In [None]:
# Number of respondents per gender, restricted to rows that report an age.
has_age = df['Age'].notnull()
plot = df[has_age].groupby('Gender')['Age'].size().plot.bar()


# Learning Resources 

## Most Popular Learning Resources

In [None]:
# Survey columns that record which learning resources a respondent used.
resource_columns = [
    'ResourceBlogs', 'ResourceBooks', 'ResourceCodeWars', 'ResourceCodecademy',
    'ResourceCoursera', 'ResourceDevTips', 'ResourceEdX', 'ResourceEggHead',
    'ResourceFCC', 'ResourceGoogle', 'ResourceHackerRank', 'ResourceKhanAcademy',
    'ResourceLynda', 'ResourceMDN', 'ResourceOdinProj', 'ResourceOther',
    'ResourcePluralSight', 'ResourceReddit', 'ResourceSkillCrush',
    'ResourceSoloLearn', 'ResourceStackOverflow', 'ResourceTreehouse',
    'ResourceUdacity', 'ResourceUdemy', 'ResourceW3Schools', 'ResourceYouTube',
]

# count() tallies the non-null entries in each column.
values = df.loc[:, resource_columns].count()

# Draw the bars from the highest count to the lowest.
ranked = values.sort_values(ascending=False)
bar = ranked.plot.bar()

## Most Popular Learning Resources by High-Speed Internet Availability

- Google and DevTips are the most popular among those who do not have high-speed internet
- Surprisingly, YouTube is in third place for those who do not have high-speed internet

In [None]:
# Survey columns that record which learning resources a respondent used.
resource_cols = [
    'ResourceBlogs', 'ResourceBooks', 'ResourceCodeWars', 'ResourceCodecademy',
    'ResourceCoursera', 'ResourceDevTips', 'ResourceEdX', 'ResourceEggHead',
    'ResourceFCC', 'ResourceGoogle', 'ResourceHackerRank', 'ResourceKhanAcademy',
    'ResourceLynda', 'ResourceMDN', 'ResourceOdinProj', 'ResourceOther',
    'ResourcePluralSight', 'ResourceReddit', 'ResourceSkillCrush',
    'ResourceSoloLearn', 'ResourceStackOverflow', 'ResourceTreehouse',
    'ResourceUdacity', 'ResourceUdemy', 'ResourceW3Schools', 'ResourceYouTube',
]

# Zero-fill the resource columns only. Filling the whole frame also replaced a
# missing HasHighSpdInternet answer with 0, which merged respondents who never
# answered the question into the 0 group (presumably "no high-speed internet").
# Grouping on the untouched column leaves those rows out instead.
resource_flags = df[resource_cols].fillna(0)
totals = resource_flags.groupby(df['HasHighSpdInternet']).sum()

# For each resource, the share of its total that comes from each internet group
# (every column is divided by its own sum).
shares = totals / totals.sum()

# Resources as rows, internet groups as columns, ordered by the first group's
# share so the stacked bars rise from left to right.
values = shares.T
values = values.sort_values(by=values.columns[0], axis=0)

plot = values.plot.bar(stacked=True)

## Most Popular Learning Resources by Gender

- SkillCrush is the most popular with females
- EggHead is the most popular with males

In [None]:
# Survey columns that record which learning resources a respondent used.
resource_cols = [
    'ResourceBlogs', 'ResourceBooks', 'ResourceCodeWars', 'ResourceCodecademy',
    'ResourceCoursera', 'ResourceDevTips', 'ResourceEdX', 'ResourceEggHead',
    'ResourceFCC', 'ResourceGoogle', 'ResourceHackerRank', 'ResourceKhanAcademy',
    'ResourceLynda', 'ResourceMDN', 'ResourceOdinProj', 'ResourceOther',
    'ResourcePluralSight', 'ResourceReddit', 'ResourceSkillCrush',
    'ResourceSoloLearn', 'ResourceStackOverflow', 'ResourceTreehouse',
    'ResourceUdacity', 'ResourceUdemy', 'ResourceW3Schools', 'ResourceYouTube',
]

# Zero-fill the resource columns only. Filling the whole frame also turned a
# missing Gender into 0, which added a spurious "0" gender group to the chart.
# Grouping on the untouched column leaves those rows out instead.
resource_flags = df[resource_cols].fillna(0)
totals = resource_flags.groupby(df['Gender']).sum()

# For each resource, the share of its total that comes from each gender
# (every column is divided by its own sum).
shares = totals / totals.sum()

# Resources as rows, genders as columns.
values = shares.T

# Sort by the male share, selected by name. The previous positional lookup
# (values.columns[4]) depended on how many gender groups exist.
# NOTE(review): index 4 presumably resolved to 'male' - confirm.
values = values.sort_values(by='male', axis=0)

# Assigned to `plot`, as in the other plotting cells.
plot = values.plot.bar(stacked=True)