In [None]:
# Importing libraries and the dataset

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import colorsys
# Apply the seaborn "talk" style sheet. Matplotlib 3.6 renamed the bundled
# seaborn styles to "seaborn-v0_8-*" and later releases dropped the old names,
# so use whichever spelling this installation ships, falling back to the
# default style if neither is available.
talk_style = next(
    (name for name in ("seaborn-v0_8-talk", "seaborn-talk") if name in plt.style.available),
    "default",
)
plt.style.use(talk_style)

# Load the survey CSV into the DataFrame that the cells below read from.
df = pd.read_csv("../input/2016-FCC-New-Coders-Survey-Data.csv", sep=',')

**Distribution of Age**

In [None]:
# Histogram of the respondents' ages, split into 75 bins.
age_column = df["Age"]
age_column.hist(bins=75)
plt.title("Distribution of Age")
plt.xlabel("Age")
plt.show()

Most learners are between 20 and 30 years old, with 25 being the most frequent age for new coders.

**Distribution of Gender**

In [None]:
# Pie chart of respondents per gender (value_counts skips missing answers).
gender_counts = df.Gender.value_counts()
labels = gender_counts.index

# One evenly spaced hue per slice. The palette is sized from the Gender counts
# themselves so the hues are spread over exactly the slices being drawn.
N = len(gender_counts)
HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

# Draw on an explicit Axes: a bare plt.axes() call adds a *new* empty Axes on
# current Matplotlib, which would be drawn over the pie and receive the legend.
fig, ax = plt.subplots()
patches, texts = ax.pie(gender_counts, colors=RGB_tuples, startangle=90)
ax.set_aspect('equal', 'datalim')
# Anchor the legend outside the plot area so it does not cover the pie.
ax.legend(patches, labels, bbox_to_anchor=(1.05,1))
ax.set_title("Gender")
plt.show()

New coders are mostly men, who greatly outnumber women.

**Distribution of Job role interest**

In [None]:
# Pie chart of respondents per job role of interest (value_counts skips
# missing answers).
role_counts = df.JobRoleInterest.value_counts()
labels = role_counts.index

# One evenly spaced hue per job role. The unused hard-coded colour-name list
# that used to live here was never passed to the plot and has been dropped.
N = len(role_counts)
HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

# Draw on an explicit Axes: a bare plt.axes() call adds a *new* empty Axes on
# current Matplotlib, which would be drawn over the pie and receive the legend.
fig, ax = plt.subplots()
patches, texts = ax.pie(role_counts, colors=RGB_tuples, startangle=90)
ax.set_aspect('equal', 'datalim')
# Anchor the legend outside the plot area so it does not cover the pie.
ax.legend(patches, labels, bbox_to_anchor=(1.25, 1))
ax.set_title("Job Role Interest")
plt.show()

The interest of new coders seems to lie in Web Development (both front and back-end), followed by Data Science.

**Distribution of Employment field**

In [None]:
# Pie chart of respondents per employment field (value_counts skips missing
# answers).
field_counts = df.EmploymentField.value_counts()
labels = field_counts.index

# One evenly spaced hue per employment field.
N = len(field_counts)
HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

# Draw on an explicit Axes: a bare plt.axes() call adds a *new* empty Axes on
# current Matplotlib, which would be drawn over the pie and receive the legend.
fig, ax = plt.subplots()
patches, texts = ax.pie(field_counts, colors=RGB_tuples, startangle=90)
ax.set_aspect('equal', 'datalim')
# Anchor the legend outside the plot area so it does not cover the pie.
ax.legend(patches, labels, bbox_to_anchor=(1.3, 1))
ax.set_title("Employment Field")
plt.show()

New coders mostly belong to the Software Development professional field. This makes sense, since it is a field of constant change and developers need to update and broaden their knowledge and improve their skills in order to be aligned with the rapid changes that take place in the market.

**Job preference per age**

In [None]:
# Work on a copy so the added AgeRanges column does not leak into `df`.
df_ageranges = df.copy()

# Bucket ages into ranges. pd.cut bins are right-inclusive by default, so an
# age equal to a boundary (e.g. 30) falls into the lower range ("20-30").
bins=[0, 20, 30, 40, 50, 60, 100]
# The last bin covers ages above 60, so it is labelled "> 60".
age_labels = ["< 20", "20-30", "30-40", "40-50", "50-60", "> 60"]
df_ageranges['AgeRanges'] = pd.cut(df_ageranges['Age'], bins, labels=age_labels)

# Row-normalised crosstab: share of each job preference within an age range.
job_pref_counts = pd.crosstab(df_ageranges.AgeRanges, df_ageranges.JobPref)
df2 = job_pref_counts.div(job_pref_counts.sum(axis=1), axis=0)

# The colours are applied per stacked series (one per JobPref column), so the
# palette is sized by the number of columns, not the number of age ranges.
N = len(df2.columns)
HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

ax1 = df2.plot(kind="bar", stacked=True, color=RGB_tuples, title="Job preference per Age")
lines, labels = ax1.get_legend_handles_labels()
# Anchor the legend outside the plot area so it does not cover the bars.
ax1.legend(lines, labels, bbox_to_anchor=(1.51, 1))

Interest in becoming a freelancer increases with age, and freelancing is the main preference for people older than 60.

People younger than 30 would like to work for a startup or start their own business, while this preference decreases significantly with age.

Working for a medium-sized company is the job preference for people between their 20s and their 50s.

**Employment field and Under-employed**

In [None]:
# Share of answers per IsUnderEmployed value within each employment field,
# ordered by the share of the 1.0 column.
underemployed_counts = pd.crosstab(
    df_ageranges.EmploymentField,
    df_ageranges.IsUnderEmployed,
)
field_totals = underemployed_counts.sum(axis=1)
df4 = underemployed_counts.div(field_totals, axis=0).sort_values(by=1.0)

ax1 = df4.plot(
    kind="bar",
    stacked=True,
    title="Under-employed per Employment Field",
)
lines, labels = ax1.get_legend_handles_labels()
# The legend entries are relabelled by position with fixed "No"/"Yes" text
# and anchored outside the plot area.
ax1.legend(lines, ["No", "Yes"], bbox_to_anchor=(1.51, 1))

The field where people feel least under-employed is software development, followed by software development and IT. In contrast, food and beverage is the field where employees feel most under-employed.

**Relocation per Age**

In [None]:
# Row-normalised crosstab: share of each JobRelocateYesNo answer within an
# age range.
relocate_counts = pd.crosstab(df_ageranges.AgeRanges, df_ageranges.JobRelocateYesNo)
df3 = relocate_counts.div(relocate_counts.sum(axis=1), axis=0)

# The colours are applied per stacked series (one per answer column), so the
# palette is sized by the number of columns, not the number of age ranges.
N = len(df3.columns)
HSV_tuples = [(x*1.0/N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

ax1 = df3.plot(kind="bar", stacked=True, color=RGB_tuples, title="Relocation per Age")
lines, labels = ax1.get_legend_handles_labels()
# The legend entries are relabelled by position with fixed "No"/"Yes" text.
ax1.legend(lines, ["No", "Yes"], loc='best')

The willingness to relocate for a job decreases with age. Almost 80% of people under 30 are prepared to do so.

**Bootcamp preference**

In [None]:
# Number of respondents per bootcamp, keeping only bootcamps named more than
# 10 times and dropping the survey's "not a bootcamp" placeholder answer.
df_aux = df.copy()
values = df_aux.BootcampName.value_counts(dropna=True)
values = values[values > 10]
values = values[values.index != 'Free Code Camp is not a bootcamp - please scroll up and change answer to "no"']

index = np.arange(len(values))
# Bars are centred on their x position, so the tick labels go at the same
# positions and the x-limits leave half a slot on each side. (The previous
# +0.5 tick offset assumed edge-aligned bars and shifted every label.)
plt.bar(index, values, align="center")
plt.xticks(index, values.index, rotation="vertical")
plt.xlim(-0.5, len(values) - 0.5)
plt.title("Bootcamp preference")
plt.show()

The most popular bootcamp for new coders is General Assembly, in which 90 students from the dataset have enrolled.

**Which bootcamps are more likely to be recommended?**

In [None]:
# Only respondents who finished their bootcamp are considered.
df_aux2 = df[df.BootcampFinish == 1.0]

# Keep bootcamps with more than 10 finishers and drop the survey's
# "not a bootcamp" placeholder answer.
values = df_aux2.BootcampName.value_counts(dropna=True)
values = values[values > 10]
values = values[values.index != 'Free Code Camp is not a bootcamp - please scroll up and change answer to "no"']
df_aux2 = df_aux2[df_aux2["BootcampName"].isin(values.index)]

# Row-normalised crosstab: share of each BootcampRecommend answer per
# bootcamp, ordered by the share of the 1.0 column (highest first).
recommend_counts = pd.crosstab(df_aux2.BootcampName, df_aux2.BootcampRecommend)
df10 = recommend_counts.div(recommend_counts.sum(axis=1), axis=0)
df10 = df10.sort_values(by=1.0, ascending=False)

# NOTE: a stray `df10.ix[values.index, :]` statement was removed here. `.ix`
# no longer exists in pandas >= 1.0 and the result was discarded anyway.
df10.plot(kind="bar", stacked=True, title="Recommended bootcamps")

Dev Academy is the bootcamp that students are most likely to recommend, with 100% of satisfied participants. In this figure, only students who have finished the bootcamp have been taken into account. Moreover, the study has been restricted to bootcamps with more than 10 students who finished them.

**Money for learning vs Expected earning**

In [None]:
# Keep rows that have an expected earning and a money-for-learning value in
# [0, 60000). The mask is built from the already filtered frame so its index
# matches the frame it selects from (masking with `df[...]` made pandas
# reindex the mask and emit a warning). NaN values fail both comparisons and
# are dropped as well.
df5 = df.dropna(subset=["ExpectedEarning"])
money = df5["MoneyForLearning"]
df5 = df5[(money >= 0) & (money < 60000)]

x = df5.MoneyForLearning
y = df5.ExpectedEarning

# Least-squares straight line (degree-1 polynomial) through the points.
m, b = np.polyfit(x, y, 1)
plt.plot(x, y, '.', alpha=0.5)
plt.plot(x, m*x + b, '-', color="red")
plt.xlabel("Money for learning")
plt.ylabel("Expected earning")
plt.title("Money for learning vs Expected earning")
plt.show()

People who spend more money on learning generally expect a higher salary in the future, although a large number of students expect a very high income despite having spent very little money on learning (close to $0).


**Income vs Money for learning**

In [None]:
# Keep rows that have an income and a money-for-learning value in [0, 60000).
# The mask is built from the already filtered frame so its index matches the
# frame it selects from (masking with `df[...]` made pandas reindex the mask
# and emit a warning). NaN values fail both comparisons and are dropped too.
df6 = df.dropna(subset=["Income"])
money = df6["MoneyForLearning"]
df6 = df6[(money >= 0) & (money < 60000)]

x = df6.Income
y = df6.MoneyForLearning

# Least-squares straight line (degree-1 polynomial) through the points.
m, b = np.polyfit(x, y, 1)
plt.plot(x, y, '.', alpha=0.5)
plt.plot(x, m*x + b, '-', color="red")
plt.title("Income vs Money for learning")
plt.xlabel("Income")
plt.ylabel("Money for learning")
plt.show()

The money that students spend on learning does not depend on their income.

**Hours learning vs Expected earning**

In [None]:
# Rows missing either plotted column are discarded before fitting.
df7 = df.dropna(subset=["HoursLearning", "ExpectedEarning"])

x = df7["HoursLearning"]
y = df7["ExpectedEarning"]

# Degree-1 least-squares fit; the fitted line is drawn in red over the points.
m, b = np.polyfit(x, y, 1)
fitted = m*x + b

plt.plot(x, y, '.', alpha=0.5)
plt.plot(x, fitted, '-', color="red")
plt.title("Hours learning vs Expected earning")
plt.xlabel("Hours learning")
plt.ylabel("Expected earning")
plt.show()

The number of hours spent learning is not correlated with the expected earning. Thus, people who expect to have a higher income don't spend more hours learning in order to reach that goal.

**Months programming vs Hours learning**

In [None]:
# Keep rows that have an hours-learning value and a months-programming value
# in [0, 500). The mask is built from the already filtered frame so its index
# matches the frame it selects from (masking with `df[...]` made pandas
# reindex the mask and emit a warning). NaN values fail both comparisons and
# are dropped as well.
df8 = df.dropna(subset=["HoursLearning"])
months = df8["MonthsProgramming"]
df8 = df8[(months >= 0) & (months < 500)]

x = df8.MonthsProgramming
y = df8.HoursLearning

# Least-squares straight line (degree-1 polynomial) through the points.
m, b = np.polyfit(x, y, 1)
plt.plot(x, y, '.', alpha=0.5)
plt.plot(x, m*x + b, '-', color="red")
plt.xlabel("Months Programming")
plt.ylabel("Hours learning")
plt.title("Months programming vs Hours learning")
plt.show()

Developers who have been coding for a long time tend to spend fewer hours learning, probably because it is easier to master a programming language when you already have some programming background.

**Age vs Hours learning**

In [None]:
# Keep rows that have an hours-learning value and an age in [0, 70). The mask
# is built from the already filtered frame so its index matches the frame it
# selects from (masking with `df[...]` made pandas reindex the mask and emit
# a warning). NaN values fail both comparisons and are dropped as well.
df9 = df.dropna(subset=["HoursLearning"])
ages = df9["Age"]
df9 = df9[(ages >= 0) & (ages < 70)]

x = df9.Age
y = df9.HoursLearning

# Least-squares straight line (degree-1 polynomial) through the points.
m, b = np.polyfit(x, y, 1)
plt.plot(x, y, '.', alpha=0.5)
plt.plot(x, m*x + b, '-', color="red")
plt.xlabel("Age")
plt.ylabel("Hours learning")
plt.title("Age vs Hours learning")
plt.show()

The age of the students does not have an effect on the number of hours they spend learning.