In [None]:
# Import some libraries

import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'whitegrid')
import textwrap

In [None]:
# Load the four UTBK 2019 CSV files into pandas DataFrames.
# The dataset directory is kept in a single constant so the input location
# only has to be changed in one place.
DATA_DIR = '../input/indonesia-college-entrance-examination-utbk-2019'

data_score_humanities = pd.read_csv(f'{DATA_DIR}/score_humanities.csv')
data_score_science = pd.read_csv(f'{DATA_DIR}/score_science.csv')
data_universities = pd.read_csv(f'{DATA_DIR}/universities.csv')
data_majors = pd.read_csv(f'{DATA_DIR}/majors.csv')

In [None]:
# Work on a copy so the frame loaded from disk stays untouched
score_humanities = data_score_humanities.copy(deep=True)
score_humanities.head(n=5)

In [None]:
# Work on a copy so the frame loaded from disk stays untouched
score_science = data_score_science.copy(deep=True)
score_science.head(n=5)

In [None]:
# Preview the first rows of the majors table
data_majors.head(n=5)

In [None]:
# Preview the first rows of the universities table
data_universities.head(n=5)

In [None]:
# Print a structural summary (columns, dtypes, non-null counts) of every frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
frames = (score_humanities, score_science, data_majors, data_universities)
for position, frame in enumerate(frames):
    if position:
        print('\n')  # blank separator between consecutive summaries
    frame.info()

# Exploratory Data Analysis

In [None]:
# Preview the majors table before merging it with the universities
data_majors.head(n=5)

In [None]:
# Preview the universities table before merging it with the majors
data_universities.head(n=5)

In [None]:
# Attach the university name to every major; both frames share the key 'id_university'
university_names = data_universities[['id_university', 'university_name']]
data_major_univ = data_majors.merge(university_names, on='id_university')
data_major_univ = data_major_univ.drop(columns=['Unnamed: 0'])

# Build a combined "<major> - <university>" label
data_major_univ['major_univ_name'] = (
    data_major_univ['major_name'] + ' - ' + data_major_univ['university_name']
)

In [None]:
# Display the merged majors/universities frame (the bare expression is rendered as the cell output)
data_major_univ

In [None]:
# Calculate each participant's average over the subject scores.
# The score columns are selected by name rather than by position, so the result
# does not depend on the column order of the CSV and stays correct if this cell
# is re-run after columns have been added or dropped.
humanities_score_cols = ['score_eko', 'score_geo', 'score_kmb', 'score_kpu', 'score_kua',
                         'score_mat', 'score_ppu', 'score_sej', 'score_sos']
science_score_cols = ['score_bio', 'score_fis', 'score_kim', 'score_kmb', 'score_kpu',
                      'score_kua', 'score_mat', 'score_ppu']

score_humanities['avg_score'] = score_humanities[humanities_score_cols].mean(axis=1)
score_science['avg_score'] = score_science[science_score_cols].mean(axis=1)

In [None]:
# Check that the avg_score column was added
score_humanities.head(n=5)

In [None]:
# Label both choices of every humanities participant with major / university names.
# NOTE: merge() defaults to an inner join, so rows whose (major, university) ids
# have no match in data_major_univ are dropped.
major_lookup = data_major_univ[['id_major', 'id_university', 'major_name', 'university_name', 'major_univ_name']]

# --- First choice ---
score_humanities = (
    score_humanities
    .merge(major_lookup,
           left_on=['id_first_major', 'id_first_university'],
           right_on=['id_major', 'id_university'])
    .drop(columns=['id_major', 'id_university', 'id_first_major', 'id_first_university'])
    .rename(columns={'major_univ_name': 'specific_first_choice'})
)
# Mean participant score per first-choice programme, broadcast back onto every row
score_humanities['avg_score_first_choice'] = (
    score_humanities.groupby('specific_first_choice')['avg_score'].transform('mean')
)

# --- Second choice ---
# major_name / university_name already exist from the first merge; the suffixes
# turn the clashing columns into *_first_choice and *_second_choice.
score_humanities = (
    score_humanities
    .merge(major_lookup,
           left_on=['id_second_major', 'id_second_university'],
           right_on=['id_major', 'id_university'],
           suffixes=('_first_choice', '_second_choice'))
    .drop(columns=['id_major', 'id_university', 'id_second_major', 'id_second_university'])
    .rename(columns={'major_univ_name': 'specific_second_choice'})
)
score_humanities['avg_score_second_choice'] = (
    score_humanities.groupby('specific_second_choice')['avg_score'].transform('mean')
)

# 'Unnamed: 0' is not used in this project: drop it, then order the rows by participant id
score_humanities = score_humanities.drop(columns='Unnamed: 0').sort_values(by='id_user')
score_humanities.head()

In [None]:
# Label both choices of every science participant with major / university names.
# NOTE: merge() defaults to an inner join, so rows whose (major, university) ids
# have no match in data_major_univ are dropped.
major_lookup = data_major_univ[['id_major', 'id_university', 'major_name', 'university_name', 'major_univ_name']]

# --- First choice ---
score_science = (
    score_science
    .merge(major_lookup,
           left_on=['id_first_major', 'id_first_university'],
           right_on=['id_major', 'id_university'])
    .drop(columns=['id_major', 'id_university', 'id_first_major', 'id_first_university'])
    .rename(columns={'major_univ_name': 'specific_first_choice'})
)
# Mean participant score per first-choice programme, broadcast back onto every row
score_science['avg_score_first_choice'] = (
    score_science.groupby('specific_first_choice')['avg_score'].transform('mean')
)

# --- Second choice ---
# major_name / university_name already exist from the first merge; the suffixes
# turn the clashing columns into *_first_choice and *_second_choice.
score_science = (
    score_science
    .merge(major_lookup,
           left_on=['id_second_major', 'id_second_university'],
           right_on=['id_major', 'id_university'],
           suffixes=('_first_choice', '_second_choice'))
    .drop(columns=['id_major', 'id_university', 'id_second_major', 'id_second_university'])
    .rename(columns={'major_univ_name': 'specific_second_choice'})
)
score_science['avg_score_second_choice'] = (
    score_science.groupby('specific_second_choice')['avg_score'].transform('mean')
)

# 'Unnamed: 0' is not used in this project: drop it, then order the rows by participant id
score_science = score_science.drop(columns='Unnamed: 0').sort_values(by='id_user')
score_science.head()

## Analysis of Numeric and Categorical Data on Major and University Data

#### Distribution Plot of Student Capacity for Each Major

In [None]:
# Distribution of the 'capacity' column; its skewness is shown in the legend
capacity = data_major_univ['capacity']
skew_label = "Skewness : %.2f" % capacity.skew()

fig, ax = plt.subplots(figsize=(8, 4))
fig.tight_layout(pad=5)

# Missing capacities are removed before plotting
sns.distplot(capacity.dropna(), ax=ax, label=skew_label)
ax.set_title('capacity', fontsize=18)
ax.legend(loc='best')

plt.show()

#### Pie Chart of Major Types

In [None]:
# Share of majors per 'type', drawn as a pie chart
type_counts = data_major_univ['type'].value_counts()
sizes = type_counts.tolist()
labels = (type_counts.index + ' major').tolist()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', textprops={'fontsize': 14})
plt.show()

#### Top 10 Majors Offered by the Most Universities & Top 10 Universities by Number of Majors Offered

In [None]:
# Two stacked bar plots built from data_major_univ:
#   top    - the 10 most frequent major names
#   bottom - the 10 most frequent university names

fig, ax = plt.subplots(2, 1, figsize=(14, 8))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

# (axes, column, title, y-axis label, y-axis limits) for each panel
panels = [
    (ax[0], 'major_name', 'Top 10 Most Majors are Provided by Universities', 'No. of Universities', (40, 65)),
    (ax[1], 'university_name', 'Top 10 Universities Based on the Most Number of Majors', 'No. of Majors', (60, 85)),
]

for axis, column, title, ylabel, ylim in panels:
    # Missing names are counted under the literal label 'NaN'
    top10 = data_major_univ[column].fillna('NaN').value_counts().iloc[:10]
    sns.barplot(ax=axis, x=top10.index, y=top10, order=top10.index)
    axis.set_title(title, fontsize=18)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in axis.get_xticklabels()]
    axis.set_xticklabels(wrapped, fontsize=10)
    # Write the count on top of each bar
    for position, total in enumerate(top10):
        axis.text(x=position - 0.1, y=total, s=f"{total}", fontdict=dict(fontsize=10))
    axis.set_ylabel(ylabel, fontsize=14)
    axis.set_ylim(*ylim)

plt.show()

## Analysis of Numeric and Categorical Data on "Score Humanities" Data

### Analysis Numeric Data on "Score Humanities" Data

In [None]:
# Numeric columns of the "Score Humanities" data
num_score_humanities = [
    'score_eko', 'score_geo', 'score_kmb', 'score_kpu', 'score_kua',
    'score_mat', 'score_ppu', 'score_sej', 'score_sos', 'avg_score',
]

# One distribution plot per column on a 5 x 2 grid
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(14, 14))
fig.tight_layout(pad=5)

for panel, column in zip(axes.flatten(), num_score_humanities):
    scores = score_humanities[column]
    # The legend carries the skewness of the column
    sns.distplot(scores.dropna(), ax=panel, label="Skewness : %.2f" % scores.skew())
    panel.set_title(column, fontsize=18)
    panel.legend(loc='best')

plt.show()

In [None]:
# Correlation heatmap of the numeric humanities columns
cormat = score_humanities[num_score_humanities].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cormat, annot=True, ax=ax)
# Label the rows with the column names and keep them horizontal
ax.set_yticklabels(cormat.columns.tolist(), rotation=0)
plt.show()

### Analysis Categorical Data on "Score Humanities" Data

#### Analysis First Choice Major & University on "Score Humanities" Data

In [None]:
# First-choice categorical columns of the "Score Humanities" data,
# with the plot title that belongs to each column
first_choice_humanities = [
    'major_name_first_choice',
    'university_name_first_choice',
]
first_choice_titles = [
    'Top 10 First Choices Humanities Majors',
    'Top 10 First Choices Universities (Humanities Majors)',
]

#### - Barplot Top 10 First Choices  Humanities Majors and Universities

In [None]:
# Bar plots of the 10 most frequent first-choice humanities majors and universities

fig, axes = plt.subplots(len(first_choice_humanities), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), first_choice_humanities, first_choice_titles, ['Major', 'University'])
for panel, column, title, xlabel in panels:
    # Missing names are counted under the literal label 'NaN'
    top10 = score_humanities[column].fillna('NaN').value_counts().iloc[:10]
    sns.barplot(ax=panel, x=top10.index, y=top10, order=top10.index)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    # Write the participant count on top of each bar
    for position, participants in enumerate(top10):
        panel.text(x=position - 0.1, y=participants, s=f"{participants}", fontdict=dict(fontsize=10))
    panel.set_ylabel('No. of Participants', fontsize=14)
    panel.set_ylim(0, 6000)

plt.show()

#### - Boxplot Top 10 First Choices Humanities Majors and Universities Based on Average Score Participants

In [None]:
# Box plots of participant average scores for the 10 most frequent
# first-choice humanities majors and universities

fig, axes = plt.subplots(len(first_choice_humanities), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), first_choice_humanities, first_choice_titles, ['Major', 'University'])
for panel, column, title, xlabel in panels:
    # The ten most frequent categories, most frequent first
    top_values = score_humanities[column].value_counts().sort_values(ascending=False).index[:10]
    # Keep only the participants who chose one of those categories
    top_data = score_humanities.loc[score_humanities[column].isin(top_values)]
    sns.boxplot(data=top_data, x=column, y='avg_score', order=top_values, ax=panel)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    panel.set_ylabel('Participants Average Score', fontsize=14)

plt.show()

#### - Barplot (Counts Participants) and Boxplot (Participants AVG Score) Top 5 First Choices Humanities Majors - Universities

In [None]:
# Top 5 first-choice humanities "major - university" programmes:
#   top    - number of participants per programme (bar plot)
#   bottom - distribution of participant average scores per programme (box plot)

fig, ax = plt.subplots(2, 1, figsize=(10, 8))
fig.tight_layout(pad=6)

# --- Bar plot: participant counts (missing labels are counted as 'NaN') ---
top5_counts = score_humanities['specific_first_choice'].fillna('NaN').value_counts().iloc[:5]
sns.barplot(ax=ax[0], x=top5_counts, y=top5_counts.index, order=top5_counts.index)
ax[0].set_title('Top 5 First Choices Humanities Majors - Universities', fontsize=16)
ax[0].set_xlabel('No. of Participants', fontsize=14)
ax[0].set_ylabel('Humanities Majors - Universities', fontsize=14)
ax[0].set_yticklabels(ax[0].get_yticklabels(), fontsize=8)
# Write the count just past the end of each bar
for position, participants in enumerate(top5_counts):
    ax[0].text(x=participants + 1, y=position + 0.1, s=f"{participants}", fontdict=dict(fontsize=10))

# --- Box plot: average scores of the participants who picked a top-5 programme ---
values = score_humanities['specific_first_choice'].value_counts().sort_values(ascending=False).index[0:5]
top5_data = score_humanities[score_humanities['specific_first_choice'].isin(values)]
# Scores and labels are both read from top5_data so they stay row-aligned;
# no frame left over from another cell is involved.
sns.boxplot(ax=ax[1], data=top5_data, x='avg_score', y='specific_first_choice', order=values)
ax[1].set_title('Top 5 First Choices Humanities Majors - Universities', fontsize=16)
ax[1].set_xlabel('Participants Average Score', fontsize=14)
ax[1].set_ylabel('Humanities Majors - Universities', fontsize=14)
ax[1].set_yticklabels(ax[1].get_yticklabels(), fontsize=8)
plt.show()

#### Analysis Second Choice Major & University on "Score Humanities" Data

In [None]:
# Second-choice categorical columns of the "Score Humanities" data,
# with the plot title that belongs to each column
second_choice_humanities = [
    'major_name_second_choice',
    'university_name_second_choice',
]
second_choice_titles = [
    'Top 10 Second Choices Humanities Majors',
    'Top 10 Second Choices Universities (Humanities Majors)',
]

#### - Barplot Top 10 Second Choices Humanities Majors and Universities

In [None]:
# Bar plots of the 10 most frequent second-choice humanities majors and universities

# One row per plotted column: the grid is sized from the list that is actually
# iterated below (second_choice_humanities), not from the first-choice list.
fig, axes = plt.subplots(len(second_choice_humanities), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), second_choice_humanities, second_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # Missing names are counted under the literal label 'NaN'
    top10 = score_humanities[column].fillna('NaN').value_counts().iloc[:10]
    sns.barplot(ax=panel, x=top10.index, y=top10, order=top10.index)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    # Write the participant count on top of each bar
    for position, participants in enumerate(top10):
        panel.text(x=position - 0.1, y=participants, s=f"{participants}", fontdict=dict(fontsize=10))
    panel.set_ylabel('No. of Participants', fontsize=14)
    panel.set_ylim(0, 5500)

plt.show()

#### - Boxplot Top 10 Second Choices Humanities Majors and Universities Based on Average Score Participants

In [None]:
# Box plots of participant average scores for the 10 most frequent
# second-choice humanities majors and universities

fig, axes = plt.subplots(len(second_choice_humanities), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), second_choice_humanities, second_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # The ten most frequent categories, most frequent first
    top_values = score_humanities[column].value_counts().sort_values(ascending=False).index[:10]
    # Keep only the participants who chose one of those categories
    top_data = score_humanities.loc[score_humanities[column].isin(top_values)]
    sns.boxplot(data=top_data, x=column, y='avg_score', order=top_values, ax=panel)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    panel.set_ylabel('Participants Average Score', fontsize=14)

plt.show()

#### - Barplot (Counts Participants) and Boxplot (Participants AVG Score) Top 5 Second Choices Humanities Majors - Universities

In [None]:
# Top 5 second-choice humanities "major - university" programmes:
#   top    - number of participants per programme (bar plot)
#   bottom - distribution of participant average scores per programme (box plot)

fig, ax = plt.subplots(2, 1, figsize=(10, 8))
fig.tight_layout(pad=6)

# --- Bar plot: participant counts (missing labels are counted as 'NaN') ---
top5_counts = score_humanities['specific_second_choice'].fillna('NaN').value_counts().iloc[:5]
sns.barplot(ax=ax[0], x=top5_counts, y=top5_counts.index, order=top5_counts.index)
ax[0].set_title('Top 5 Second Choices Humanities Majors - Universities', fontsize=16)
ax[0].set_xlabel('No. of Participants', fontsize=14)
ax[0].set_ylabel('Humanities Majors - Universities', fontsize=14)
ax[0].set_yticklabels(ax[0].get_yticklabels(), fontsize=8)
# Write the count just past the end of each bar
for position, participants in enumerate(top5_counts):
    ax[0].text(x=participants + 1, y=position + 0.1, s=f"{participants}", fontdict=dict(fontsize=10))

# --- Box plot: average scores of the participants who picked a top-5 programme ---
values = score_humanities['specific_second_choice'].value_counts().sort_values(ascending=False).index[0:5]
top5_data = score_humanities[score_humanities['specific_second_choice'].isin(values)]
# The frame passed as `data` is the same one the plotted columns come from (top5_data);
# no frame left over from another cell is involved.
sns.boxplot(ax=ax[1], data=top5_data, x='avg_score', y='specific_second_choice', order=values)
ax[1].set_title('Top 5 Second Choices Humanities Majors - Universities', fontsize=16)
ax[1].set_xlabel('Participants Average Score', fontsize=14)
ax[1].set_ylabel('Humanities Majors - Universities', fontsize=14)
ax[1].set_yticklabels(ax[1].get_yticklabels(), fontsize=8)
plt.show()

## Analysis of Numeric and Categorical Data on "Score Science" Data

### Analysis Numeric Data on "Score Science" Data

In [None]:
# Numeric columns of the "Score Science" data
num_score_science = [
    'score_bio', 'score_fis', 'score_kim', 'score_kmb', 'score_kpu',
    'score_kua', 'score_mat', 'score_ppu', 'avg_score',
]

# One distribution plot per column on a 5 x 2 grid
fig, axes = plt.subplots(5, 2, figsize=(14, 14))
fig.tight_layout(pad=5)

panels = axes.flatten()
for panel, column in zip(panels, num_score_science):
    scores = score_science[column]
    # The legend carries the skewness of the column
    sns.distplot(scores.dropna(), ax=panel, label="Skewness : %.2f" % scores.skew())
    panel.set_title(column, fontsize=18)
    panel.legend(loc='best')

# The grid has more panels than there are columns to plot;
# hide the leftover panels so no empty axes are drawn.
for unused in panels[len(num_score_science):]:
    unused.set_visible(False)

plt.show()

In [None]:
# Correlation heatmap of the numeric science columns
cormat = score_science[num_score_science].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cormat, annot=True, ax=ax)
# Label the rows with the column names and keep them horizontal
ax.set_yticklabels(cormat.columns.tolist(), rotation=0)
plt.show()

### Analysis Categorical Data on "Score Science" Data

#### Analysis First Choice Major & University on "Score Science" Data

In [None]:
# First-choice categorical columns of the "Score Science" data,
# with the plot title that belongs to each column.
# NOTE(review): first_choice_titles reuses the name already used for the humanities
# titles, so re-running the humanities plots after this cell picks up these titles.
first_choice_science = [
    'major_name_first_choice',
    'university_name_first_choice',
]
first_choice_titles = [
    'Top 10 First Choices Science Majors',
    'Top 10 First Choices Universities (Science Majors)',
]

#### - Barplot Top 10 First Choices Science Majors and Universities

In [None]:
# Bar plots of the 10 most frequent first-choice science majors and universities

fig, axes = plt.subplots(len(first_choice_science), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), first_choice_science, first_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # Missing names are counted under the literal label 'NaN'
    top10 = score_science[column].fillna('NaN').value_counts().iloc[:10]
    sns.barplot(ax=panel, x=top10.index, y=top10, order=top10.index)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    # Write the participant count on top of each bar
    for position, participants in enumerate(top10):
        panel.text(x=position - 0.1, y=participants, s=f"{participants}", fontdict=dict(fontsize=10))
    panel.set_ylabel('No. of Participants', fontsize=14)
plt.show()

#### - Boxplot Top 10 First Choices Science Majors and Universities Based on Average Score Participants

In [None]:
# Box plots of participant average scores for the 10 most frequent
# first-choice science majors and universities

fig, axes = plt.subplots(len(first_choice_science), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), first_choice_science, first_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # The ten most frequent categories, most frequent first
    top_values = score_science[column].value_counts().sort_values(ascending=False).index[:10]
    # Keep only the participants who chose one of those categories
    top_data = score_science.loc[score_science[column].isin(top_values)]
    sns.boxplot(data=top_data, x=column, y='avg_score', order=top_values, ax=panel)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    panel.set_ylabel('Participants Average Score', fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)

plt.show()

#### - Barplot (Counts Participants) and Boxplot (Participants AVG Score) Top 5 First Choices Science Majors - Universities

In [None]:
# Top 5 first-choice science "major - university" programmes:
#   top    - number of participants per programme (bar plot)
#   bottom - distribution of participant average scores per programme (box plot)

fig, ax = plt.subplots(2, 1, figsize=(10, 8))
fig.tight_layout(pad=6)

# --- Bar plot: participant counts (missing labels are counted as 'NaN') ---
top5_counts = score_science['specific_first_choice'].fillna('NaN').value_counts().iloc[:5]
sns.barplot(ax=ax[0], x=top5_counts, y=top5_counts.index, order=top5_counts.index)
ax[0].set_title('Top 5 First Choices Science Majors - Universities', fontsize=16)
ax[0].set_xlabel('No. of Participants', fontsize=14)
ax[0].set_ylabel('Science Majors - Universities', fontsize=14)
ax[0].set_yticklabels(ax[0].get_yticklabels(), fontsize=8)
# Write the count just past the end of each bar
for position, participants in enumerate(top5_counts):
    ax[0].text(x=participants + 1, y=position + 0.1, s=f"{participants}", fontdict=dict(fontsize=10))

# --- Box plot: average scores of the participants who picked a top-5 programme ---
values = score_science['specific_first_choice'].value_counts().sort_values(ascending=False).index[0:5]
top5_data = score_science[score_science['specific_first_choice'].isin(values)]
# Scores and labels are both read from top5_data so they stay row-aligned;
# no frame left over from another cell is involved.
sns.boxplot(ax=ax[1], data=top5_data, x='avg_score', y='specific_first_choice', order=values)
ax[1].set_title('Top 5 First Choices Science Majors - Universities', fontsize=16)
ax[1].set_xlabel('Participants Average Score', fontsize=14)
ax[1].set_ylabel('Science Majors - Universities', fontsize=14)
ax[1].set_yticklabels(ax[1].get_yticklabels(), fontsize=8)
plt.show()

#### Analysis Second Choice Major & University on "Score Science" Data

In [None]:
# Second-choice categorical columns of the "Score Science" data,
# with the plot title that belongs to each column.
# NOTE(review): second_choice_titles reuses the name already used for the humanities
# titles, so re-running the humanities plots after this cell picks up these titles.
second_choice_science = [
    'major_name_second_choice',
    'university_name_second_choice',
]
second_choice_titles = [
    'Top 10 Second Choices Science Majors',
    'Top 10 Second Choices Universities (Science Majors)',
]

#### - Barplot Top 10 Second Choices Science Majors and Universities

In [None]:
# Bar plots of the 10 most frequent second-choice science majors and universities

# One row per plotted column: the grid is sized from the list that is actually
# iterated below (second_choice_science), not from the first-choice list.
fig, axes = plt.subplots(len(second_choice_science), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), second_choice_science, second_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # Missing names are counted under the literal label 'NaN'
    top10 = score_science[column].fillna('NaN').value_counts().iloc[:10]
    sns.barplot(ax=panel, x=top10.index, y=top10, order=top10.index)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)
    # Write the participant count on top of each bar
    for position, participants in enumerate(top10):
        panel.text(x=position - 0.1, y=participants, s=f"{participants}", fontdict=dict(fontsize=10))
    panel.set_ylabel('No. of Participants', fontsize=14)

plt.show()

#### - Boxplot Top 10 Second Choices Science Majors and Universities Based on Average Score Participants

In [None]:
# Box plots of participant average scores for the 10 most frequent
# second-choice science majors and universities

fig, axes = plt.subplots(len(second_choice_science), 1, figsize=(14, 10))
fig.tight_layout(pad=6)
max_width = 13  # wrap tick labels after this many characters

panels = zip(axes.flatten(), second_choice_science, second_choice_titles, ['Majors', 'Universities'])
for panel, column, title, xlabel in panels:
    # The ten most frequent categories, most frequent first
    top_values = score_science[column].value_counts().sort_values(ascending=False).index[:10]
    # Keep only the participants who chose one of those categories
    top_data = score_science.loc[score_science[column].isin(top_values)]
    sns.boxplot(data=top_data, x=column, y='avg_score', order=top_values, ax=panel)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    panel.set_ylabel('Participants Average Score', fontsize=14)
    wrapped = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped, fontsize=10)

plt.show()

#### - Barplot (Counts Participants) and Boxplot (Participants AVG Score) Top 5 Second Choices Science Majors - Universities

In [None]:
# Top 5 second-choice science "major - university" programmes:
#   top    - number of participants per programme (bar plot)
#   bottom - distribution of participant average scores per programme (box plot)

fig, ax = plt.subplots(2, 1, figsize=(10, 8))
fig.tight_layout(pad=5)

# --- Bar plot: participant counts (missing labels are counted as 'NaN') ---
top5_counts = score_science['specific_second_choice'].fillna('NaN').value_counts().iloc[:5]
sns.barplot(ax=ax[0], x=top5_counts, y=top5_counts.index, order=top5_counts.index)
ax[0].set_title('Top 5 Second Choices Science Majors - Universities', fontsize=16)
ax[0].set_xlabel('No. of Participants', fontsize=14)
ax[0].set_ylabel('Science Majors - Universities', fontsize=14)
ax[0].set_yticklabels(ax[0].get_yticklabels(), fontsize=8)
# Write the count just past the end of each bar
for position, participants in enumerate(top5_counts):
    ax[0].text(x=participants + 1, y=position + 0.1, s=f"{participants}", fontdict=dict(fontsize=10))

# --- Box plot: average scores of the participants who picked a top-5 programme ---
values = score_science['specific_second_choice'].value_counts().sort_values(ascending=False).index[0:5]
top5_data = score_science[score_science['specific_second_choice'].isin(values)]
# The frame passed as `data` is the same one the plotted columns come from (top5_data);
# no frame left over from another cell is involved.
sns.boxplot(ax=ax[1], data=top5_data, x='avg_score', y='specific_second_choice', order=values)
ax[1].set_title('Top 5 Second Choices Science Majors - Universities', fontsize=16)
ax[1].set_xlabel('Participants Average Score', fontsize=14)
ax[1].set_ylabel('Science Majors - Universities', fontsize=14)
ax[1].set_yticklabels(ax[1].get_yticklabels(), fontsize=8)
plt.show()