In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set(style = 'whitegrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Ujian Tulis Berbasis Komputer 2019

Ujian Tulis Berbasis Komputer (UTBK) merupakan tes masuk ke perguruan tinggi yang dilaksanakan oleh Lembaga Tes Masuk Perguruan Tinggi (LTMPT) sebagai satu-satunya lembaga penyelenggara tes perguruan tinggi terstandar di Indonesia.

## Read data

In [None]:
# All four CSV files are read from the same dataset folder.
DATA_DIR = "../input/indonesia-college-entrance-examination-utbk-2019"

jurusan = pd.read_csv(f"{DATA_DIR}/majors.csv")
skor_soshum = pd.read_csv(f"{DATA_DIR}/score_humanities.csv")
skor_saintek = pd.read_csv(f"{DATA_DIR}/score_science.csv")
univ = pd.read_csv(f"{DATA_DIR}/universities.csv")

In [None]:
# Summarise each raw table: its structure first, then its first rows.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called as its own statement. Passing it to print() would print the literal
# "None" in its place and emit the report before the table's heading.
for judul, tabel in (
    ("JURUSAN", jurusan),
    ("SKOR SOSHUM", skor_soshum),
    ("SKOR SAINTEK", skor_saintek),
    ("UNIVERSITAS", univ),
):
    print(judul + ": ")
    tabel.info()
    print(tabel.head(), "\n\n")

## Mencari Rata-rata Nilai

In [None]:
# Per-row mean over a positional slice of columns, stored as "rata_rata".
# NOTE(review): the slices assume the subject-score columns sit at positions
# 6-14 (SOSHUM) and 6-13 (SAINTEK) — confirm against the CSV headers.

#SOSHUM
skor_soshum["rata_rata"] = skor_soshum.iloc[:, 6:15].mean(axis=1)

#SAINTEK
# Average the SAINTEK frame's own columns. Reading from skor_soshum here would
# store humanities means (aligned by row index) in the science table.
skor_saintek["rata_rata"] = skor_saintek.iloc[:, 6:14].mean(axis=1)

In [None]:
# Preview the first five values of skor_soshum's "rata_rata" column.
skor_soshum["rata_rata"].head(5)

In [None]:
# Preview the first five values of skor_saintek's "rata_rata" column.
skor_saintek["rata_rata"].head(5)

## Merge Univ - Jurusan

In [None]:
# Join every major to its university and build one combined
# "<major_name> - <university_name>" label per row.
univ_jur = univ.merge(jurusan, on="id_university").assign(
    univ_prodi=lambda tabel: tabel["major_name"] + " - " + tabel["university_name"]
)
univ_jur.head()

## Banyaknya Tipe Jurusan

In [None]:
# Share of rows in univ_jur per value of the "type" column.
data = univ_jur["type"].value_counts()

# value_counts() orders its result by frequency, so the legend entries are built
# from the index of `data`. A hard-coded label list would be attached to the
# wrong wedge whenever the frequency order differs from the list order.
# NOTE(review): assumes the raw values are "science" and "humanities" — confirm
# against majors.csv. Any other value is shown unchanged.
NAMA_TIPE = {"science": "SAINTEK", "humanities": "SOSHUM"}
label_tipe = [NAMA_TIPE.get(tipe, str(tipe)) for tipe in data.index]

fig, ax = plt.subplots(figsize=(15,5))
ax.pie(data, autopct='%1.1f%%')
ax.legend(label_tipe,
          title = "Tipe Jurusan",
          bbox_to_anchor=(1,1))

## Kapasitas Jurusan

In [None]:
# Ten rows of univ_jur with the largest capacity.
# The label rewrite runs inside .assign() on the freshly built frame, because
# assigning a column onto the result of .head() can raise SettingWithCopyWarning.
data_capacity = (
    univ_jur.loc[:, ["capacity", "univ_prodi"]]
    .sort_values(by="capacity", ascending=False)
    .head(10)
    .assign(
        # Put every word on its own line so the long labels fit under the bars.
        univ_prodi=lambda tabel: tabel["univ_prodi"]
        .str.replace(" - ", "\n", regex=False)
        .str.replace(" ", "\n", regex=False)
    )
)

fig, ax = plt.subplots(2, 1, figsize=(17,15))
fig.tight_layout(pad=7)

# Top panel: capacity of the ten largest programs.
sns.barplot(ax = ax[0], x = data_capacity['univ_prodi'], y = data_capacity['capacity'])
ax[0].set_title("10 Program Studi dengan Kapasitas Terbanyak (2019)", fontsize=20)
ax[0].set_ylabel("Kapasitas", fontsize=15)
ax[0].set_xlabel("")

# Bottom panel: distribution of capacity over all rows of univ_jur.
sns.histplot(ax = ax[1], data = univ_jur["capacity"])
ax[1].set_title("Distribusi Kapasitas", fontsize=20)
# Each row of univ_jur is one program (a major at a university), so the bar
# heights count programs, not universities.
ax[1].set_ylabel("Jumlah Program Studi", fontsize=15)
ax[1].set_xlabel("Kapasitas", fontsize=15)
# The single legend entry is used to display the skewness of the distribution.
ax[1].legend(["Skewness : %.2f"%(univ_jur["capacity"].skew())],
             bbox_to_anchor=(0.95, 0.95),
             fontsize=15)

# SAINTEK (Sains dan Teknologi)

In [None]:
# Attach the "univ_prodi" label of the first-choice program to every SAINTEK row.
# The join keys are dropped afterwards, leaving id_user, rata_rata and univ_prodi.
saintek_1 = (
    skor_saintek[["id_user", "id_first_major", "id_first_university", "rata_rata"]]
    .merge(
        univ_jur[["id_university", "id_major", "univ_prodi"]],
        left_on=["id_first_university", "id_first_major"],
        right_on=["id_university", "id_major"],
    )
    .drop(columns=["id_major", "id_university", "id_first_major", "id_first_university"])
)

# Same join for the second-choice program.
saintek_2 = (
    skor_saintek[["id_user", "id_second_major", "id_second_university", "rata_rata"]]
    .merge(
        univ_jur[["id_university", "id_major", "univ_prodi"]],
        left_on=["id_second_university", "id_second_major"],
        right_on=["id_university", "id_major"],
    )
    .drop(columns=["id_major", "id_university", "id_second_major", "id_second_university"])
)

### Pilihan 1 (SAINTEK)

In [None]:
# First rows of the first-choice table, ordered by id_user (ascending).
saintek_1.sort_values("id_user").head()

### Pilihan 2 (SAINTEK)

In [None]:
# First rows of the second-choice table, ordered by id_user (ascending).
saintek_2.sort_values("id_user").head()

In [None]:
# Combine both choices into one row per (id_user, rata_rata) pair. The merge
# suffixes the two clashing "univ_prodi" columns with _x / _y, which are then
# renamed to prod_pil_1 / prod_pil_2 (errors="raise" fails loudly if either
# expected column is missing).
total_saintek = (
    saintek_1
    .merge(saintek_2, on=["id_user", "rata_rata"])
    .rename(
        columns={"univ_prodi_x": "prod_pil_1", "univ_prodi_y": "prod_pil_2"},
        errors="raise",
    )
    .sort_values(by="id_user")
)
total_saintek.head()

### Distribusi Nilai SAINTEK

In [None]:
# Histogram of the "rata_rata" column of total_saintek. The single legend entry
# is used to display the skewness of the distribution.
rata_saintek = total_saintek["rata_rata"]

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=rata_saintek, ax=ax)
ax.set_title("Distribusi Nilai UTBK SAINTEK", fontsize=15)
ax.legend(
    ["Skewness : %.2f" % rata_saintek.skew()],
    bbox_to_anchor=(0.95, 0.95),
)
ax.set_ylabel("Pengikut")
ax.set_xlabel("Rata-rata")

### Top 10 Pilihan 1 dan 2 SAINTEK

In [None]:
# Top 10 first and second choices (SAINTEK)
# For each choice column, count how often every program label occurs and keep
# the ten most frequent ones as a frame with columns [<choice column>, "count"].
# value_counts() already returns the counts sorted from most to least frequent.
# The label rewrite runs inside .assign(), because assigning a column onto the
# result of .head() can raise SettingWithCopyWarning.
top10saintek_pil1, top10saintek_pil2 = (
    total_saintek[kolom]
    .value_counts()
    .head(10)
    .rename_axis(kolom)
    .reset_index(name="count")
    .assign(**{
        # Put every word on its own line so the long labels fit under the bars.
        kolom: lambda tabel: tabel[kolom]
        .str.replace(" - ", "\n", regex=False)
        .str.replace(" ", "\n", regex=False)
    })
    for kolom in ("prod_pil_1", "prod_pil_2")
)

fig, ax = plt.subplots(2, 1, figsize=(20,17))
fig.tight_layout(pad=12)

# One bar chart per choice: first choice on top, second choice below.
for panel, top10, kolom, urutan in (
    (ax[0], top10saintek_pil1, "prod_pil_1", 1),
    (ax[1], top10saintek_pil2, "prod_pil_2", 2),
):
    sns.barplot(ax = panel, x = top10[kolom], y = top10["count"])
    panel.set_title("10 Teratas Pilihan %d SAINTEK" % urutan, fontsize=17)
    panel.set_ylabel("Peminat")
    panel.set_xlabel("Prodi Pilihan %d" % urutan)

# SOSHUM (Sosial dan Humaniora)

In [None]:
# Attach the "univ_prodi" label of the first-choice program to every SOSHUM row.
# The join keys are dropped afterwards, leaving id_user, rata_rata and univ_prodi.
soshum_1 = (
    skor_soshum[["id_user", "id_first_major", "id_first_university", "rata_rata"]]
    .merge(
        univ_jur[["id_university", "id_major", "univ_prodi"]],
        left_on=["id_first_university", "id_first_major"],
        right_on=["id_university", "id_major"],
    )
    .drop(columns=["id_major", "id_university", "id_first_major", "id_first_university"])
)

# Same join for the second-choice program.
soshum_2 = (
    skor_soshum[["id_user", "id_second_major", "id_second_university", "rata_rata"]]
    .merge(
        univ_jur[["id_university", "id_major", "univ_prodi"]],
        left_on=["id_second_university", "id_second_major"],
        right_on=["id_university", "id_major"],
    )
    .drop(columns=["id_major", "id_university", "id_second_major", "id_second_university"])
)

### Pilihan 1 (SOSHUM)

In [None]:
# First rows of the first-choice table, ordered by id_user (ascending).
soshum_1.sort_values("id_user").head()

### Pilihan 2 (SOSHUM)

In [None]:
# First rows of the second-choice table, ordered by id_user (ascending).
soshum_2.sort_values("id_user").head()

In [None]:
# Combine both choices into one row per (id_user, rata_rata) pair. The merge
# suffixes the two clashing "univ_prodi" columns with _x / _y, which are then
# renamed to prod_pil_1 / prod_pil_2 (errors="raise" fails loudly if either
# expected column is missing).
total_soshum = (
    soshum_1
    .merge(soshum_2, on=["id_user", "rata_rata"])
    .rename(
        columns={"univ_prodi_x": "prod_pil_1", "univ_prodi_y": "prod_pil_2"},
        errors="raise",
    )
    .sort_values(by="id_user")
)
total_soshum.head()

### Distribusi Nilai SOSHUM

In [None]:
# Histogram of the "rata_rata" column of total_soshum. The single legend entry
# is used to display the skewness of the distribution.
rata_soshum = total_soshum["rata_rata"]

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=rata_soshum, ax=ax)
ax.set_title("Distribusi Nilai UTBK SOSHUM", fontsize=15)
ax.legend(
    ["Skewness : %.2f" % rata_soshum.skew()],
    bbox_to_anchor=(0.95, 0.95),
)
ax.set_ylabel("Pengikut")
ax.set_xlabel("Rata-rata")

### Top 10 Pilihan 1 dan 2 SOSHUM

In [None]:
# Top 10 first and second choices (SOSHUM)
# For each choice column, count how often every program label occurs and keep
# the ten most frequent ones as a frame with columns [<choice column>, "count"].
# value_counts() already returns the counts sorted from most to least frequent.
# The label rewrite runs inside .assign(), because assigning a column onto the
# result of .head() can raise SettingWithCopyWarning.
top10soshum_pil1, top10soshum_pil2 = (
    total_soshum[kolom]
    .value_counts()
    .head(10)
    .rename_axis(kolom)
    .reset_index(name="count")
    .assign(**{
        # Put every word on its own line so the long labels fit under the bars.
        kolom: lambda tabel: tabel[kolom]
        .str.replace(" - ", "\n", regex=False)
        .str.replace(" ", "\n", regex=False)
    })
    for kolom in ("prod_pil_1", "prod_pil_2")
)

fig, ax = plt.subplots(2, 1, figsize=(20,17))
fig.tight_layout(pad=12)

# One bar chart per choice: first choice on top, second choice below.
for panel, top10, kolom, urutan in (
    (ax[0], top10soshum_pil1, "prod_pil_1", 1),
    (ax[1], top10soshum_pil2, "prod_pil_2", 2),
):
    sns.barplot(ax = panel, x = top10[kolom], y = top10["count"])
    panel.set_title("10 Teratas Pilihan %d SOSHUM" % urutan, fontsize=17)
    panel.set_ylabel("Peminat")
    panel.set_xlabel("Prodi Pilihan %d" % urutan)