## Student dropout rate

Predict students' dropout and academic success.

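Download and unzip the data from [here](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success). Rename the data file to `student_dropout.csv` and place it in the parent directory of this notebook, which is where the loading cell below reads it from (`../student_dropout.csv`).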

In [1]:
import numpy as np
import pandas as pd
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder

In [2]:
# The raw export is semicolon-separated, so the delimiter must be given explicitly.
raw_csv_path = "../student_dropout.csv"
data = pd.read_csv(raw_csv_path, sep=";")

# Peek at the first rows to confirm the columns were split correctly.
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
# Binary target: 1 when the recorded outcome is "Dropout", 0 for every other outcome.
is_dropout = data["Target"] == "Dropout"
data["dropout"] = np.where(is_dropout, 1, 0)

# The text label is now represented by the binary flag, so remove it.
data = data.drop(columns=["Target"])

# Share of students flagged as dropouts.
data["dropout"].mean()

0.3212025316455696

In [4]:
# Readable labels for the integer marital-status codes.
# Series.map yields NaN for any code that has no entry in the dict.
marital_status_labels = {
    1: "single",
    2: "married",
    3: "widower",
    4: "divorced",
    5: "facto union",
    6: "legally separated",
}
data["Marital status"] = data["Marital status"].map(marital_status_labels)

In [5]:
# Readable labels for the integer application-mode codes.
# Series.map yields NaN for any code that has no entry in the dict.
application_mode_labels = {
    1: "1st phase - general contingent",
    2: "Ordinance No. 612/93",
    5: "1st phase - special contingent (Azores Island)",
    7: "Holders of other higher courses",
    10: "Ordinance No. 854-B/99",
    15: "International student (bachelor)",
    16: "1st phase - special contingent (Madeira Island)",
    17: "2nd phase - general contingent",
    18: "3rd phase - general contingent",
    26: "Ordinance No. 533-A/99, item b2) (Different Plan)",
    27: "Ordinance No. 533-A/99, item b3 (Other Institution)",
    39: "Over 23 years old",
    42: "Transfer",
    43: "Change of course",
    44: "Technological specialization diploma holders",
    51: "Change of institution/course",
    53: "Short cycle diploma holders",
    57: "Change of institution/course (International)",
}
data["Application mode"] = data["Application mode"].map(application_mode_labels)

In [6]:
# Readable course names for the integer course codes.
# Series.map yields NaN for any code that has no entry in the dict.
course_names = {
    33: "Biofuel Production Technologies",
    171: "Animation and Multimedia Design",
    8014: "Social Service (evening attendance)",
    9003: "Agronomy",
    9070: "Communication Design",
    9085: "Veterinary Nursing",
    9119: "Informatics Engineering",
    9130: "Equinculture",
    9147: "Management",
    9238: "Social Service",
    9254: "Tourism",
    9500: "Nursing",
    9556: "Oral Hygiene",
    9670: "Advertising and Marketing Management",
    9773: "Journalism and Communication",
    9853: "Basic Education",
    9991: "Management (evening attendance)",
}
data["Course"] = data["Course"].map(course_names)

In [7]:
# Readable labels for the integer previous-qualification codes.
# Series.map yields NaN for any code that has no entry in the dict.
previous_qualification_labels = {
    1: "Secondary education",
    2: "Higher education - bachelor's degree",
    3: "Higher education - degree",
    4: "Higher education - master's",
    5: "Higher education - doctorate",
    6: "Frequency of higher education",
    9: "12th year of schooling - not completed",
    10: "11th year of schooling - not completed",
    12: "Other - 11th year of schooling",
    14: "10th year of schooling",
    15: "10th year of schooling - not completed",
    19: "Basic education 3rd cycle (9th/10th/11th year) or equiv.",
    38: "Basic education 2nd cycle (6th/7th/8th year) or equiv.",
    39: "Technological specialization course",
    40: "Higher education - degree (1st cycle)",
    42: "Professional higher technical course",
    43: "Higher education - master (2nd cycle)",
}
data["Previous qualification"] = data["Previous qualification"].map(
    previous_qualification_labels
)

In [8]:
# Readable nationality labels for the integer codes. The column name keeps
# the spelling used in the data file ("Nacionality").
# Series.map yields NaN for any code that has no entry in the dict.
nationality_labels = {
    1: "Portuguese",
    2: "German",
    6: "Spanish",
    11: "Italian",
    13: "Dutch",
    14: "English",
    17: "Lithuanian",
    21: "Angolan",
    22: "Cape Verdean",
    24: "Guinean",
    25: "Mozambican",
    26: "Santomean",
    32: "Turkish",
    41: "Brazilian",
    62: "Romanian",
    100: "Moldova (Republic of)",
    101: "Mexican",
    103: "Ukrainian",
    105: "Russian",
    108: "Cuban",
    109: "Colombian",
}
data["Nacionality"] = data["Nacionality"].map(nationality_labels)

In [9]:
# Parental qualification and occupation columns are removed before encoding,
# so they do not appear in either exported dataset.
parental_columns = [
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]
data = data.drop(columns=parental_columns)

In [10]:
# Preview the frame after decoding the categorical codes and dropping the parental columns.
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Admission grade,Displaced,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,dropout
0,single,2nd phase - general contingent,5,Animation and Multimedia Design,1,Secondary education,122.0,Portuguese,127.3,1,...,0,0,0,0,0.0,0,10.8,1.4,1.74,1
1,single,International student (bachelor),1,Tourism,1,Secondary education,160.0,Portuguese,142.5,1,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,0
2,single,1st phase - general contingent,5,Communication Design,1,Secondary education,122.0,Portuguese,124.8,1,...,0,6,0,0,0.0,0,10.8,1.4,1.74,1
3,single,2nd phase - general contingent,2,Journalism and Communication,1,Secondary education,122.0,Portuguese,119.6,1,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,0
4,married,Over 23 years old,1,Social Service (evening attendance),0,Secondary education,100.0,Portuguese,141.5,0,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,0


In [11]:
# One-hot encode `data` using feature_engine's default variable selection.
# drop_last=True requests k-1 dummy columns per encoded variable instead of k.
one_hot_encoder = OneHotEncoder(drop_last=True)
data_ohe = one_hot_encoder.fit_transform(data)

# Rows and columns after the dummy expansion.
data_ohe.shape

(4424, 102)

In [12]:
# Save the one-hot encoded frame; index=False keeps the row index out of the file.
data_ohe.to_csv("../student_dropout_logit.csv", index=False)

In [13]:
# Integer-encode `data` using feature_engine's default variable selection.
# encoding_method="arbitrary" assigns the integers without reference to the target.
ordinal_encoder = OrdinalEncoder(encoding_method="arbitrary")
data_ordinal = ordinal_encoder.fit_transform(data)

# Column count is unchanged because each encoded variable stays a single column.
data_ordinal.shape

(4424, 33)

In [14]:
# Save the ordinal-encoded frame. This cell previously wrote `data_ohe`, so
# the "trees" file was a copy of the one-hot "logit" file and the ordinal
# encoding was never saved. index=False keeps the row index out of the file.
data_ordinal.to_csv("../student_dropout_trees.csv", index=False)