# Math Grade Predictor

### Imports

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Utility Functions

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Preprocessing

In [3]:
# Reading initial data
df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')

# Setting the target name
target_name = 'IN_TREINEIRO'

In [4]:
# Matching train columns with test columns
df_train_proc = df_train[df_test.columns.append(pd.Index([target_name]))]

In [5]:
# Split train and test
y = df_train_proc[target_name]
X = df_train_proc.drop(columns=[target_name])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# Pipeline to fill null values
fill_na_cols = ['TP_SEXO', 'Q001', 'Q002', 'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047']
fill_na_pipeline = Pipeline([('fillna', SimpleImputer(strategy='constant', fill_value='0'))])
onehot_pipeline = Pipeline([('onehot', OneHotEncoder())])

# Pipeline to impute median
median_cols = ['TP_ENSINO', 'TP_DEPENDENCIA_ADM_ESC', 'TP_STATUS_REDACAO']
median_pipeline = Pipeline([('median', SimpleImputer(strategy='median'))])

# Pipeline to impute mean
mean_cols = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 
             'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']
mean_pipeline = Pipeline([('mean', SimpleImputer(strategy='mean'))])

# Dropping NU_INSCRICAO column because of it's high variance.
# Dropping SG_UF_RESIDENCIA column because CO_UF_RESIDENCIA is a numerical reference to it.
drop_cols = ['NU_INSCRICAO', 'SG_UF_RESIDENCIA']

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('drop', 'drop', drop_cols),
    ('fillna', fill_na_pipeline, fill_na_cols),
    ('median', median_pipeline, median_cols),
    ('mean',  mean_pipeline, mean_cols)
], remainder='passthrough')

# Training pipeline
std_pca_lasso_pipeline = Pipeline([
    ('std', StandardScaler(with_mean=False)),
    ('smt', SMOTE(sampling_strategy="minority")),
    ('rfc', RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_features=5))
])

# Unifying the pipelines into one
pipeline = make_pipeline(
    preprocessor,
    OneHotEncoder(handle_unknown='ignore'),
    std_pca_lasso_pipeline
)

### Training

In [7]:
# Training model
pipe = pipeline.fit(X_train, y_train)

In [8]:
# Predicting Math Grades
y_final = pipeline.predict(df_test)

In [9]:
pipeline.score(X_test, y_test)

0.9660111677591648

### Preparing the output file

In [10]:
# Preparing and creating the answer.csv
answer = pd.DataFrame(df_test['NU_INSCRICAO'])
answer = pd.concat([answer, pd.Series(y_final)], axis=1, join='inner')
answer.columns = ['NU_INSCRICAO', target_name]
answer.to_csv('answer.csv', index=False)
answer.head()

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
