## Imports

In [26]:
# DO NOT CHANGE THESE LINES.
import os
import json
import pandas as pd
import warnings
from sklearn.linear_model import ElasticNet
from feature_engine.encoding import OneHotEncoder
from joblib import dump
warnings.filterwarnings('ignore')

## Paths

In [27]:
# DO NOT CHANGE THESE LINES.
# Directory layout used by this notebook. Every path is derived from ROOT_DIR,
# which is the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: the schema JSON plus the training and testing data folders.
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Outputs: artifacts serialized with joblib.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
# Create the output directories up front so the later dump() calls have a
# destination folder to write into.
if not os.path.exists(MODEL_ARTIFACTS_PATH):
    os.makedirs(MODEL_ARTIFACTS_PATH)
if not os.path.exists(PREDICTOR_DIR_PATH):
    os.makedirs(PREDICTOR_DIR_PATH)

### Reading the schema
The schema contains metadata about the datasets. We will use the schema to get the type of each feature (NUMERIC or CATEGORICAL) as well as the id and target features. This information will be helpful in the preprocessing stage.

In [28]:
# Locate the schema file (the first entry whose name ends with 'json') and parse it.
schema_candidates = [entry for entry in os.listdir(INPUT_SCHEMA_DIR) if entry.endswith('json')]
file_name = schema_candidates[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as schema_file:
    schema = json.load(schema_file)
features = schema['features']

# Partition the feature names by declared data type: anything that is not
# CATEGORICAL is treated as numeric. Nullable features are tracked separately.
categorical_features = [feat['name'] for feat in features if feat['dataType'] == 'CATEGORICAL']
numeric_features = [feat['name'] for feat in features if feat['dataType'] != 'CATEGORICAL']
nullable_features = [feat['name'] for feat in features if feat['nullable']]

# Names of the identifier column and of the column to predict.
id_feature = schema['id']['name']
target_feature = schema['target']['name']

### Reading training data

In [29]:
# Load the first CSV found in the training directory and preview it.
training_csvs = [entry for entry in os.listdir(TRAIN_DIR) if entry.endswith('.csv')]
file_name = training_csvs[0]
file_path = os.path.join(TRAIN_DIR, file_name)
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,carnegie_basic_classification,...,sat_math_midrange,sat_total_average,sat_verbal_midrange,sat_writing_midrange,spend_per_student,state,tuition_(instate),tuition_(out_of_state),undergrad_size,percent_pell_grant
0,459523,,,,,,,,9083.0,,...,,,,,4702.0,TX,,,35.0,0.8857
1,123341,,,,,,0.48,13157.0,,Associate\s--Public Suburban-serving Multicampus',...,,,,,3674.0,CA,1150.0,6190.0,17195.0,0.3146
2,172495,,,,,,0.46,,17013.0,,...,,,,,2015.0,MI,,,111.0,0.652
3,419457,,,,,,0.62,23317.0,,Special Focus Institutions--Other health profe...,...,,,,,6234.0,VA,13663.0,13663.0,126.0,0.5495
4,455725,,,,,,,,15564.0,,...,,,,,3366.0,AZ,,,363.0,0.8227


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> The median value will be used to impute missing values of the numeric features and the mode will be used to impute categorical features.

##### You can add your own preprocessing steps such as:
<ul>
<li>Normalization</li> <br>
<li>Outlier removal</li><br>
<li>Handling imbalanced classes</li><br>
<li>Dropping or adding features</li><br>
</ul>

### Important note:
<p> 
Saving the values used for imputation during the training step is crucial. These values will be used to impute missing data in the testing set, which is essential to avoid the well-known problem of data leakage. During testing, you should not make any assumptions about the data at hand; instead, anything needed during the testing phase should be learned during the training phase. This is why we create a dictionary of the values used during training and reuse them during testing.
</p>


In [30]:
# Imputing missing data
# For every nullable column, learn a fill value from the training data
# (median for numeric columns, most frequent value otherwise), apply it to the
# frame, and record it so that the same value can be reused at prediction time.
imputation_values = {}
for column in nullable_features:
    if column in numeric_features:
        value = df[column].median()
    else:
        value = df[column].mode()[0]
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the selection df[column] is chained
    # assignment: under pandas Copy-on-Write it only changes a temporary
    # object and leaves df untouched, and the warning about it is hidden by
    # the notebook-wide warnings filter.
    df[column] = df[column].fillna(value)
    imputation_values[column] = value

# Persist the learned fill values for the prediction notebook.
dump(imputation_values, IMPUTATION_FILE)

# Comment out the above code and write your own imputation code here


#BEGIN

#CODE HERE

#END

['/Users/moo/Desktop/Upwork/rt-ML/Jupyter-Notebook-Python-Regression-Template/model_inputs_outputs/model/artifacts/imputation.joblib']

##### Encoding Categorical features
<p>
The id column is just an identifier for the training example, so we will exclude it during the encoding phase.<br>
The target feature is kept as-is: this is a regression task, so the target does not need to be label encoded.
</p>


In [31]:
# Keep the identifier and the target aside; neither is a model input.
ids = df[id_feature]
target = df[target_feature]

# Remove both columns so that only predictor columns remain in the frame.
df = df.drop(columns=[id_feature, target_feature])

# Cast every categorical column to str so that categories that look like
# numbers (1, 2, ...) are still picked up by the encoder.
df = df.astype({name: str for name in categorical_features})

# One-hot encode the categorical columns, if the schema declares any.
if categorical_features:
    encoder = OneHotEncoder(top_categories=6)
    encoder.fit(df)
    df = encoder.transform(df)

    # Persist the fitted encoder so the testing data is encoded identically.
    path = dump(encoder, OHE_ENCODER_FILE)

### Training the Regressor
We choose an ElasticNet model (linear regression with combined L1 and L2 regularization), but feel free to try your own and compare the results.

In [32]:
# Fit an ElasticNet regressor with default hyperparameters on the encoded
# features; fit() returns the fitted estimator itself.
model = ElasticNet().fit(df, target)

# BEGIN

# model = ...

# END

# Persist the trained model so it can be loaded for predictions.
path = dump(model, PREDICTOR_FILE_PATH)
