In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from util import dict_train_test_split

In [2]:
%load_ext memory_profiler

In [3]:
# Read the tabular features and the image array from the data directory
tabular_path = os.path.join('data', 'tabular.csv')
images_path = os.path.join('data', 'images.npy')

df = pd.read_csv(tabular_path)
with open(images_path, 'rb') as f:
    images = np.load(f)

# Every column other than the target is treated as a feature
X_columns = df.columns[df.columns != 'target'].tolist()

# Bundle both inputs into X_dict; the target column becomes y
X_dict = {'tabular': df[X_columns], 'images': images}
y = df['target']

In [4]:
# Sanity check: (n_rows, n_columns) of the tabular features, i.e. df without the 'target' column.
X_dict['tabular'].shape

(357699, 60)

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression


def drop_outliers(tabular, y):
    """
    Remove rows whose target value is an outlier under the 1.5 * IQR rule.

    A row is kept when its target lies inside the closed interval
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], where Q1 and Q3 are the 25th and 75th
    percentiles of ``y``. Rows whose target is NaN are dropped.

    Parameters
    ----------
    tabular : pandas DataFrame of shape (n_samples, n_features)
        Feature rows. Must share its index with ``y`` so the boolean mask
        built from ``y`` selects the matching rows.
    y : pandas Series of shape (n_samples,)
        Target values used to decide which rows are kept.

    Returns
    -------
    tuple of (pandas DataFrame, pandas Series)
        ``tabular`` and ``y`` restricted to the non-outlier rows.
    """
    q1, q3 = y.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # The fences are inclusive: with strict comparisons a zero IQR (e.g. a
    # constant target) makes both bounds equal to that value and every row
    # would be discarded, leaving nothing to train on.
    keep = y.between(lower_bound, upper_bound)
    return tabular.loc[keep], y.loc[keep]

class Model:
    """
    Linear-regression model trained on the tabular part of the input.

    The estimator is a scikit-learn ``Pipeline``: numerical columns are
    median-imputed and standardised, categorical columns are imputed with the
    most frequent value and one-hot encoded, and the result is fed to
    ``LinearRegression``. The ``images`` entry of the input dictionary is not
    used.
    """

    def __init__(self, learning_rate=0.01, sample_frac=0.3, random_state=2109):
        """
        Constructor for Model class.

        Parameters
        ----------
        learning_rate : float, default=0.01
            Stored on the instance; the LinearRegression pipeline built here
            does not read it.
        sample_frac : float or None, default=0.3
            Fraction of the training rows that ``fit`` samples before
            training. ``None`` disables sub-sampling.
        random_state : int, default=2109
            Seed passed to the row sampling in ``fit``.
        """
        self.learning_rate = learning_rate
        self.sample_frac = sample_frac
        self.random_state = random_state

        # Columns discarded before preprocessing.
        self.nancolumns = ['V39', 'V38', 'V15', 'V5', 'V47']
        self.numerical_features = ['V0', 'V1', 'V10', 'V11', 'V13', 'V14', 'V16', 'V17', 'V18', 'V2',
                            'V22', 'V25', 'V26', 'V27', 'V28', 'V3', 'V30', 'V32', 'V33', 'V34',
                            'V35', 'V4', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V48', 'V49',
                            'V50', 'V53', 'V56', 'V57', 'V59', 'V6', 'V7', 'V8']
        self.categorical_features = ['V9', 'V12', 'V19', 'V20', 'V21', 'V23', 'V24', 'V29', 'V31', 'V36',
                            'V37', 'V46', 'V51', 'V52', 'V54', 'V55', 'V58']

        self.numeric_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )

        self.cat_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Sparse output is the encoder's default. The explicit
                # `sparse=True` keyword was renamed to `sparse_output` and
                # later removed from scikit-learn, so it is left out to keep
                # the same behaviour on every version.
                ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        self.preprocessor = ColumnTransformer(
            transformers=[
                ("num", self.numeric_transformer, self.numerical_features),
                ("cat", self.cat_transformer, self.categorical_features),
            ]
        )

        self.model = Pipeline(
            steps=[
                ("preprocessor", self.preprocessor),
                ("regressor", LinearRegression()),
            ]
        )

    def _prepare_tabular(self, tabular):
        """Return a new frame without the discarded columns and with feature dtypes set."""
        # errors="ignore": the discarded columns are never used afterwards,
        # so a frame that already lacks them is accepted as-is.
        features = tabular.drop(columns=self.nancolumns, errors="ignore")

        dtypes = {col: np.float32 for col in self.numerical_features}
        dtypes.update({col: object for col in self.categorical_features})
        # astype returns a new frame, so the caller's DataFrame is untouched.
        return features.astype(dtypes)

    def fit(self, X_dict, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Training data. Only the ``tabular`` entry is used.
        y : pandas Series of shape (n_samples,)
            Target values, indexed by the same labels as ``tabular``.

        Returns
        -------
        self : object
            Returns an instance of the trained model.
        """
        tabular = X_dict['tabular']
        if self.sample_frac is not None:
            # Train on a reproducible random subset of the rows.
            tabular = tabular.sample(frac=self.sample_frac, random_state=self.random_state)

        # Pick the targets belonging to the selected rows by index label.
        y = y.loc[tabular.index]

        tabular = self._prepare_tabular(tabular)
        # Outlier removal is applied to the training data only.
        tabular, y = drop_outliers(tabular, y)

        self.model.fit(tabular, y)
        return self

    def predict(self, X_dict):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Input data. Only the ``tabular`` entry is used.

        Returns
        -------
        array-like of shape (n_samples,)
            Predicted target values, one per row of ``X_dict['tabular']``,
            exactly as returned by the underlying pipeline's ``predict``.
        """
        tabular = self._prepare_tabular(X_dict['tabular'])
        return self.model.predict(tabular)



In [6]:
%%time
%%memit

# Hold out part of the data for evaluation
X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(
    X_dict, y, ratio=0.9
)

# Fit on the training portion, then predict the held-out portion
model = Model().fit(X_dict_train, y_train)
y_pred = model.predict(X_dict_test)

# Evaluate model prediction
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
mse = mean_squared_error(y_test, y_pred)
print("MSE: {0:.2f}".format(mse))

MSE: 2932.05
peak memory: 742.37 MiB, increment: 162.56 MiB
CPU times: total: 1min 9s
Wall time: 13.1 s
