In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn import decomposition

class Model:
    """
    Regression model over a tabular frame plus a stack of images.

    The tabular columns and the flattened image pixels are joined into one
    frame, preprocessed per column group, reduced with PCA and fed to a
    random-forest regressor.
    """

    def __init__(self):
        """
        Constructor for Model class.

        Only column-name configuration is set up here; the scikit-learn
        pipeline itself is created in ``fit`` and stored in ``self.model``.
        """
        # Tabular columns that are dropped before any modelling.
        # NOTE(review): the name suggests these are mostly-NaN columns - confirm.
        self.nancolumns = ['V39', 'V38', 'V15', 'V5', 'V47']

        # Columns routed through the numeric branch (median impute, scale, PCA).
        self.numerical_features = [
            'V0', 'V1', 'V10', 'V11', 'V13', 'V14', 'V16', 'V17', 'V18', 'V2',
            'V22', 'V25', 'V26', 'V27', 'V28', 'V3', 'V30', 'V32', 'V33', 'V34',
            'V35', 'V4', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V48', 'V49',
            'V50', 'V53', 'V56', 'V57', 'V59', 'V6', 'V7', 'V8',
        ]

        # All categorical columns (the union of the two groups below); used
        # together with the numeric columns to decide which training rows are
        # complete.
        self.categorical_features = [
            'V9', 'V12', 'V19', 'V20', 'V21', 'V23', 'V24', 'V29', 'V31', 'V36',
            'V37', 'V46', 'V51', 'V52', 'V54', 'V55', 'V58',
        ]

        # Categorical columns that are ordinal-encoded.
        self.large_cat_features = ['V12', 'V21', 'V24', 'V37']
        # Categorical columns that are one-hot encoded.
        self.small_cat_features = [
            'V9', 'V19', 'V20', 'V23', 'V29', 'V31', 'V36',
            'V46', 'V51', 'V52', 'V54', 'V55', 'V58',
        ]

    def _combine_inputs(self, X_dict):
        """
        Build the single feature frame shared by ``fit`` and ``predict``.

        Parameters
        ----------
        X_dict : dictionary with the entries ``tabular`` and ``images``.

        Returns
        -------
        combined : pandas DataFrame
            Tabular columns (minus ``self.nancolumns``) followed by one column
            per flattened image pixel, on a fresh 0..n-1 index.
        image_labels : list of str
            Names of the pixel columns (``T0``, ``T1``, ...).
        """
        tabular = X_dict['tabular']
        images = np.asarray(X_dict['images'])

        # One row per sample, one column per pixel.
        flat_images = images.reshape(images.shape[0], -1)
        image_labels = ['T' + str(i) for i in range(flat_images.shape[1])]
        image_df = pd.DataFrame(flat_images, columns=image_labels)

        # drop() and reset_index() both return new frames, so the caller's
        # DataFrame is left untouched. The reset makes the tabular rows line
        # up positionally with the freshly built image frame.
        tabular = tabular.drop(columns=self.nancolumns).reset_index(drop=True)

        combined = pd.concat([tabular, image_df], axis=1)
        return combined, image_labels

    def _build_pipeline(self, image_labels):
        """
        Assemble the unfitted preprocessing + regression pipeline.

        Parameters
        ----------
        image_labels : list of str
            Names of the pixel columns handled by the image branch.

        Returns
        -------
        sklearn Pipeline
            Column-wise preprocessing, a global PCA and the regressor.
        """
        numeric_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("pca", decomposition.PCA(n_components=10, svd_solver='full')),
            ]
        )

        small_cat_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        large_cat_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value',
                                           unknown_value=-1)),
            ]
        )

        image_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy='constant', fill_value=0)),
                ("pca", decomposition.PCA(n_components=10, svd_solver='full')),
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, self.numerical_features),
                ("small_cat", small_cat_transformer, self.small_cat_features),
                ("large_cat", large_cat_transformer, self.large_cat_features),
                ("image", image_transformer, image_labels),
            ],
            # The one-hot branch yields a sparse matrix; force a dense result
            # so the full-SVD PCA that follows always receives dense input.
            sparse_threshold=0,
        )

        return Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("pca", decomposition.PCA(n_components=15, svd_solver='full')),
                ("regressor", RandomForestRegressor(
                    random_state=42, n_estimators=5,
                    max_depth=5, min_samples_split=10, min_samples_leaf=5,
                )),
            ]
        )

    def fit(self, X_dict, y):
        """
        Train the model using the input data.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Training data.
        y : array-like of shape (n_samples,)
            Target values, in the same row order as ``X_dict``. Any pandas
            index on ``y`` is ignored; alignment is positional.

        Returns
        -------
        self : object
            Returns an instance of the trained model.

        Raises
        ------
        ValueError
            If ``y`` does not have one entry per row of the tabular data.
        """
        combined, image_labels = self._combine_inputs(X_dict)

        if len(y) != len(combined):
            raise ValueError(
                "y has {0} entries but X_dict has {1} samples".format(
                    len(y), len(combined)
                )
            )
        # Work on a plain array so row selection below is positional; selecting
        # by label would break whenever y carries a non 0..n-1 index.
        targets = np.asarray(y)

        # Train only on rows that are complete in every modelled tabular
        # column. The imputers in the pipeline therefore only fill values at
        # prediction time.
        required = self.numerical_features + self.categorical_features
        complete = combined[required].notna().all(axis=1).to_numpy()
        combined = combined.loc[complete]
        targets = targets[complete]

        self.model = self._build_pipeline(image_labels)
        self.model.fit(combined, targets)

        return self

    def predict(self, X_dict):
        """
        Use the trained model to make predictions.

        Parameters
        ----------
        X_dict : dictionary with the following entries:
            - tabular: pandas Dataframe of shape (n_samples, n_features)
            - images: ndarray of shape (n_samples, height, width)
            Input data.

        Returns
        -------
        array of shape (n_samples,)
            Predicted target values, one per element in X_dict, exactly as
            returned by the fitted pipeline's ``predict``. Rows with missing
            values are not dropped here.
        """
        combined, _ = self._combine_inputs(X_dict)
        return self.model.predict(combined)
    



In [2]:
# Load the memory_profiler IPython extension, which provides the %memit / %%memit magics.
%load_ext memory_profiler

In [3]:
# Import packages
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
from util import dict_train_test_split

In [5]:
# Read the tabular features (CSV) and the image array (.npy) from the data folder
df = pd.read_csv(os.path.join('data', 'tabular.csv'))
images = np.load(os.path.join('data', 'images.npy'))

# Every column other than the target is a model input
X_columns = df.columns[df.columns != 'target'].tolist()

# Bundle both inputs into the dictionary consumed by the model, and pull out the target
X_dict = dict(tabular=df[X_columns], images=images)
y = df.loc[:, 'target']

In [6]:
%%time
%%memit

# Split train and test
X_dict_train, y_train, X_dict_test, y_test = dict_train_test_split(X_dict, y, ratio=0.9)

# Train and predict
model = Model()
model.fit(X_dict_train, y_train)
y_pred = model.predict(X_dict_test)

# Evaluate model predition
# Learn more: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
print("MSE: {0:.2f}".format(mean_squared_error(y_test, y_pred)))