In [1]:
# usual imports 
import numpy as np
import pandas as pd

import math

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier


In [2]:
def get_input_data_matrix(df, categorical_features, numerical_features, scaling_required = True, 
          is_training_data= True, dict_vectorizer = None, standard_scalar = None):
    '''
    Convert the selected columns of a dataframe into one 2D numpy feature matrix.

    Categorical columns are turned into per-row dicts and encoded with a DictVectorizer;
    numerical columns are taken as a numpy array and, when scaling_required is True,
    passed through a StandardScaler. The encoded categorical block is placed first,
    followed by the numerical block.

    Parameters:
        df                   : pandas DataFrame holding the feature columns.
        categorical_features : list of column names to encode with the DictVectorizer.
        numerical_features   : list of column names used as numerical inputs.
        scaling_required     : when True the numerical columns are transformed by the scaler.
        is_training_data     : when True a new DictVectorizer (and, if scaling is required,
                               a new StandardScaler) is fitted on df. When False the
                               transformers passed in are applied without refitting.
        dict_vectorizer      : already-fitted vectorizer; required when is_training_data is False.
        standard_scalar      : already-fitted scaler; required when is_training_data is False
                               and scaling_required is True.

    Returns:
    {
        'input_data_matrix': 2D numpy array (categorical columns followed by numerical ones),
        'dict_vectorizer' : the vectorizer that was fitted or passed in,
        'standard_scalar' : the scaler that was fitted or passed in (may be None),
        'feature_names' : list of column names matching the matrix columns
    }

    Raises:
        ValueError: if is_training_data is False and a required fitted transformer is missing.
    '''
    # 'records' gives one {column: value} dict per row. The former short alias 'rows'
    # was deprecated and later removed from pandas, so spell the orient out in full.
    categorical_data = df[categorical_features].to_dict(orient='records')
    numerical_data = df[numerical_features].to_numpy()

    if is_training_data:
        dict_vectorizer = DictVectorizer(sparse=False)
        dict_vectorizer.fit(categorical_data)
        if scaling_required:
            standard_scalar = StandardScaler()
            standard_scalar.fit(numerical_data)
    else:
        # Validation/test data must reuse the transformers fitted on the training data.
        if dict_vectorizer is None:
            raise ValueError('dict_vectorizer must be provided when is_training_data is False')
        if scaling_required and standard_scalar is None:
            raise ValueError('standard_scalar must be provided when is_training_data is False '
                             'and scaling_required is True')

    # Computed for both training and non-training data so X_numerical is always defined.
    if scaling_required:
        X_numerical = standard_scalar.transform(numerical_data)
    else:
        X_numerical = numerical_data

    X_categorical = dict_vectorizer.transform(categorical_data)

    input_data_matrix = np.concatenate((X_categorical, X_numerical), axis=1)
    # Coerce both parts to lists so tuples / pandas Index inputs concatenate instead of failing.
    feature_names = list(dict_vectorizer.feature_names_) + list(numerical_features)

    return {
        'input_data_matrix': input_data_matrix, 
        'dict_vectorizer' : dict_vectorizer, 
        'standard_scalar' : standard_scalar,
        'feature_names' : feature_names
    }



def calculate_mi(series, target=None):
    '''
    Calculate mutual information score between the pandas series and a target column.

    Parameters:
        series : pandas Series (or other label sequence) to score.
        target : labels to compare against. When omitted, the churn column of the
                 notebook-level df_train_full_explore dataframe is used, so the
                 function can still be passed directly to DataFrame.apply.

    Returns:
        The value returned by mutual_info_score(series, target).
    '''
    if target is None:
        # Looked up at call time: df_train_full_explore must already exist in the
        # notebook namespace when no explicit target is supplied.
        target = df_train_full_explore.churn
    return mutual_info_score(series, target)
