In [1]:
import pandas as pd
import numpy as np

In [3]:
class Exploring:
    
    def __init__(self, dataset):
        self.dataset = dataset
        
    def corr_matrix(self, target=None, heatmap=None, **kwargs_plot):
        corr_matrix = self.dataset.corr()
        if target: 
            result = corr_matrix[target].sort_values(ascending=False)
            return pd.DataFrame(result)
        elif heatmap:
            import seaborn as sns
            fig, ax = plt.subplots(figsize=(12,8))
            sns.heatmap(corr_matrix, alpha=.9, cmap=plt.get_cmap("coolwarm"), **kwargs_plot)
            return None
        else:
            return corr_matrix
        
    def plot_corr_matrix(self, subset=None, **kwargs_plot):
        from pandas.plotting import scatter_matrix as scatter_matrix
        if subset:
            scatter_matrix(self.dataset[subset],figsize=(12,8), **kwargs_plot)
            plt.show()
            return None
        else:
            scatter_matrix(self.dataset,figsize=(12,8))
            plt.show()
            return None
    
    def plot_geo(self, x, y, var, **kwargs_plot):
        from matplotlib import pyplot as plt
        x_value = self.dataset[x]
        y_value = self.dataset[y]
        z_value = self.dataset[var]
        fig, ax = plt.subplots(figsize=(12,8))
        ax_color = ax.scatter(x_value, y_value, 
                    c=z_value,
                    label=var,
                    cmap=plt.get_cmap("jet"),
                    **kwargs_plot)
        cbar = fig.colorbar(ax_color)
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        plt.legend()
        plt.show()
        return None

    def plot_percentil(self, var1, var2, percentiles=[.25, .50, .75], table=None, **kwargs_plot):
        array_percentil = self.dataset[var1].describe(percentiles)
        index_percentil = list(str(int(per*100)).replace('0.', '')+'%' for per in percentiles)
        label = index_percentil + ['max']
        self.dataset['group_'+var1] = pd.cut(self.dataset[var1],
                                       bins=array_percentil[['min']+index_percentil+['max']],
                                       labels=label)
        df_mean  = pd.DataFrame(self.dataset.groupby(by='group_'+var1)[var2].mean())
        df_count = pd.DataFrame(self.dataset.groupby(by='group_'+var1)[var2].count())
        df_std   = pd.DataFrame(self.dataset.groupby(by='group_'+var1)[var2].std())
        
        df_final = df_mean.join(df_count, rsuffix='_count').join(df_std, rsuffix='_std')
        if table:
            return df_final
        else:
            df_final[var2+'_std_upper'] = df_final[var2] + df_final[var2+'_std']/2
            df_final[var2+'_std_lower'] = df_final[var2] - df_final[var2+'_std']/2
            x = np.arange(df_final.shape[0])
            y = df_final[var2]
            y2 = df_final[var2+'_count']
            std = df_final[var2+'_std']
            std_upper = df_final[var2+'_std_upper']
            std_lower = df_final[var2+'_std_lower']
            fig, ax1 = plt.subplots(figsize=(12,8))
            ax1.plot(x, y, color='black', alpha=0.6)
            ax1.plot(x, std_lower, color='black', alpha=0.3)
            #ax1.stackplot(x, std_upper, std_lower)
            ax1.plot(x, std_upper, color='black', alpha=0.3)
            ax1.set_xlabel('Percentil '+var1)
            ax1.set_ylabel('mean_'+var2)
            ax2 = ax1.twinx()
            ax2.set_ylabel('Volumetria')
            ax2.bar(x, y2, width=0.2, alpha=.3, color='green')
            plt.xticks(x, label)
            return None

    def boxplot_normalized(self, drop_columns=None):
        df_numerical = self.dataset._get_numeric_data()
        df_box = (df_numerical - df_numerical.mean())/df_numerical.std()
        if drop_columns:
            df_box.drop(drop_columns, axis=1, inplace=True)
        fig, ax1 = plt.subplots(figsize=(12,8))
        sns.boxplot(data=df_box)
        plt.show()
        return None
        
    def data_info(self):
        info = pd.DataFrame()
        info["var"] = self.dataset.columns
        info["# missing"] = list(self.dataset.isnull().sum())
        info["% missing"] = info["# missing"] / self.dataset.shape[0]*100
        info["types"] = list(self.dataset.dtypes)
        info["unique values"] = list(len(self.dataset[var].unique()) for var in self.dataset.columns)
        return info

In [119]:
class premodeling:
    # Importing libs
    import numpy as np
    import pandas as pd

    def __init__(self, dataset, index_cols=None, ignore_cols=None, drop_cols=None):
        '''
        Here we define what will be index, cat_cols, cat_cols_used and drop columns
        '''
        self.dataset = dataset.copy()
        # Set index columns
        if index_cols:
            self.dataset.set_index(index_cols, inplace=True)
        # Drop columns in drop_cols
        if drop_cols:
            self.dataset.drop(labels=drop_cols, axis=1, inplace=True)
        # Set importants features
        self.cat_cols = self.dataset.select_dtypes(include='object').columns.tolist()
        self.num_cols = self.dataset.select_dtypes(exclude='object').columns.tolist()
        if ignore_cols:
            # Use all categories unless ignore cols
            self.cat_cols_used = [col for col in self.cat_cols if col not in ignore_cols]
            self.num_cols_used = [col for col in self.num_cols if col not in ignore_cols]
        else:
            # Use all categories columns
            self.cat_cols_used = self.cat_cols
            self.num_cols_used = self.num_cols
        
    def fill_missing(self, na_cat='desconhecido', na_num='mean'):
        '''
        Preenche valores nulos para valores categóricos com 'desconhecido'
        e para valores numéricos com a média.

        Para escorar o teste, os valores das médias vão estar salvos
        no train_mean.
        '''
        self.train_mean = self.dataset[self.num_cols_used].mean()
        if na_cat:
            self.dataset[self.cat_cols_used] = self.dataset[self.cat_cols_used].fillna(
                value=na_cat, axis=1)
        if na_num:
            self.dataset[self.num_cols_used] = self.dataset[self.num_cols_used].fillna(
                value=self.dataset[self.num_cols_used].mean())
        return self.dataset

    def encoding_others(self, per_others=0, cat_cols=None, other_name='Outros',
                 ignore_cols=None, index_cols=None):
        '''
        Codificamos os valores categoricos com ocorrência menor que min_others
        por 'Outros'
        '''
        if cat_cols:
            # Use some categories columns
            self.cat_cols_used_other = [col for col in self.cat_cols_used if col in ignore_cols]
        else:
            # Use all categories columns
            self.cat_cols_used_other = self.cat_cols_used
        # Creating a log dictionary
        self.dict_log = {}
        # Mapping and saving in self.dict_log
        for col in self.cat_cols_used_other:
            var = self.dataset[col].value_counts(normalize=True)*100
            self.dict_log[col] = var[var > per_others].index.tolist()
        # What isn't in dict_log, replace by 'other_name'
        for col in self.dict_log.keys():
            self.dataset[col] = self.dataset[col].apply(
                lambda x: x if x in self.dict_log.get(col) else other_name)
        return self.dataset
    
    def encoding_OneHot(self, cat_cols=None, drop_cat=True):
        if cat_cols:
        # Use some categories columns
            self.cat_cols_used_encoding = [col for col in self.cat_cols_used if col in cat_cols]
        else:
            # Use all categories columns
            self.cat_cols_used_encoding = self.cat_cols_used
        import re
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        enc.fit(self.dataset[self.cat_cols_used_encoding])
        a_encoding = enc.transform(self.dataset[self.cat_cols_used_encoding]).toarray()
        # Take the columns Name
        enc_name = enc.get_feature_names().tolist()
        col_name = []
        for num, col in zip(range(len(self.dataset[self.cat_cols_used_encoding])), self.dataset[self.cat_cols_used_encoding]):
            partial_col_name = [name.replace(re.findall('x[0-9]+',name)[0], col) for name in enc_name if re.findall('[0-9]+',name)[0]==str(num)]
            col_name += partial_col_name
        df_encoding = pd.DataFrame(a_encoding, columns=col_name)
        #return self.dataset, enc, a_encoding, col_name
        for col in col_name:
            self.dataset[col] = df_encoding[col].values
        if drop_cat:
            self.dataset.drop(labels=self.cat_cols_used_encoding, axis=1, inplace=True)
        return self.dataset, enc
    
    def encoding_mean_rate(self, target, cat_cols=None, min_obs=100):
        if cat_cols:
        # Use some categories columns
            self.cat_cols_used_meanrate = [col for col in self.cat_cols_used if col in cat_cols]
        else:
            # Use all categories columns
            self.cat_cols_used_meanrate = self.cat_cols_used
        
        for col in self.cat_cols_used_meanrate:
            df_mean = pd.DataFrame(self.dataset.groupby([col])[target].mean())
            df_mean.columns = [col+'_mean']
            df_count = pd.DataFrame(self.dataset.groupby([col])[target].count())
            df_count.columns = [col+'_count']
            df_result = df_mean.join(df_count)
            df_result = df_result[df_result[col+'_count']>min_obs]
            self.dataset = self.dataset.join(df_result[col+'_mean'], on=[col])
            self.num_cols_used.append(col+'_mean')
    
    def encoding_label(self, target, str_target):
        n = len(self.dataset[target].unique())
        print(n)
        if n != 2:
            print('FAIL. nº de features: ',n)
            return None
        else:
            self.dataset[target].apply(lambda x: 1 if x==str_target else 0)

In [120]:
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S
87,88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
654,655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18.0,0,0,365226,6.75,,Q
527,528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
135,136,0,2,"Richard, Mr. Emile",male,23.0,0,0,SC/PARIS 2133,15.0458,,C
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S
114,115,0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C


In [143]:
df = pd.read_csv('C:\\Users\\Rafael\\Desktop\\Data Science\\Dataset\\titanic\\train.csv')

print('Raw Dataset: \n')
print(Exploring(df).data_info())

ob = premodeling(df,index_cols=['PassengerId'], drop_cols=['Name', 'Ticket'], ignore_cols=['Survived'])
print('\n__ini___ Dataset: \n')
print(Exploring(ob.dataset).data_info())

ob.encoding_others(per_others=10)
print('\nencoding_others Dataset:')
print(Exploring(ob.dataset).data_info())

ob.encoding_mean_rate('Survived', min_obs=100)
print('\nencoding_mean_rate Dataset: \n')
print(Exploring(ob.dataset).data_info())

ob.fill_missing()
print('\nfill_missing Dataset: \n')
print(Exploring(ob.dataset).data_info())

data, enc = ob.encoding_OneHot()
print('\nencoding_OneH Dataset:')
print(Exploring(ob.dataset).data_info())

Raw Dataset: 

            var  # missing  % missing    types  unique values
0   PassengerId          0   0.000000    int64            891
1      Survived          0   0.000000    int64              2
2        Pclass          0   0.000000    int64              3
3          Name          0   0.000000   object            891
4           Sex          0   0.000000   object              2
5           Age        177  19.865320  float64             89
6         SibSp          0   0.000000    int64              7
7         Parch          0   0.000000    int64              7
8        Ticket          0   0.000000   object            681
9          Fare          0   0.000000  float64            248
10        Cabin        687  77.104377   object            148
11     Embarked          2   0.224467   object              4

__ini___ Dataset: 

        var  # missing  % missing    types  unique values
0  Survived          0   0.000000    int64              2
1    Pclass          0   0.000000    int64

In [259]:
class cl_modeling:
    
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train.copy()
        self.y_train = y_train.copy()
        self.X_test = X_test.copy()
        self.y_test = y_test.copy()
    
    def train_predict_model(self, model):
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        
        from sklearn.metrics import roc_curve
        from sklearn.metrics import roc_auc_score, confusion_matrix
        from sklearn.metrics import accuracy_score, f1_score
        from sklearn.metrics import precision_score, recall_score
        
        print("Precision: ", precision_score(y_test, y_pred))
        print("Recall: ", recall_score(y_test, y_pred))
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("F1: ", f1_score(y_test, y_pred))
        print("AUC: ", roc_auc_score(y_test, y_pred_proba[:, 1]))
        print("Confusion Matrix: \n ",confusion_matrix(y_test, y_pred))
    
    def multi_default_models(self, models=None):
        if models:
            for model in models:
                print(model)
                ob2.train_predict_model(model)
                print()
        else:
            ob2 = modeling(self.X_train, self.X_test, self.y_train, self.y_test)
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.linear_model import LogisticRegression
            from sklearn.naive_bayes import GaussianNB
            from sklearn.tree import DecisionTreeClassifier
            from lightgbm import LGBMClassifier
            from xgboost  import XGBRFClassifier

            # Setting Models
            nb = GaussianNB()
            rf = RandomForestClassifier(criterion="entropy", n_estimators=500, max_depth=6)
            lr = LogisticRegression(solver='lbfgs', max_iter=1000)
            dt = DecisionTreeClassifier(max_depth=13,  min_samples_leaf=10)
            xgb = XGBRFClassifier(max_depth=10,learning_rate=0.1)
            lgbm_rf = LGBMClassifier(boosting_type='rf',n_jobs=1, bagging_freq=3, bagging_fraction=.3,importance_type='gain')
            lgbm_dart = LGBMClassifier(boosting_type='dart',n_jobs=1, importance_type='gain')
            lgbm = LGBMClassifier(n_jobs=1, importance_type='gain')

            # Evaluating
            model_list = [nb, lr, dt, rf, xgb, lgbm_rf, lgbm_dart, lgbm]
            for model in model_list:
                print(model)
                ob2.train_predict_model(model)
                print()
            ob2 = modeling(self.X_train, self.X_test, self.y_train, self.y_test)

In [260]:
dataset = ob.dataset
y = dataset['Survived']
X = dataset[dataset.columns[1:]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [261]:
ob2 = cl_modeling(X_train, X_test, y_train, y_test)
ob2.multi_default_models()

GaussianNB(priors=None, var_smoothing=1e-09)
Precision:  0.7346938775510204
Recall:  0.7272727272727273
Accuracy:  0.8022388059701493
F1:  0.730964467005076
AUC:  0.8388022234176081
Confusion Matrix: 
  [[143  26]
 [ 27  72]]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
Precision:  0.6938775510204082
Recall:  0.6868686868686869
Accuracy:  0.7723880597014925
F1:  0.6903553299492386
AUC:  0.834439065208296
Confusion Matrix: 
  [[139  30]
 [ 31  68]]

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
     