# LabelEncoder

In [44]:
import numpy as np
import pandas as pd

from tpot import TPOTClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer

url = "https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv"


df = pd.read_csv(url, sep=None, engine='python')

df.head()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df.Cabin.fillna('0', inplace=True)
df.Embarked.fillna(method='bfill', inplace=True)

cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
for col in cols:
    print(col)
#     if col in df.columns:
#         df.drop(col, axis=1, inplace=True)
    df[col] = LabelEncoder().fit_transform(df[col])

df.head()

X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Survived', axis=1),
    df.Survived,
    test_size = 0.2, 
    random_state = 10
)

tpot = TPOTClassifier(
    generations=3,
    population_size=50,
    verbosity=2,
)

tpot.fit(df.drop('Survived', axis=1).values, df.Survived.values)

from sklearn.metrics import classification_report

y_pred = tpot.predict(X_test)

print(classification_report(y_test, y_pred))

Imputing missing values in feature set
             precision    recall  f1-score   support

          0       0.78      0.82      0.80        17
          1       0.79      0.73      0.76        15

avg / total       0.78      0.78      0.78        32



# Pipeline

In [5]:
import numpy as np
import pandas as pd

from tpot import TPOTClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Imputer


url = "https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv"
df = pd.read_csv(url, sep=None, engine='python')

for x in df.select_dtypes("object").columns:
    df[x] = df[x].astype('category')


class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

class StringIndexer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.apply(lambda s: s.cat.codes.replace(
            {-1: len(s.cat.categories)}
        ))

transformer = Pipeline([
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
            ('imputer', Imputer(strategy="mean")),
            ('scaler', StandardScaler()),
        ])),
        
        ('categoricals', Pipeline([
            ('selector', TypeSelector('category')),
            ('labeler', StringIndexer()),
            ('imputer', Imputer(strategy="most_frequent")),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ]))
    ])), 
])


X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Survived', axis=1),
    df.Survived,
    test_size = 0.2, 
    random_state = 10
)

tpot = TPOTClassifier(
    generations=3,
    population_size=50,
    verbosity=2,
    config_dict="TPOT sparse"
)


X_train = transformer.fit_transform(X_train, y_train)
tpot.fit(X_train, y_train)

X_test = transformer.transform(X_test)
y_pred = tpot.predict(X_test.toarray())

print(classification_report(y_test, y_pred))

Optimization Progress:  50%|█████     | 100/200 [01:47<00:19,  5.15pipeline/s]

Generation 1 - Current best internal CV score: 0.8223333333333335


Optimization Progress:  75%|███████▌  | 150/200 [02:02<00:18,  2.73pipeline/s]

Generation 2 - Current best internal CV score: 0.8303333333333335


                                                                              

Generation 3 - Current best internal CV score: 0.8383333333333335

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.5, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
             precision    recall  f1-score   support

          0       0.68      1.00      0.81        17
          1       1.00      0.47      0.64        15

avg / total       0.83      0.75      0.73        32



# Dora

In [3]:
import pandas as pd
from Dora import Dora

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor


dora = Dora()
df = pd.read_csv("https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv", sep='\t')
dora = Dora(output='Survived', data=df)

dora.extract_ordinal_feature("Embarked")
dora.extract_ordinal_feature("Cabin")
dora.extract_ordinal_feature("Ticket")
dora.extract_ordinal_feature("Sex")
dora.remove_feature('Name')
dora.impute_missing_values()
#dora.scale_input_values()

target = 'Survived'
X = dora.data.drop(target, axis=1)
y = dora.data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = TPOTClassifier(generations=3, population_size=50, verbosity=2, config_dict='TPOT sparse')
model.fit(X_train, y_train)
model.score(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Optimization Progress:  50%|█████     | 100/200 [00:30<00:28,  3.52pipeline/s]

Generation 1 - Current best internal CV score: 0.837923076923077


Optimization Progress:  75%|███████▌  | 150/200 [00:48<00:12,  3.86pipeline/s]

Generation 2 - Current best internal CV score: 0.837923076923077


                                                                              

Generation 3 - Current best internal CV score: 0.845923076923077

Best pipeline: LinearSVC(input_matrix, C=10.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.01)
             precision    recall  f1-score   support

          0       0.79      0.71      0.75        21
          1       0.54      0.64      0.58        11

avg / total       0.70      0.69      0.69        32





# Datacleaner

In [13]:
import pandas as pd
from datacleaner import autoclean

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

url = "https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv"
df = pd.read_csv(url, sep='\t')
df = autoclean(df)

target = 'Survived'
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = TPOTClassifier(generations=3, population_size=50, verbosity=2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

Optimization Progress:  50%|█████     | 100/200 [00:17<00:11,  8.46pipeline/s]

Generation 1 - Current best internal CV score: 0.8375897435897436


Optimization Progress:  75%|███████▌  | 150/200 [00:35<00:19,  2.52pipeline/s]

Generation 2 - Current best internal CV score: 0.845923076923077


                                                                              

Generation 3 - Current best internal CV score: 0.845923076923077

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.6000000000000001, min_samples_leaf=3, min_samples_split=6, n_estimators=100)
             precision    recall  f1-score   support

          0       0.77      0.81      0.79        21
          1       0.60      0.55      0.57        11

avg / total       0.71      0.72      0.72        32



In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,19,1,22.0,1,0,103,7.25,9,2
1,2,1,1,33,0,38.0,1,0,124,71.2833,14,0
2,3,1,3,59,0,26.0,0,0,142,7.925,9,2
3,4,1,1,47,0,35.0,1,0,8,53.1,9,2
4,5,0,3,1,1,35.0,0,0,92,8.05,9,2


In [8]:
df.shape

(156, 12)