In [1]:
import os

# The project root is machine-specific. Allow overriding it through the
# AUTOPILOTML_ROOT environment variable; fall back to the original location.
PROJECT_ROOT = os.environ.get('AUTOPILOTML_ROOT', '/Volumes/Projects/autopilotml')

# Make the project root the working directory for the rest of the notebook.
os.chdir(PROJECT_ROOT)

In [62]:
from autopilotml.load_data import read_csv
from autopilotml.preprocessing import imputation, drop_missing_rows, drop_missing_columns, inter_quartile_range, remove_outliers_zscore
from autopilotml.transformation import onehot_transform, label_transform, ordinal_transform
from autopilotml.scaling import standard_scale, target_scale, maxabs_scale, robust_scale
from autopilotml.feature_selection import rfe
from datetime import datetime
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR, SVC

# Data Loading

In [3]:
# Build the dataset path from the current working directory (set to the
# project root by the first cell) instead of repeating the machine-specific
# absolute path. abspath keeps the argument passed to read_csv absolute.
DATA_PATH = os.path.abspath(os.path.join("autopilotml", "dataset", "titanic_train.csv"))
df = read_csv(DATA_PATH)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Show each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     888 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(3), int64(4), object(5)
memory usage: 83.7+ KB


# Data Preprocessing

In [5]:
# PassengerId is removed as a high-cardinality feature.
df = df.drop(columns=['PassengerId'])

In [6]:
# Number of missing values in each column.
df.isna().sum()

Survived      3
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
# Keep only the rows that have a value in the target column 'Survived'
# (rows are dropped by default, so no explicit axis is needed).
df = df.dropna(subset=['Survived'])
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
7,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1.0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as numerical.
numerical_columns = df.select_dtypes(exclude=['object']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

In [9]:
# Fill missing values: median for the numerical columns, most frequent value
# for the categorical columns.
#
# NOTE(review): the result is displayed but not assigned back to `df`. The
# null check in the following cell reports no missing values, which suggests
# `df` is modified in place -- confirm against autopilotml.preprocessing.
#
# Alternative settings kept from earlier experiments (they call
# `basic_imputation`, which is not among this notebook's imports):
#   basic_imputation(df, strategy_numerical='constant', strategy_categorical='constant', fill_value=123)
#   basic_imputation(df, strategy_numerical='median', strategy_categorical='constant', fill_value='missing_value')
#   basic_imputation(df, strategy_numerical='knn', strategy_categorical='most_frequent', n_neighbors=3, weights='uniform')
imputation(
    df,
    strategy_numerical='median',
    strategy_categorical='most_frequent',
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.2500,B96 B98,S
1,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.9250,B96 B98,S
3,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1000,C123,S
7,0.0,3.0,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,349909,21.0750,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,"Montvila, Rev. Juozas",male,27.0,0.0,0.0,211536,13.0000,B96 B98,S
887,1.0,1.0,"Graham, Miss. Margaret Edith",female,19.0,0.0,0.0,112053,30.0000,B42,S
888,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1.0,2.0,W./C. 6607,23.4500,B96 B98,S
889,1.0,1.0,"Behr, Mr. Karl Howell",male,26.0,0.0,0.0,111369,30.0000,C148,C


In [10]:
# Re-count missing values per column after the imputation step.
df.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

# Outlier

In [11]:
# Summary statistics of the numeric columns, as a reference before the
# outlier step below.
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,888.0,888.0,888.0,888.0,888.0,888.0
mean,0.385135,2.308559,29.32902,0.524775,0.382883,32.236012
std,0.486901,0.835686,13.013933,1.104186,0.807113,49.760012
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.8958
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# Shape before the IQR step, for comparison with the shape afterwards.
df.shape

(888, 11)

In [13]:
# Apply the IQR-based outlier step to the numerical columns.
# NOTE(review): with cap=False the recorded row count goes from 888 to 877,
# so outlier rows appear to be removed rather than capped -- confirm the
# meaning of Lc/Uc/cap against autopilotml.preprocessing.inter_quartile_range.
df = inter_quartile_range(
    df,
    iqr_threshold=1.5,
    Lc=0.05,
    Uc=0.95,
    cap=False,
    numerical_columns=numerical_columns,
)
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.2500,B96 B98,S
1,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.9250,B96 B98,S
3,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1000,C123,S
7,0.0,3.0,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,349909,21.0750,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,"Montvila, Rev. Juozas",male,27.0,0.0,0.0,211536,13.0000,B96 B98,S
887,1.0,1.0,"Graham, Miss. Margaret Edith",female,19.0,0.0,0.0,112053,30.0000,B42,S
888,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1.0,2.0,W./C. 6607,23.4500,B96 B98,S
889,1.0,1.0,"Behr, Mr. Karl Howell",male,26.0,0.0,0.0,111369,30.0000,C148,C


In [14]:
# Shape after the IQR step.
df.shape

(877, 11)

# Transformation

In [15]:
# Continue on a copy (df1) so the encoding steps below do not touch df.
df1 = df.copy()
# Preview the first rows of the copy.
df1.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,B96 B98,S
1,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,B96 B98,S
3,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
7,0.0,3.0,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,349909,21.075,B96 B98,S


In [16]:
# Recompute the column groups on df1: object columns are categorical, the
# rest numerical.
numerical_columns = df1.select_dtypes(exclude=['object']).columns
categorical_columns = df1.select_dtypes(include=['object']).columns

### One-Hot Encoding

In [17]:
# df1, encoder = onehot_transform(df1, categorical_columns = categorical_columns)
# df1.head()

### Ordinal Encoding

In [18]:
# Encode the categorical (object-dtype) columns. The call returns the
# transformed frame together with the encoder it used.
df1, encoder = ordinal_transform(df1, categorical_columns = categorical_columns)

# `encoder` is rebound by the label-encoding cell further down, so keep a
# dedicated reference to this encoder (e.g. for a later inverse_transform of
# the categorical columns).
ordinal_encoder = encoder

df1.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,107.0,1.0,22.0,1.0,0.0,520.0,7.25,46.0,2.0
1,1.0,1.0,188.0,0.0,38.0,1.0,0.0,592.0,71.2833,80.0,0.0
2,1.0,3.0,350.0,0.0,26.0,0.0,0.0,664.0,7.925,46.0,2.0
3,1.0,1.0,270.0,0.0,35.0,1.0,0.0,49.0,53.1,54.0,2.0
7,0.0,3.0,618.0,1.0,2.0,3.0,1.0,393.0,21.075,46.0,2.0


In [19]:
# df1[categorical_columns] = encoder.inverse_transform(df1[categorical_columns])
# df1.head()

### Label Encoding

In [20]:
# Label-encode the target column; intended for the case where the target
# column is categorical.
# NOTE(review): this rebinds `encoder`, replacing the encoder returned by
# ordinal_transform earlier in the notebook.
df1, encoder = label_transform(df1, target_column='Survived')
# Preview the result; in the recorded output 'Survived' changes from 0.0/1.0 to 0/1.
df1.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3.0,107.0,1.0,22.0,1.0,0.0,520.0,7.25,46.0,2.0
1,1,1.0,188.0,0.0,38.0,1.0,0.0,592.0,71.2833,80.0,0.0
2,1,3.0,350.0,0.0,26.0,0.0,0.0,664.0,7.925,46.0,2.0
3,1,1.0,270.0,0.0,35.0,1.0,0.0,49.0,53.1,54.0,2.0
7,0,3.0,618.0,1.0,2.0,3.0,1.0,393.0,21.075,46.0,2.0


# Scaling

In [21]:
# Columns to scale: every non-object column except the target 'Survived'.
# After the ordinal encoding above this also covers the encoded categorical
# columns (Name, Sex, Ticket, Cabin, Embarked), as the recorded output shows.
numerical_columns = df1.select_dtypes(exclude=['object']).columns.drop(['Survived'])
numerical_columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [22]:
# Standard-scale the feature columns; the call returns the scaled frame and
# the scaler it used.
# NOTE(review): scaling is applied to the full frame before the train/test
# split later in the notebook, so test rows presumably influence the scaler
# statistics -- confirm whether that is acceptable.
df1, scaler = standard_scale(df1, feature_columns=numerical_columns)
df1.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0.83093,-1.307433,0.737573,-0.558608,0.603257,-0.469245,0.936888,-0.558432,-0.285857,0.585623
1,1,-1.566194,-0.987487,-1.355798,0.665151,0.603257,-0.469245,1.298963,0.994304,1.167208,-1.944383
2,1,0.83093,-0.347595,-1.355798,-0.252668,-0.527205,-0.469245,1.661039,-0.542064,-0.285857,0.585623
3,1,-1.566194,-0.663591,-1.355798,0.435697,0.603257,-0.469245,-1.431691,0.553379,0.056041,0.585623
7,0,0.83093,0.710991,0.737573,-2.088307,2.864182,0.82081,0.298226,-0.223191,-0.285857,0.585623


# Feature selection

In [23]:
# Continue on a copy (df2) so feature selection does not touch df1.
df2 = df1.copy()
# Print the shape, then preview the first rows.
print(df2.shape)
df2.head()

(877, 11)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0.83093,-1.307433,0.737573,-0.558608,0.603257,-0.469245,0.936888,-0.558432,-0.285857,0.585623
1,1,-1.566194,-0.987487,-1.355798,0.665151,0.603257,-0.469245,1.298963,0.994304,1.167208,-1.944383
2,1,0.83093,-0.347595,-1.355798,-0.252668,-0.527205,-0.469245,1.661039,-0.542064,-0.285857,0.585623
3,1,-1.566194,-0.663591,-1.355798,0.435697,0.603257,-0.469245,-1.431691,0.553379,0.056041,0.585623
7,0,0.83093,0.710991,0.737573,-2.088307,2.864182,0.82081,0.298226,-0.223191,-0.285857,0.585623


In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [25]:
# Keep 7 features using recursive feature elimination with a gradient
# boosting classifier. The estimator is seeded so that the selected feature
# set is reproducible across runs (same seed as the train/test split).
df2 = rfe(
    df2,
    label_column='Survived',
    n_features_to_select=7,
    estimator=GradientBoostingClassifier(random_state=42),
)
df2

Unnamed: 0,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Survived
0,0.830930,-1.307433,0.737573,-0.558608,0.936888,-0.558432,-0.285857,0
1,-1.566194,-0.987487,-1.355798,0.665151,1.298963,0.994304,1.167208,1
2,0.830930,-0.347595,-1.355798,-0.252668,1.661039,-0.542064,-0.285857,1
3,-1.566194,-0.663591,-1.355798,0.435697,-1.431691,0.553379,0.056041,1
7,0.830930,0.710991,0.737573,-2.088307,0.298226,-0.223191,-0.285857,0
...,...,...,...,...,...,...,...,...
886,-0.367632,0.414744,0.737573,-0.176183,-1.175221,-0.419001,-0.285857,0
887,-1.566194,-0.545093,-1.355798,-0.788063,-1.607700,-0.006770,-1.012389,1
888,0.830930,-0.110599,-1.355798,-0.099698,1.691212,-0.165600,-0.285857,0
889,-1.566194,-1.414081,0.737573,-0.252668,-1.637873,-0.006770,0.269727,1


# Model Building

In [40]:
from sklearn.utils import all_estimators

# (name, class) pairs for every scikit-learn estimator of type 'regressor'.
estimators = all_estimators(type_filter='regressor')

# Placeholder for instantiated regressors; nothing is instantiated here yet.
all_regs = []

# Print the available estimator names. Printing a name cannot fail in a way
# worth recovering from, so no exception handler is wrapped around it.
for name, RegressorClass in estimators:
    print(name)

ARDRegression
AdaBoostRegressor
BaggingRegressor
BayesianRidge
CCA
DecisionTreeRegressor
DummyRegressor
ElasticNet
ElasticNetCV
ExtraTreeRegressor
ExtraTreesRegressor
GammaRegressor
GaussianProcessRegressor
GradientBoostingRegressor
HistGradientBoostingRegressor
HuberRegressor
IsotonicRegression
KNeighborsRegressor
KernelRidge
Lars
LarsCV
Lasso
LassoCV
LassoLars
LassoLarsCV
LassoLarsIC
LinearRegression
LinearSVR
MLPRegressor
MultiOutputRegressor
MultiTaskElasticNet
MultiTaskElasticNetCV
MultiTaskLasso
MultiTaskLassoCV
NuSVR
OrthogonalMatchingPursuit
OrthogonalMatchingPursuitCV
PLSCanonical
PLSRegression
PassiveAggressiveRegressor
PoissonRegressor
QuantileRegressor
RANSACRegressor
RadiusNeighborsRegressor
RandomForestRegressor
RegressorChain
Ridge
RidgeCV
SGDRegressor
SVR
StackingRegressor
TheilSenRegressor
TransformedTargetRegressor
TweedieRegressor
VotingRegressor


In [27]:
# Continue on a copy (df3) of the feature-selected frame for model building.
df3 = df2.copy()
# Shape of the modelling frame.
df3.shape

(877, 8)

In [28]:
from sklearn.model_selection import train_test_split

# Separate the features from the target column 'Survived'.
features = df3.drop(columns='Survived')
target = df3['Survived']

# Hold out 15% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.15, random_state=42
)

In [63]:
# Create the estimator before opening the run: the run note and the autolog
# flavour selection below both read `model`, so it must already exist on a
# fresh kernel.
model = SVC()

# Treat the model as XGBoost when its repr mentions "Xgboost" (the original
# check) or when its class is defined inside the xgboost package.
is_xgboost = "Xgboost" in str(model) or type(model).__module__.split(".")[0] == "xgboost"

with mlflow.start_run(run_name=datetime.now().strftime('%Y-%m-%d_%H:%M:%S')) as run:
    run_id = run.info.run_id
    print('MLflow Run ID: {}'.format(run_id))

    # Record which model this run trains in the run's note tag.
    client = MlflowClient()
    client.set_tag(run_id, "mlflow.note.content", f"ML model: {str(model)}")

    # Enable autologging for the matching flavour before fitting.
    if is_xgboost:
        mlflow.xgboost.autolog()
    else:
        mlflow.sklearn.autolog()

    tags = {"Application": "AutoPilotML"}
    mlflow.set_tags(tags)

    # Train on the training split and predict on the held-out split.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)



MLflow Run ID: b1aa9652e94a4dc4af3ecc6c3f4f86e3




In [59]:
# Start the MLflow tracking UI from the shell. In the recorded output the
# server listens on http://127.0.0.1:5000 and the cell only finishes once it
# is interrupted (^C).
!mlflow ui

[2023-11-02 16:10:53 -0400] [44794] [INFO] Starting gunicorn 21.2.0
[2023-11-02 16:10:53 -0400] [44794] [INFO] Listening at: http://127.0.0.1:5000 (44794)
[2023-11-02 16:10:53 -0400] [44794] [INFO] Using worker: sync
[2023-11-02 16:10:53 -0400] [44795] [INFO] Booting worker with pid: 44795
[2023-11-02 16:10:53 -0400] [44796] [INFO] Booting worker with pid: 44796
[2023-11-02 16:10:53 -0400] [44797] [INFO] Booting worker with pid: 44797
[2023-11-02 16:10:53 -0400] [44798] [INFO] Booting worker with pid: 44798
^C
[2023-11-02 16:12:43 -0400] [44794] [INFO] Handling signal: int
[2023-11-02 16:12:43 -0400] [44797] [INFO] Worker exiting (pid: 44797)
[2023-11-02 16:12:43 -0400] [44796] [INFO] Worker exiting (pid: 44796)
[2023-11-02 16:12:43 -0400] [44798] [INFO] Worker exiting (pid: 44798)
[2023-11-02 16:12:43 -0400] [44795] [INFO] Worker exiting (pid: 44795)
