In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left;"> Version 1 to 5 : Tabular Playground Apr </h1>
<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left;"> Version 6 : Tabular Playground May </h1>

<h2 style=color:green align="left"> Table of Contents </h2>

#### 1) Introduction to EvalML
#### 2) Load Required Libraries
#### 3) Read Data
#### 4) EDA (Exploratory Data Analysis)
#### 5) EvalML (AutoML)

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 1) Introduction to EvalML </h1>

- **EvalML** is an AutoML library which builds, optimizes, and evaluates machine learning pipelines using domain-specific objective functions.

#### Key Functionality

- **Automation**: Makes machine learning easier. Avoid training and tuning models by hand. Includes data quality checks, cross-validation and more.

- **Data Checks**: Catches and warns of problems with your data and problem setup before modeling.

- **End-to-end**: Constructs and optimizes pipelines that include state-of-the-art preprocessing, feature engineering, feature selection, and a variety of modeling techniques.

- **Model Understanding**: Provides tools to understand and introspect on models, to learn how they'll behave in your problem domain.

- **Domain-specific**: Includes repository of domain-specific objective functions and an interface to define your own.

### Reference:
 - https://github.com/alteryx/evalml
 - https://evalml.alteryx.com/en/stable/install.html

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 2) Load Required Libraries </h1>

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot appearance: matplotlib style sheet first, then seaborn's grid style on top.
plt.style.use("fivethirtyeight")
sns.set_style(style="darkgrid")

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 3) Read Data </h1>

In [None]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-may-2021/train.csv',
        '/kaggle/input/tabular-playground-series-may-2021/test.csv',
        '/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)

In [None]:
# Show the first three rows of each frame, in the order train, test, sub.
display(train.head(3), test.head(3), sub.head(3))

In [None]:
# (rows, columns) of the train and test frames, separated by a dashed rule.
train_dims, test_dims = train.shape, test.shape
display(train_dims)
print("--------" * 5)
display(test_dims)

In [None]:
# Peek at ten randomly drawn training rows (no seed is set, so rows vary per run).
train.sample(n=10)

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in display() also rendered a stray "None".
train.info()
print('-'*80)
test.info()

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 4) EDA (Exploratory Data Analysis) </h1>

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 4.1) Missing values </h1>

In [None]:
# Per-column count of null entries in each frame.
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()

print("Missing Values in Train:\n\n", train_missing)
print("\n\nMissing Values in Test:\n\n", test_missing)

In [None]:
# Summary statistics, transposed so that each original column becomes a row.
for frame in (train, test):
    display(frame.describe().T)

In [None]:
# How many training rows fall into each target value.
target_counts = train.loc[:, 'target'].value_counts()
target_counts

### Visualize missing (NaN) values using the Missingno library

 a) Visualize missing values as a matrix
 
 b) Visualize missing values as a barplot
 
 c) Visualize missing values as a heatmap
 
 d) Visualize missing values as a dendrogram

In [None]:
import missingno as msno

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 4.2) Visualize missing values as a matrix </h1>

In [None]:
# Visualize missing values as a matrix.
# Other colours tried earlier: (0.27, 0.52, 1.0) and (0, .3, .3), both with sparkline=False.
matrix_color = (1, 0.38, 0.27)
msno.matrix(train, figsize=(11, 7), fontsize=12, color=matrix_color);

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 4.3) Visualize missing values as a barplot </h1>

In [None]:
# Visualize the number of missing values as a bar chart
# color="dodgerblue" "orangered"
# msno.bar(train, color="dodgerblue", sort="ascending", figsize=(13,7), fontsize=12);

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 4.4) Visualize missing values as a heatmap </h1>

In [None]:
# Visualize the correlation between the number of missing values in different columns as a heatmap
# msno.heatmap(train, cmap="RdYlGn", figsize=(10,5), fontsize=12)

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 4.5) Visualize missing values as a dendrogram </h1>

In [None]:
# msno.dendrogram(train, figsize=(12,7), fontsize=12)

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 5) EvalML (AutoML) </h1>

#### a) Install EvalML (AutoML)
#### b) Configure search
#### c) Pipeline Rankings
#### d) Get pipeline
#### e) Select Best pipeline
#### f) Describe pipeline
#### g) Access raw results

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.1) Install EvalML (AutoML) </h1>

In [None]:
!pip install evalml

In [None]:
import evalml

In [None]:
# Features are every column except 'target'; the label is the 'target' column.
X = train.drop('target', axis=1)
y = train.loc[:, 'target']

In [None]:
# Split the features and labels into training and hold-out parts with evalml's helper.
# problem_type is 'binary' for binary classification and 'multiclass' for
# multi-class classification; this dataset uses 'multiclass'.
# Alternatives tried earlier:
#   train_test_split(X, y, test_size=0.2)
#   evalml.preprocessing.split_data(X, y, problem_type='binary', test_size=.2)
split_parts = evalml.preprocessing.split_data(X, y, problem_type='multiclass')
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Show the problem types exposed by evalml's ProblemTypes.
known_problem_types = evalml.problem_types.ProblemTypes.all_problem_types
known_problem_types

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.2) Configure search </h1>

In [None]:
# Configure and run the AutoML search on the training split.
#
# Objective names by problem type:
#   "F1"           --> binary classification
#   "F1 Micro"     --> multiclass classification, micro averaging
#   "F1 Macro"     --> multiclass classification, macro averaging
#   "F1 Weighted"  --> multiclass classification, weighted averaging
#
# Earlier binary variants, kept for reference:
#   AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='binary')
#   AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='binary',
#                max_batches=1, optimize_thresholds=True)

from evalml import AutoMLSearch

search_settings = dict(
    problem_type="multiclass",
    objective="F1 Micro",
    allowed_model_families=['random_forest', 'xgboost', 'lightgbm'],
    additional_objectives=None,
    max_batches=5,
)
automl = AutoMLSearch(X_train=X_train, y_train=y_train, **search_settings)
automl.search()

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.3) Pipeline Rankings </h1>

In [None]:
# Table of the pipelines evaluated by the search.
pipeline_rankings = automl.rankings
pipeline_rankings

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.4) Get pipeline </h1>

In [None]:
# Any pipeline object can be fetched from the search through its id.
pipeline = automl.get_pipeline(1)
pipeline_name, pipeline_parameters = pipeline.name, pipeline.parameters

print('Name:\n', pipeline_name)
print('\n\nParameters:\n\n', pipeline_parameters)

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.5) Select Best pipeline </h1>

In [None]:
# Keep a handle on the pipeline the search reports as best, and display it.
best_pipeline = automl.best_pipeline
display(best_pipeline)

In [None]:
# Render the structure of the best pipeline as a graph.
pipeline_graph = best_pipeline.graph()
pipeline_graph

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.6) Describe pipeline </h1>

- Each **pipeline is given an id.** We can get more information about any **particular pipeline** using that id. Here, we will get more information about the **pipeline with id = 3.**

In [None]:
# Print the detailed description of one pipeline, chosen by its id.
described_pipeline_id = 3
automl.describe_pipeline(described_pipeline_id)

#### If we are interested in seeing more details about the pipeline, we can describe it using the id from the rankings table:

In [None]:
# Describe the pipeline listed in the first row of the rankings table.
top_ranked_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(top_ranked_id)

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;' > 5.7) Access raw results </h1>

- The **AutoMLSearch** class records detailed results information under the results field, including information about the **cross-validation scoring and parameters.**

In [None]:
# Raw results recorded by the search.
search_results = automl.results
search_results

In [None]:
# Evaluate the best pipeline on the hold-out split using evalml's core
# multiclass objectives; `scores` is indexed by objective name.
scores = best_pipeline.score(X_test, y_test, objectives=evalml.objectives.get_core_objectives('multiclass'))
# The value printed is the "F1 Micro" score, so label it as such
# (it was previously mislabelled "Accuracy Binary").
print(f'F1 Micro: {scores["F1 Micro"]}')

# Re-fetch the best pipeline, fit it on the training split and predict
# the competition test set.
best_pipeline = automl.best_pipeline
best_pipeline.fit(X=X_train, y=y_train)
prediction = best_pipeline.predict(X=test)

In [None]:
# Fit on the training split, then predict the hold-out split; these
# predictions feed the confusion matrix further down.
best_pipeline.fit(X=X_train, y=y_train)
predictions_X_test = best_pipeline.predict(X=X_test)

In [None]:
# Predictions of the fitted best pipeline for the competition test set.
predictions_test = best_pipeline.predict(X=test)

# Round-trip check: save the fitted best pipeline, load it back and make
# sure the reloaded copy still produces class probabilities.
# The original line read `'model'.pkl` (attribute access on a str, which
# raises AttributeError) and no earlier cell ever wrote the file.
best_pipeline.save('model.pkl')
# NOTE: load() unpickles the file; only use it on files this notebook wrote.
check_model = evalml.pipelines.PipelineBase.load('model.pkl')
holdout_probabilities = check_model.predict_proba(X_test)
# Presumably older evalml releases return a Woodwork table that needs
# to_dataframe(), while newer ones return pandas directly; handle both.
if hasattr(holdout_probabilities, 'to_dataframe'):
    holdout_probabilities = holdout_probabilities.to_dataframe()
holdout_probabilities

from evalml.model_understanding.graphs import (
    graph_prediction_vs_actual, 
    #graph_feature_importance, 
    graph_confusion_matrix
)

# Plot predictions against actual values for the hold-out split.
# NOTE(review): evalml appears to document graph_prediction_vs_actual as a
# regression helper taking (y_true, y_pred, outlier_threshold); the four
# arguments passed here (pipeline, X, y, objective name) look like they will
# not match that signature. Confirm against the installed evalml version.
graph_prediction_vs_actual(best_pipeline, X_test, y_test, "F1 Micro")

<h2 style=color:green align="left"> 5.7.1) Feature Importance </h2>

In [None]:
# Feature-importance chart of the best pipeline; the threshold is passed as 0.
# Alternative tried earlier: graph_permutation_importance(best_pipeline, X_test, y_test, "F1")
min_importance = 0
best_pipeline.graph_feature_importance(importance_threshold=min_importance)

<h2 style=color:green align="left"> 5.7.2) Confusion Matrix </h2>

In [None]:
# Confusion matrix of the hold-out labels against the hold-out predictions.
confusion_figure = graph_confusion_matrix(y_test, predictions_X_test)
confusion_figure

# Report several hold-out metrics for the best pipeline.
#
# The earlier version of this cell could not run: `acc` was commented out but
# still used, `predictions` was never defined, and the objectives were the
# binary ones although the search above is configured as multiclass.
# The multiclass objectives are handed to pipeline.score(), the same call
# used earlier with get_core_objectives('multiclass'), so the pipeline
# supplies whatever inputs each objective needs.
from evalml.objectives.standard_metrics import (
    AccuracyMulticlass,
    AUCWeighted,
    F1Micro,
    PrecisionWeighted,
    RecallWeighted,
)

holdout_objectives = [
    AccuracyMulticlass(),
    AUCWeighted(),
    F1Micro(),
    PrecisionWeighted(),
    RecallWeighted(),
]
holdout_scores = best_pipeline.score(X_test, y_test, objectives=holdout_objectives)

# One line per objective, labelled with the objective's own name.
for objective_name, objective_value in holdout_scores.items():
    print(f"{objective_name}: {objective_value}")

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 15px 50px;' > Submission </h1>

In [None]:
# Build the submission from per-class probabilities.
# The previous version wrote hard labels into a 'Survived' column, a name
# that does not belong to this multiclass competition.
# Assumes every non-'id' column of the sample submission is a class label and
# that predict_proba names its columns after those labels; TODO confirm
# against sample_submission.csv.
class_probabilities = best_pipeline.predict_proba(test)
# Presumably older evalml releases return a Woodwork table that needs
# to_dataframe(), while newer ones return pandas directly; handle both.
if hasattr(class_probabilities, 'to_dataframe'):
    class_probabilities = class_probabilities.to_dataframe()

class_columns = [column for column in sub.columns if column != 'id']
# Positional assignment: rows of `sub` are assumed to follow the order of `test`.
sub[class_columns] = class_probabilities[class_columns].to_numpy()

sub.to_csv('submission.csv', index=False)
sub.head()