<a href="https://www.kaggle.com/code/rubanzasilva/ps-s04-e03-fastai?scriptVersionId=169141486" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Imports

If you don't have the fastai library installed, uncomment the lines with `pip install fastbook` to install all the dependencies we will need.

In [None]:
#hide
#! [ -e /content ]

#hide
#This imports and sets up everything you will need for this notebook
#
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

from fastbook import *
from fastai.tabular.all import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import random

from fastai.imports import *
np.set_printoptions(linewidth=130)

# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path
import os
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from ipywidgets import interact


matplotlib.rc('image', cmap='Greys')

Set random seed for reproducibility.

In [None]:
#random.seed(42)
set_seed(42)

In [None]:
!ls

Import Dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!ls /kaggle/input/playground-series-s4e3

In [None]:
path = Path('/kaggle/input/playground-series-s4e3')
path

Read in Datasets

In [None]:
# Load the train, test and sample-submission tables from the data directory.
train_df, test_df, sub_df = (
    pd.read_csv(path/csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
train_df.head()

In [None]:
sub_df.head()

In [None]:
test_df.shape,train_df.shape

Let's create a list called `y_names` holding our dependent variables (targets).

In [None]:
y_names = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

In [None]:
# Split the feature columns of `train_df` into continuous and categorical
# lists, excluding the dependent variables. `y_names` is reused so the target
# list is spelled out in exactly one place and cannot drift out of sync.
# NOTE(review): if `train_df` carries an `id` column it will be picked up as a
# feature here — confirm that is intended.
cont_names, cat_names = cont_cat_split(train_df, dep_var=y_names)

In [None]:
splits = RandomSplitter(valid_pct=0.2)(range_of(train_df))

### TabularPandas Object

We create an instance of a TabularPandas object, which is a fastai dataframe wrapper that stores all the information about our dataset, such as which columns are continuous, which are categorical, and which are our dependent variables.

This instance also stores and applies the set transformations to our data.

In [None]:
# Wrap the training frame in a TabularPandas object. It records which columns
# are categorical, continuous and targets, and carries the preprocessing procs
# together with the train/validation split.
to = TabularPandas(
    train_df,
    y_names=y_names,
    cont_names=cont_names,
    cat_names=cat_names,
    # y_block = CategoryBlock,
    procs=[Categorify, FillMissing, Normalize],
    splits=splits,
)

In [None]:
to.xs.iloc[:2]

We now go ahead and create a dataloaders object which loads the data in a given batch size. Before loading the data the transforms are applied.

In [None]:
dls = to.dataloaders(bs=64)

In [None]:
dls.show_batch()

### Tabular Model

We can define our model with the `tabular_learner` function, which returns a learner that includes a `TabularModel` (a basic model for our tabular data) while inferring the right loss function.

In [None]:
learn = tabular_learner(dls, metrics=RocAucMulti())

In [None]:
learn.lr_find(suggest_funcs=(slide,valley))

In [None]:
#learn.fit_one_cycle(15, slice(0.0005, 0.019))
learn.fit_one_cycle(12, 0.014)

In [None]:
learn.show_results()

In [None]:
# Build a test DataLoader for `test_df` from the learner's DataLoaders.
dl = learn.dls.test_dl(test_df)

# Run inference on the test DataLoader. Later cells index the result of
# `get_preds` with [0] to pick out the predictions.
nn_preds = learn.get_preds(dl=dl)
nn_preds

nn_preds returns the predictions from the model

In [None]:
nn_preds_x = learn.get_preds()[0]
nn_preds_x

We can create a submission file for our tabular_learner model by uncommenting and running the cell below.

In [None]:
#target_preds = preds[0]
#targets =['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
#test_df[targets] = target_preds

#test_df.to_csv('submission.csv', columns=['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'], index=False)

#sub = pd.read_csv('submission.csv')
#sub.head()

### Neural Network Ensemble

In [None]:
learn.lr_find(suggest_funcs=(slide,valley))

In [None]:
dl = learn.dls.test_dl(test_df)

In [None]:
def ensemble():
    """Train one fresh tabular learner and return its predictions for `dl`.

    A new learner is created on the notebook-level `dls`, fitted for 12
    epochs at a learning rate of 0.014 with the progress bar and logging
    turned off, and then used to predict on the notebook-level `dl`.

    Returns:
        The first element of `get_preds(dl=dl)`.
    """
    member = tabular_learner(dls, metrics=RocAucMulti())
    with member.no_bar(), member.no_logging():
        member.fit(12, 0.014)
    return member.get_preds(dl=dl)[0]

In [None]:
learns = [ensemble() for _ in range(5)]

In [None]:
ens_preds = torch.stack(learns).mean(0)

In [None]:
#targets =['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
#test_df[targets] = ens_preds

#test_df.to_csv('submission.csv', columns=['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'], index=False)

#ens_sub = pd.read_csv('submission.csv')
#ens_sub.head()

### Random Forests

In [None]:
# NOTE: despite the names, X_test / y_test hold the *validation* split of `to`,
# not the competition test set.
X_train, y_train = to.train.xs, to.train.ys.values
X_test, y_test = to.valid.xs, to.valid.ys.values

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5)
rf.fit(X_train, y_train);
#mean_absolute_error(y_test, rf.predict(X_test))

# ROC AUC ranks samples by score, so it needs probabilities rather than the
# hard labels returned by `predict`. With several targets, `predict_proba`
# returns one (n_samples, n_classes) array per target; keep the positive-class
# column of each.
# assumes every target has both classes in the training split — TODO confirm
valid_probs = np.column_stack([p[:, 1] for p in rf.predict_proba(X_test)])
roc_auc_score(y_test, valid_probs)

### Kaggle Submission

In [None]:
# Run the competition test set through the same preprocessing that was fitted
# on the training data, so the forest sees features encoded as in training.
test_to = to.new(test_df)
test_to.process()

# Predict on the real test rows. `X_test` is the validation split of the
# training data, so predictions made on it do not line up with `test_df['id']`.
# The notebook scores with ROC AUC, so keep the positive-class probability of
# each target instead of hard labels.
# assumes every target has both classes in the training split — TODO confirm
preds = np.column_stack([p[:, 1] for p in rf.predict_proba(test_to.xs)])

# One column per target, with `id` first to match the submission format.
# `.values` avoids index alignment so ids are copied row-for-row.
sub_df = pd.DataFrame(preds, columns=y_names)
sub_df.insert(0, 'id', test_df['id'].values)

# Save the submission DataFrame as a CSV file
sub_df.to_csv('submission.csv', index=False)

In [None]:
# `rf` is the forest trained above. It must be applied to the competition test
# set, not to `X_test`, which is the validation split of the training data and
# therefore does not correspond row-for-row to `test_df['id']`.
processed_test = to.new(test_df)
processed_test.process()  # apply the procs fitted on the training data

# Positive-class probability per target (the notebook scores with ROC AUC).
# assumes every target has both classes in the training split — TODO confirm
per_target = rf.predict_proba(processed_test.xs)
preds = np.column_stack([proba[:, 1] for proba in per_target])

# Build the submission frame: `id` first, then one column per target.
sub_df = pd.DataFrame(preds, columns=y_names)
sub_df.insert(0, 'id', test_df['id'].values)

# Save the submission DataFrame as a CSV file
sub_df.to_csv('submission.csv', index=False)


In [None]:
!ls

In [None]:
# Assuming `sub_df` is your DataFrame that needs the 'id' column converted
# Convert 'id' column to Int32 by rounding the floating-point numbers
#submission_df['id'] = submission_df['id'].round().astype('Int32')

# Now, 'id' column is of type Int32
#print(submission_df['id'].dtype)


In [None]:
# Save the submission DataFrame to a CSV file.
# `submission_df` is never assigned in this notebook (its only other uses are
# commented out), so write `sub_df`, the frame built by the submission cells.
sub_df.to_csv('submission1.csv', index=False)

In [None]:
!ls

In [None]:
#sub_df = pd.read_csv(path/'sample_submission.csv')
# NOTE(review): no visible cell writes 'submission2.csv' (only 'submission.csv'
# and 'submission1.csv' are written) — confirm the file exists before running.
sub_tree = pd.read_csv('submission2.csv')

In [None]:
sub_tree.info()

In [None]:
sub_df

In [None]:
print(sub_df['id'].dtype)

In [None]:
#target_preds = preds[0]
#target_preds

Submit Outside Kaggle.

In [None]:
#!kaggle competitions submit -c playground-series-s4e3 -f submission.csv -m "fastai baseline, adding lr fron lr finder"

Full Notebook

In [None]:
from fastai.tabular.all import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#hide
#! [ -e /content ]

#hide
#This imports and sets up everything you will need for this notebook
#
#!pip install -Uqq fastbook
#import fastbook
#fastbook.setup_book()

#from fastbook import *
from fastai.tabular.all import *

from fastai.imports import *
np.set_printoptions(linewidth=130)

# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path
import os
import seaborn as sns
import numpy as np
from numpy import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from ipywidgets import interact



matplotlib.rc('image', cmap='Greys')

# `random` is `numpy.random` at this point (see `from numpy import random`
# above), so `random.seed(42)` would seed NumPy only. Use fastai's `set_seed`,
# as the first section of this notebook does.
# NOTE(review): `set_seed` is expected to seed Python, NumPy and PyTorch
# together — confirm against the fastai docs.
set_seed(42)

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for leaf in files:
        found = os.path.join(root, leaf)
        print(found)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Locate the competition data and load the three CSV files.
# (The bare `path` expression was dropped: mid-cell it displays nothing.)
path = Path('/kaggle/input/playground-series-s4e3')

train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
sub_df = pd.read_csv(path/'sample_submission.csv')

# Dependent variables; reused below so the list is written only once.
y_names = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Everything that is not a target is split into continuous / categorical features.
cont_names, cat_names = cont_cat_split(train_df, dep_var=y_names)

# Seed the splitter explicitly so the 80/20 train/validation split is the same
# on every run, independent of which global RNGs were seeded earlier.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df))

# Wrap the training frame together with its preprocessing procs and the split.
to = TabularPandas(train_df, procs=[Categorify, FillMissing,Normalize],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names= y_names,
                  # y_block = CategoryBlock,
                   splits=splits)

#X_train, y_train = to.train.xs, to.train.ys.values.ravel()
#X_test, y_test = to.valid.xs, to.valid.ys.values.ravel()

# NOTE: X_test / y_test are the *validation* split of `to`, not the
# competition test set.
X_train, y_train = to.train.xs, to.train.ys.values
X_test, y_test = to.valid.xs, to.valid.ys.values

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5)
rf.fit(X_train, y_train);
#mean_absolute_error(y_test, rf.predict(X_test))

# Score with probabilities: ROC AUC computed from hard `predict` labels throws
# away the ranking information the metric measures. With several targets,
# `predict_proba` returns one (n_samples, n_classes) array per target; keep
# the positive-class column of each.
# assumes every target has both classes in the training split — TODO confirm
valid_probs = np.column_stack([p[:, 1] for p in rf.predict_proba(X_test)])
roc_auc_score(y_test, valid_probs)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate the files shipped in the Kaggle input directory, one path per line.
for parent, _children, basenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(parent, base) for base in basenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Locate the competition data and load the three CSV files.
# (The bare `path` expression was dropped: mid-cell it displays nothing.)
path = Path('/kaggle/input/playground-series-s4e3')

train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
sub_df = pd.read_csv(path/'sample_submission.csv')

# Dependent variables; reused below so the list is written only once.
y_names = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Everything that is not a target is split into continuous / categorical features.
cont_names, cat_names = cont_cat_split(train_df, dep_var=y_names)

# This cell seeds nothing itself, so pass the seed to the splitter to make the
# 80/20 train/validation split repeatable.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df))

# Wrap the training frame together with its preprocessing procs and the split.
to = TabularPandas(train_df, procs=[Categorify, FillMissing,Normalize],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names= y_names,
                  # y_block = CategoryBlock,
                   splits=splits)

# Batch the preprocessed data for training.
dls = to.dataloaders(bs=64)

# Create the learner *before* anything reads from it.
learn = tabular_learner(dls, metrics=RocAucMulti())

#learn.fit_one_cycle(15, slice(0.0005, 0.019))
learn.fit_one_cycle(12, 0.014)

# Build the test DataLoader from the learner created in this cell. It used to
# be built one statement before `learn` was assigned, which fails on a fresh
# kernel or silently uses a learner left over from an earlier cell.
dl = learn.dls.test_dl(test_df)

preds = learn.get_preds(dl=dl)

# The first element of `get_preds` holds the predictions; write one column per
# target into `test_df`.
target_preds = preds[0]
targets =['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
test_df[targets] = target_preds

# Only `id` and the target columns are written to the submission file.
test_df.to_csv('submission.csv', columns=['id', *targets], index=False)

#sub = pd.read_csv('submission.csv')
#sub.head()