In [None]:
#Importing the required libraries
from fastai.tabular.all import * 
#Show up to 50 columns when a wide frame is displayed
pd.set_option("display.max_columns", 50)

In [None]:
#Loading and looking at training data
train_df = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")
train_df.head()

In [None]:
#Loading and looking at test data
test_df = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")
test_df.head()

## Data Exploration

In [None]:
#Understand the data by looking into shape
train_df.shape, test_df.shape

In [None]:
#Checking is there any null value in the training data
train_df.isna().sum()

In [None]:
#understand how the data is distributed(T represent transpose which is helps to visualize better)
train_df.describe().T

In [None]:
#Pairwise correlation between the numeric columns.
#The string column f_27 is still present at this point, so the numeric columns are
#selected explicitly: recent pandas releases raise on non-numeric columns instead of
#silently dropping them.
corr = train_df.select_dtypes(include="number").corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
#Print the column dtypes and non-null counts of the training data
train_df.info()

## Data preprocessing/Feature Engineering

In [None]:
#f_27 is the only column that is not continuous (it is non-numeric), so it is handled separately
string_var = 'f_27'

In [None]:
#Split the string column into one column per character so each position can be used as a feature
def feature_27(df, do_convert=True, column=None, n_chars=10):
    """Expand a fixed-width string column into per-character columns and drop it.

    Args:
        df: Frame holding the string column. It is not modified.
        do_convert: When True, add `<column>_0` .. `<column>_<n_chars - 1>`,
            each holding the character at that position. When False, the
            string column is only dropped.
        column: Name of the string column. Defaults to the notebook-level
            `string_var` ('f_27').
        n_chars: Number of leading character positions to expand.

    Returns:
        A new DataFrame without `column`. If `column` is already absent
        (for example because the calling cell was re-run), an unchanged
        copy of `df` is returned.
    """
    if column is None:
        column = string_var
    if column not in df.columns:
        #Already processed: re-running the calling cell becomes a no-op instead of a KeyError
        return df.copy()
    if do_convert:
        chars = {f'{column}_{i}': df[column].str.get(i) for i in range(n_chars)}
        #assign() returns a new frame, so the caller's frame is left untouched
        df = df.assign(**chars)
    return df.drop(columns=[column])

In [None]:
#Applying the above function into traing and test data
train_df = feature_27(train_df, do_convert=True)
test_df = feature_27(test_df, do_convert=True)

In [None]:
#After applying the data function, just checking the shape of the data
train_df.shape, test_df.shape

In [None]:
#Looking the feature engineered data
train_df.head()

In [None]:
#Splitting the data into categorical and continous
cont_names = [col for col in train_df if train_df[col].dtype =="float64"]
cat_names = [col for col in train_df if train_df[col].dtype !="float64"]
cont_names

In [None]:
cat_names

In [None]:
#id is a unique row number and target is the label, so neither is a feature.
#Filtering (rather than list.remove) keeps this cell safe to re-run.
cat_names = [name for name in cat_names if name not in ("target", "id")]

In [None]:
#Splitting the data randomly for traing and validation
splits = RandomSplitter(valid_pct=0.2)(range_of(train_df))

In [None]:
#Looking into data how it splits
splits

## Preprocessing

In [None]:
#Wrap the frame in a TabularPandas object: Categorify and Normalize are applied as procs,
#and target is the label handled through CategoryBlock
to = TabularPandas(
    train_df,
    procs=[Categorify, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='target',
    y_block=CategoryBlock,
    splits=splits,
)

In [None]:
#looking to training dataset
to.xs.iloc[:2]

In [None]:
#checking the shape
to.xs.shape,train_df.shape

In [None]:
#Loading the data batch size of 2048
dls = to.dataloaders(bs=2048)

In [None]:
#Looking the data loaders
dls.show_batch()

In [None]:
#Building the tabular learner
learn = tabular_learner(dls, 
                        metrics=accuracy, 
                        layers = [512,256,128,128,64], 
                        wd=0.1)

In [None]:
#Run the learning-rate finder and keep the suggestions of the `minimum` and `steep` functions
lr_min,lr_steep = learn.lr_find(suggest_funcs=(minimum, steep))

In [None]:
#Report both suggested learning rates
print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")

In [None]:
#Fit the model for 30 epochs with the one-cycle policy, using the steepest-point learning rate
learn.fit_one_cycle(30, lr_steep)

In [None]:
#Plot the losses recorded during training to see how the model is performing
learn.recorder.plot_loss()

In [None]:
#Look at some results from the learner
learn.show_results()

In [None]:
#Print the learner summary to understand the pipeline
learn.summary()

In [None]:
#Predict a single row (the first row of the training frame)
row, clas, probs = learn.predict(train_df.iloc[0])

In [None]:
#Show the row returned by the prediction
row.show()

In [None]:
#Plot the confusion matrix to check how the model classifies
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
#Preparing the dataset for the prediction
test_df.head()

In [None]:
#Droping the irrelevant column
test_df.drop("id", axis=1, inplace= True)

In [None]:
#Look into the test dataset
test_df

In [None]:
#Build a test DataLoader from the learner's DataLoaders for the test frame
dl = learn.dls.test_dl(test_df)

In [None]:
#Look at the test DataLoader
dl

In [None]:
#Converting to submission format
preds = learn.get_preds(dl=dl)[0].argmax(1).numpy()
preds[:5]

In [None]:
#Understand the shape
preds.shape

In [None]:
#Look into the submission format
sample = pd.read_csv("../input/tabular-playground-series-may-2022/sample_submission.csv")
sample

In [None]:
sub = pd.DataFrame({'id':sample.id, 'target': preds})
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
#Training and validation set for bagging and boosting algorithms
X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()

In [None]:
#Exploring the training and validation datasets

In [None]:
X_train

In [None]:
y_train

In [None]:
X_valid

In [None]:
y_valid

In [None]:
#understanding the shapes
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
%%time
from sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier().fit(X_train, y_train)

In [None]:
clf.score(X_valid, y_valid)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf1 = RandomForestClassifier().fit(X_train, y_train)

In [None]:
clf1.score(X_valid, y_valid)

Comparing all of the models above, the neural net performs best. Some hyperparameter tuning of the boosting and bagging models may give higher accuracy. Let's explore that in the next notebook.