In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Reading the competition's train and test CSV files into dataframes

train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2021/train.csv",
        "../input/tabular-playground-series-apr-2021/test.csv",
    )
)

In [None]:
# Preview only the first rows of the raw training data instead of the whole frame
train_data.head()

In [None]:
# Correlation matrix (numeric columns only)

# The raw frame still holds text columns (e.g. Sex is "male"/"female"), which
# cannot be correlated; pandas >= 2.0 raises on them instead of silently
# dropping them, so the numeric columns are selected explicitly first.
correlation = train_data.select_dtypes(include="number").corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation, annot=True)
plt.title("Correlation between numeric training columns")
plt.show()

In [None]:
# Function to process the train dataset

def preprocess_inputs(df):
    """Return a cleaned, encoded copy of a raw passenger dataframe.

    The input frame is copied first, so the caller's dataframe is not modified.

    Steps:
      1. Drop the "Cabin", "Name" and "Ticket" columns.
      2. Fill missing "Age" and "Fare" values with the median of that column,
         computed on the frame that was passed in.
      3. Encode "Sex" as integers: "male" -> 0, "female" -> 1.
      4. One-hot encode "Embarked" into columns prefixed with "embark" and
         drop the original "Embarked" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns "Cabin", "Name", "Ticket",
        "Age", "Fare", "Sex" and "Embarked".

    Returns
    -------
    pandas.DataFrame
        The processed copy; every other column is passed through unchanged.
    """

    # Creating a copy of the dataframe

    df = df.copy()

    # Dropping columns with excessive missing values and unnecessary columns for cardinality

    df = df.drop(columns=["Cabin", "Name", "Ticket"])

    # Filling the missing values in Age and Fare columns with Median

    df["Age"] = df["Age"].fillna(df["Age"].median())
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # Binary encoding for Sex column.
    # Integers (not the strings "0"/"1") are used so the column is a genuine
    # numeric feature; any label other than "male"/"female" becomes NaN.

    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

    # One-hot encoding the Embarked column

    dummies = pd.get_dummies(df["Embarked"], prefix="embark")
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(columns="Embarked")

    return df

In [None]:
# Run the preprocessing on the raw training data; the processed copy is bound to `train`
train = preprocess_inputs(train_data)

In [None]:
# Preview only the first rows of the processed training data instead of the whole frame
train.head()

In [None]:
# Preview only the first rows of the raw test data instead of the whole frame
test_data.head()

In [None]:
# Function to process the test dataset

def preprocess_inputs_test(df):
    """Return a cleaned, encoded copy of the raw test dataframe.

    The test data needs the same steps as the training data, so this function
    delegates to ``preprocess_inputs`` rather than repeating them. Keeping a
    single implementation means the two datasets cannot end up encoded
    differently when one copy is edited and the other is not.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame; it must satisfy the column requirements of
        ``preprocess_inputs``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``preprocess_inputs`` returns for ``df``.
    """

    return preprocess_inputs(df)

In [None]:
# Run the preprocessing on the raw test data; the processed copy is bound to `test`
test = preprocess_inputs_test(test_data)

In [None]:
# Preview the first rows of the processed test data
test.head() 

In [None]:
# Separating the processed training data into the target (y_train)
# and the remaining feature columns (X_train)

y_train = train["Survived"]
X_train = train.drop(columns="Survived")

In [None]:
# Modelling and fitting the data

# A fixed random_state makes the forest (and therefore the predictions and the
# submission file) reproducible when the notebook is re-run from a fresh kernel.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Prediction on the test set
# NOTE(review): assumes `test` carries exactly the same feature columns as
# X_train. The one-hot "embark_*" columns are derived separately for each
# frame, so verify the two column sets match before relying on this.

y_preds = model.predict(test) 

In [None]:
# Wrapping the predictions in a pandas Series so they can be concatenated
# column-wise with PassengerId when the submission frame is built

y_preds = pd.Series(y_preds)

In [None]:
# Building the submission: PassengerId from the sample file next to the predictions

sub_df = pd.read_csv("../input/tabular-playground-series-apr-2021/sample_submission.csv")
submission = pd.concat(
    [sub_df["PassengerId"], y_preds.rename("Survived")],
    axis=1,
)
submission.to_csv("Submission_colab_rf.csv", index=False)

In [None]:
# Hyperparameter tuning - randomized search over random forest settings

# Candidate values sampled by the search for each random forest parameter
rf_grid = {"n_estimators": np.arange(10, 500, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

# random_state is fixed on both the estimator and the search: the first makes
# each fitted forest repeatable, the second makes the 20 sampled parameter
# combinations repeatable, so best_params_ is stable across re-runs.
rs_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True,
                           random_state=42)

rs_rf.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the randomized search
rs_rf.best_params_

In [None]:
# Predict on the processed test set with the fitted randomized search object
# NOTE(review): assumes `test` has the same feature columns as X_train - confirm
y_preds_tuned = rs_rf.predict(test)

In [None]:
# Wrap the tuned predictions in a pandas Series so they can be concatenated
# column-wise with PassengerId when the submission frame is built
y_preds_tuned = pd.Series(y_preds_tuned) 

In [None]:
# Building the tuned-model submission: PassengerId from the sample file next to the predictions

sub_df = pd.read_csv("../input/tabular-playground-series-apr-2021/sample_submission.csv")
submission = pd.concat(
    [sub_df["PassengerId"], y_preds_tuned.rename("Survived")],
    axis=1,
)
submission.to_csv("Submission_rf_tuned.csv", index=False)