## Importing libraries & data

In [None]:
import numpy as np
import pandas as pd

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Encoding
from sklearn.preprocessing import LabelEncoder

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Read the competition train/test tables and the sample submission template.
train_df = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/train.csv",
    index_col=False,
)
test_df = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv",
    index_col=False,
)
sample_submission = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/sample_submission.csv"
)

In [None]:
# Check training file
# Drop the row_id column. errors='ignore' keeps this cell safe to re-run:
# once row_id is gone, a second run is a no-op instead of a KeyError.
train_df = train_df.drop(columns=['row_id'], errors='ignore')
train_df.head()

In [None]:
# Check test file
# Drop the row_id column. errors='ignore' keeps this cell safe to re-run:
# once row_id is gone, a second run is a no-op instead of a KeyError.
test_df = test_df.drop(columns=['row_id'], errors='ignore')
test_df.head()

## Reduce Memory usage: 

Taken from: https://www.kaggle.com/sfktrkl/tps-feb-2022/notebook

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive).
    Float columns are cast to float16 or float32 when their range fits,
    otherwise to float64. Every other column is converted to ``category``.

    The function is idempotent: columns that are already int8/float16 are
    recognised as numeric and keep their dtype on a second call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float downcast only checks the value range, not precision; float16
    keeps far fewer significant digits than float64.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes take the numeric path; extension dtypes
        # (category, string, nullable ints, ...) fall through to 'category'.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range did not fit (or min/max were NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# reduce_mem_usage rewrites each frame's columns in place, so the returned
# frames are not reassigned; the last call's return value is the cell output.
reduce_mem_usage(train_df)
reduce_mem_usage(test_df)
reduce_mem_usage(sample_submission)

## Explore Dataset 

In [None]:
# List the column labels of the training frame.
train_df.columns

In [None]:
# Summary statistics for the training frame.
train_df.describe()

## Check dataset 

In [None]:
# Report (rows, columns) for both frames.
for caption, frame in (('Train data shape:', train_df), ('Test data shape:', test_df)):
    print(caption, frame.shape)

In [None]:
# Check missing values
# Count NaNs per column and report only the columns that contain any;
# an empty dict means the frame has no missing values.

missing_values_train = train_df.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0].to_dict()))

missing_values_test = test_df.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0].to_dict()))

## Check duplicates

In [None]:
# Number of rows that exactly repeat an earlier row, per frame.
duplicates_train = train_df.duplicated().sum()
duplicates_test = test_df.duplicated().sum()

print('Duplicates in train data: {0}'.format(duplicates_train))
print('Duplicates in test data: {0}'.format(duplicates_test))

In [None]:
# Horizontal bar chart of how many rows each target class has.
target_counts = train_df['target'].value_counts()
target_counts.plot(kind='barh')

From the chart above, the target classes look balanced.

## Features 



In [None]:
# Columns from position 11 up to (but excluding) the last column.
# NOTE(review): the selection is purely positional — confirm that these
# columns really are categorical in this dataset.
categorical_features = train_df.columns[11:-1]
categorical_names = list(categorical_features)
print("Categorical Columns: \n{0}".format(categorical_names))

### Numerical features

In [None]:
# Columns at positions 1 through 10.
# NOTE(review): position 0 is skipped by this slice — confirm that is intended.
numerical_features = train_df.columns[1:11]
print("Numerical Columns: \n{0}".format(list(numerical_features)))
numeric_view = train_df[numerical_features]
numeric_view.describe()

In [None]:
# Remove duplicates: keep the first occurrence of every repeated row,
# then recount to confirm none remain.
train_df = train_df.drop_duplicates(keep='first')
duplicates_train = train_df.duplicated().sum()

print('Train data shape:', train_df.shape)
print('Duplicates in train data: {0}'.format(duplicates_train))

## Label Encode Target label

In [None]:
# Encode the class names as integers without overwriting train_df["target"].
# Leaving the original column untouched keeps this cell idempotent: a re-run
# still fits the encoder on the class names rather than on already-encoded
# integers, so inverse_transform keeps returning class names.
target_encoder = LabelEncoder()
encoded_target = target_encoder.fit_transform(train_df["target"])

X = train_df.drop(["target"], axis=1)
y = pd.Series(encoded_target, index=train_df.index, name="target")

In [None]:
# Class labels learned by the encoder; each label's position is its integer code.
target_encoder.classes_

## Split dataset into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## Prepare Voting Classifier

> We use soft voting: with ‘soft’, the class label is predicted from the argmax of the sums of the predicted probabilities, whereas ‘hard’ uses the predicted class labels for majority-rule voting.

In [None]:
# Two tree ensembles combined by soft voting.
clf1 = RandomForestClassifier(max_depth=8, random_state=42, n_jobs=-1)
# Seed ExtraTrees as well so a fresh Restart & Run All rebuilds the same model.
clf2 = ExtraTreesClassifier(n_estimators=1300, random_state=42, n_jobs=-1)
# NOTE(review): the ExtraTrees estimator is registered under the label 'gnb';
# the label is kept unchanged so any lookup by that name keeps working.
eclf1 = VotingClassifier(estimators=[('rf', clf1), ('gnb', clf2)], voting='soft')
eclf1 = eclf1.fit(X_train, Y_train)

## Validate the model

In [None]:
# Score the ensemble on the held-out validation rows.
valid_pred = eclf1.predict(X_valid)
valid_score = accuracy_score(y_true=Y_valid, y_pred=valid_pred)
print("Accuracy:", valid_score)

## Predict Probabilities on Test Data

In [None]:
# Per-class probabilities for every test row; the shape is shown as the cell output.
y_probs= eclf1.predict_proba(test_df)
y_probs.shape

Use `np.argmax` to get the index of the maximum probability, shown here for the first test row.

In [None]:
#Probabilities 
y_probs[0], np.argmax(y_probs[0])

## Convert Probability to Class name

In [None]:
# predict_proba columns follow eclf1.classes_, so translate the argmax column
# position into the encoded label through classes_ before decoding it back to
# the class name. This stays correct even if the column order and the encoded
# label values do not coincide.
best_columns = np.argmax(y_probs, axis=1)
y_pred_tuned = target_encoder.inverse_transform(eclf1.classes_[best_columns])

In [None]:
# Share (in percent) of test rows assigned to each predicted class, ordered by label.
predicted_labels = pd.Series(y_pred_tuned, index=test_df.index)
label_counts = predicted_labels.value_counts().sort_index()
label_counts / len(test_df) * 100

## Prepare submission

In [None]:
# Overwrite the target column with the predicted class names and write the
# frame to submission.csv without the DataFrame index.
sample_submission["target"] = y_pred_tuned
sample_submission.to_csv("submission.csv", index=False)
sample_submission