In [5]:
## Importing basic required libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [6]:
## Loading the train and test CSV files from Google Drive

# Sharing links as copied from Drive; the file id is the second-to-last
# path segment of each link.
train_url = 'https://drive.google.com/file/d/1UiFmiaLmD4CKbXh9xWqiUCi-5Vq6ce3_/view?usp=sharing'
test_url = 'https://drive.google.com/file/d/1Q-YuLzD9M4d7cIcD48LzOJubSY9Poi5A/view?usp=sharing'

# Rebuild each sharing link as a direct-download link that pandas can read.
train_download_url = f"https://drive.google.com/uc?id={train_url.split('/')[-2]}"
test_download_url = f"https://drive.google.com/uc?id={test_url.split('/')[-2]}"

train = pd.read_csv(train_download_url)
test = pd.read_csv(test_download_url)

In [None]:
## Previewing the first rows of the train and test data

# Each frame is printed under its own heading, followed by a blank line.
for heading, frame in (('Train data:\n', train), ('Test data\n', test)):
    print(heading)
    print(frame.head())
    print()

In [None]:
## Reporting the dimensions (rows, columns) of the train data

n_train_rows, n_train_cols = train.shape
print(f"The number of rows in the train data is {n_train_rows}.\n")
print(f"The number of columns in the train data is {n_train_cols}.")

In [None]:
## Reporting the dimensions (rows, columns) of the test data

n_test_rows, n_test_cols = test.shape
print(f"The number of rows in the test data is {n_test_rows}.\n")
print(f"The number of columns in the test data is {n_test_cols}.")

In [None]:
## Understanding the data types of the features:
## `info()` prints a summary of the train DataFrame (column dtypes and non-null counts).

train.info()

<strong>It looks like all of the independent features have the int64 data type.</strong>

In [None]:
## Listing the distinct values (and how many there are) for every column of the train data

for col in train.columns:
    distinct_values = train[col].unique()
    n_distinct = train[col].nunique()
    print(f"The unique values in the feature {col} are {distinct_values} (total of {n_distinct}).\n")

In [None]:
## Finding the features with fewer than 50 unique values.
## NOTE(review): these are presumably categorical features stored in an encoded form - confirm.

print('Features with less than 50 unique values are: \n')
for feature in train.columns:
    # Strict comparison so the listing matches the "less than 50" stated in the heading
    # (the previous `<= 50` also reported features with exactly 50 unique values).
    if train[feature].nunique() < 50:
        print(f"{feature}", end=", ")

In [None]:
## In the dataset, null values are represented as 'na'. Let's convert them to np.nan

def miss(x):
    """Return ``np.nan`` when ``x`` equals the placeholder string 'na'; otherwise return ``x`` unchanged."""
    return np.nan if x == 'na' else x


# The first column is left untouched; every other column is passed value-by-value through `miss`.
columns_to_clean = train.columns[1:]
for col in columns_to_clean:
    train[col] = train[col].map(miss)

In [None]:
## Summarising the missing values in the train data (one row per column)

missing_values_df = pd.DataFrame()
missing_values_df['Features'] = train.columns
missing_values_df['Number_of_missing_values'] = train.isnull().sum().to_numpy()
# Percentage of rows that are missing, rounded to 2 decimals.
# The precision has to be given to the rounding call itself: previously the `2` sat outside
# `np.round(...)` and was handed to `Series.apply` as a stray positional argument, so the
# percentages were silently rounded to whole numbers.
missing_values_df['Percentage_of_missing_values (%)'] = (
    missing_values_df['Number_of_missing_values'] / train.shape[0] * 100
).round(2)
missing_values_df

In [None]:
## Collecting the features whose share of missing values is at least 20%
## (feature name -> fraction of train rows that are missing); the first column is skipped

n_train_rows = train.shape[0]
useless_feat = {}
for col in train.columns[1:]:
    missing_ratio = train[col].isnull().sum() / n_train_rows
    if missing_ratio >= 0.2:
        useless_feat[col] = missing_ratio

In [None]:
## Displaying the flagged features together with their fraction of missing values in the train data
useless_feat

In [None]:
## Removing the flagged features from both datasets (in place), train first and then test

cols_to_drop = useless_feat.keys()
for frame in (train, test):
    frame.drop(columns=cols_to_drop, inplace=True)

<strong>Since there are too many features to check each one for outliers individually, we will train the model both with and without removing outliers and then compare the results.</strong>

<strong>Also, to avoid overfitting, let's use the median when replacing missing values in the remaining data.</strong>

In [None]:
## Checking whether the dataset is imbalanced by counting the rows for each value of the 'class' column

train['class'].value_counts()

<strong>It seems the data is highly imbalanced, so let's use machine learning algorithms that are robust to class imbalance.</strong>