In [1]:
## Importing basic required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
## Importing the data from google drive

train_url = 'https://drive.google.com/file/d/1UiFmiaLmD4CKbXh9xWqiUCi-5Vq6ce3_/view?usp=sharing'

train_download_url = 'https://drive.google.com/uc?id=' + train_url.split('/')[-2]

test_url = 'https://drive.google.com/file/d/1Q-YuLzD9M4d7cIcD48LzOJubSY9Poi5A/view?usp=sharing'

test_download_url = 'https://drive.google.com/uc?id=' + test_url.split('/')[-2]


train = pd.read_csv(train_download_url)
test = pd.read_csv(test_download_url)

In [3]:
## checking out the train and test data

print('Train data:\n')
print(train.head())
print()
print('Test data\n')
print(test.head())
print()

Train data:

  class  aa_000 ab_000      ac_000 ad_000 ae_000 af_000 ag_000 ag_001 ag_002  \
0   neg   76698     na  2130706438    280      0      0      0      0      0   
1   neg   33058     na           0     na      0      0      0      0      0   
2   neg   41040     na         228    100      0      0      0      0      0   
3   neg      12      0          70     66      0     10      0      0      0   
4   neg   60874     na        1368    458      0      0      0      0      0   

   ...   ee_002  ee_003  ee_004  ee_005  ee_006  ee_007  ee_008 ee_009 ef_000  \
0  ...  1240520  493384  721044  469792  339156  157956   73224      0      0   
1  ...   421400  178064  293306  245416  133654   81140   97576   1500      0   
2  ...   277378  159812  423992  409564  320746  158022   95128    514      0   
3  ...      240      46      58      44      10       0       0      0      4   
4  ...   622012  229790  405298  347188  286954  311560  433954   1218      0   

  eg_000  
0      0

In [4]:
## Checking out the number of rows and columns in the data

print(f"The number of rows in the train data is {train.shape[0]}.\n")
print(f"The number of columns in the train data is {train.shape[1]}.")

The number of rows in the train data is 60000.

The number of columns in the train data is 171.


In [5]:
print(f"The number of rows in the test data is {test.shape[0]}.\n")
print(f"The number of columns in the test data is {test.shape[1]}.")

The number of rows in the test data is 16000.

The number of columns in the test data is 171.


In [6]:
## understanding the data types of features 

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 171 entries, class to eg_000
dtypes: int64(1), object(170)
memory usage: 78.3+ MB


<strong>Looks like all the independent features are of int64 datatype.</strong>

In [7]:
## Let's find out the number of unique values in each of the independent features

for feature in train.columns:
    print(f"The unique values in the feature {feature} are {train[feature].unique()} (total of {train[feature].nunique()}).\n")

The unique values in the feature class are ['neg' 'pos'] (total of 2).

The unique values in the feature aa_000 are [ 76698  33058  41040 ...  83818  16978 153002] (total of 22095).

The unique values in the feature ab_000 are ['na' '0' '2' '4' '6' '8' '18' '10' '12' '16' '204' '24' '134' '90' '26'
 '14' '34' '52' '20' '22' '30' '36' '100' '48' '58' '32' '28' '44' '46'
 '68'] (total of 30).

The unique values in the feature ac_000 are ['2130706438' '0' '228' ... '8974' '123514' '2602'] (total of 2062).

The unique values in the feature ad_000 are ['280' 'na' '100' ... '5286' '3608' '194868'] (total of 1887).

The unique values in the feature ae_000 are ['0' '16' '104' 'na' '2' '222' '4' '290' '1286' '170' '6' '806' '1464'
 '274' '18' '8' '342' '64' '34' '192' '144' '176' '1512' '120' '58' '114'
 '460' '12' '112' '1314' '20' '550' '36' '1424' '180' '324' '204' '254'
 '106' '154' '92' '256' '202' '352' '26' '124' '2240' '466' '778' '28'
 '100' '14' '276' '42' '82' '22' '384' '282' '386' 

In [8]:
## Finding out the features with less than 50 unique values (These will be categorical features which are encoded as )

print('Features with less than 50 unique values are: \n')
for feature in train.columns:
    if train[feature].nunique() <= 50:
        print(f"{feature}", end=", ")

Features with less than 50 unique values are: 

class, ab_000, as_000, cd_000, ch_000, dz_000, ef_000, eg_000, 

In [23]:
## In the dataset, null values are represented as 'na'. Let's convert them to np.nan

def miss(x):
    if x == 'na':
        return np.nan

    else:
        return x


for feature in train.columns[1:]:
    train[feature] = train[feature].map(miss)

In [24]:
## checking for the missing values in the data

missing_values_df = pd.DataFrame()
missing_values_df['Features'] = train.columns
missing_values_df['Number_of_missing_values'] = train.isnull().sum().to_numpy()
missing_values_df['Percentage_of_missing_values (%)'] = missing_values_df['Number_of_missing_values'].apply(lambda x: np.round((x/train.shape[0])*100),2)
missing_values_df

Unnamed: 0,Features,Number_of_missing_values,Percentage_of_missing_values (%)
0,class,0,0.0
1,aa_000,0,0.0
2,ab_000,46329,77.0
3,ac_000,3335,6.0
4,ad_000,14861,25.0
...,...,...,...
166,ee_007,671,1.0
167,ee_008,671,1.0
168,ee_009,671,1.0
169,ef_000,2724,5.0


In [27]:
## finding the features having more than 20% missing values

useless_feat = dict()
for feature in train.columns[1:]:
    if train[feature].isnull().sum()/train.shape[0] >= 0.2:
        useless_feat[feature] = train[feature].isnull().sum()/train.shape[0]

In [28]:
useless_feat

{'ab_000': 0.77215,
 'ad_000': 0.24768333333333334,
 'bk_000': 0.3839,
 'bl_000': 0.45461666666666667,
 'bm_000': 0.65915,
 'bn_000': 0.7334833333333334,
 'bo_000': 0.7722166666666667,
 'bp_000': 0.7956666666666666,
 'bq_000': 0.8120333333333334,
 'br_000': 0.8210666666666666,
 'cf_000': 0.24768333333333334,
 'cg_000': 0.24768333333333334,
 'ch_000': 0.24768333333333334,
 'co_000': 0.24768333333333334,
 'cr_000': 0.77215,
 'ct_000': 0.23013333333333333,
 'cu_000': 0.23013333333333333,
 'cv_000': 0.23013333333333333,
 'cx_000': 0.23013333333333333,
 'cy_000': 0.23013333333333333,
 'cz_000': 0.23013333333333333,
 'da_000': 0.23013333333333333,
 'db_000': 0.23013333333333333,
 'dc_000': 0.23013333333333333}

In [29]:
train.drop(columns=useless_feat.keys(), axis=1, inplace=True)
test.drop(columns=useless_feat.keys(), axis=1, inplace=True)

<strong>Since there are too many features, instead of finding if each of the feature contains outlier we will try to train the model with and without removing outliers and then check the results.</strong>

<strong>Also, to avoid overfitting, let's use medium criteria while replacing missing values in the remaining data.</strong>

In [31]:
## checking if the dataset is imbalanced or not

train['class'].value_counts()

neg    59000
pos     1000
Name: class, dtype: int64

<strong>It seems the data is highly imbalanced. So, let's use the machine learning algorithms that are immune to this.</strong>