In [3]:
# libraries

import numpy as np
import random
import pandas as pd

import warnings

In [4]:
# set warning off
warnings.filterwarnings("ignore")

In [5]:
#read data
x=pd.read_csv('./model_data/features.csv')
y=pd.read_csv('./model_data/labels.csv')

### Check Balance of labels

In [5]:
y['label'].value_counts()

1    589222
0     62219
Name: label, dtype: int64

This means data are highly imbalanced.

## Checking features and labels 

In [6]:
x.shape , y.shape

((1522116, 77), (651441, 2))

This means features and labels are not properly ordered and require proper order.

### checking duplicated

In [8]:
x.duplicated().sum()

522663

In [10]:
y_=y[y.duplicated(keep='last')==False]
y_.shape,y.shape

((651441, 2), (651441, 2))

There are too many duplicates in features (i.e. x). But there are __no duplicates__ in labels

# Data cleaning:
- Remove duplicates
- Properly order features and labels using 'key'
- Balance the data by duplicating data 

### Remove duplicates
Keeping only first data point. 

In [9]:
x_=x[x.duplicated(keep='last')==False]
x_.shape,x.shape

((999453, 77), (1522116, 77))

### Order data as per 'key' values
Using dictionary

In [11]:
from collections import defaultdict
from tqdm import tqdm

In [13]:
x_nokey=x_.loc[:,x_.columns != 'key']
x_key=x_['key']
x_dict = defaultdict()
for i in tqdm(range(len(x_))):
    k=x_key.iloc[i]
    v=x_nokey.iloc[i]
    x_dict[k]=list(v)

100%|██████████| 999453/999453 [01:56<00:00, 8571.24it/s]


In [14]:
len(x_dict.keys()),x_.shape

(999453, (999453, 77))

In [15]:
data_dict = defaultdict()
for i in tqdm(range(len(y))):
    if y['key'][i] in x_dict:
        x_dict[y['key'][i]].append(y['label'][i])
        data_dict[y['key'][i]] = x_dict[y['key'][i]]

100%|██████████| 651441/651441 [00:19<00:00, 33196.31it/s]


In [16]:
data=pd.DataFrame.from_dict(data_dict, orient = 'index')

In [17]:
data.shape

(650954, 77)

In [19]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
bba27b1fffbecb4f37400ec3368995e39dfabbce61422531a55a36c7696c33,0,1,1,1,1,1,0,1,0,1,...,1,1,1,1,1,1,1,0,0,1
41d66270e5a2bf2366d0804462f1f8609c5ecd83900f005dc64ece6ae965bb,0,1,0,1,0,1,0,0,1,1,...,1,0,0,0,1,0,0,1,0,1
995ac7bd74a6050bd4d0bcd7a891fd57ef7073ddd00accfd7ce37920a8d408,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
afb52fe7c13c652190c118f33f42402733766ae714fd4891ecd5def7b226bc,0,1,0,1,0,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,1
0aab946153a05d31cd98ffae63bd68a397745a66a03ed6e1d6199607489e37,1,1,0,1,0,1,0,1,1,1,...,1,1,1,0,0,1,1,1,0,1


### Balance the data 

Now try to make data more balanced by duplicating the data. At least both label should be near equal. Lets make 0 label 7 times.

In [20]:
data[76].value_counts()

1    589001
0     61953
Name: 76, dtype: int64

In [21]:
d_zeros = data[data[76]==0]
d_zeros.shape

(61953, 77)

In [22]:
data_balanced = pd.concat([d_zeros,data,d_zeros,d_zeros,d_zeros,d_zeros,d_zeros,d_zeros,])

In [24]:
data_balanced.shape

(1084625, 77)

In [26]:
data_balanced[76].value_counts()

1    589001
0    495624
Name: 76, dtype: int64

#### To avoid biasnes due to grouping occured in cocatenation operation, shuffle the rows.

In [27]:
data_balanced = data_balanced.sample(frac=1)

In [28]:
data_balanced.shape

(1084625, 77)

## save clean and balanced data

In [33]:
x_clean = data_balanced.iloc[:,0:76]
y_clean = data_balanced.iloc[:,76]

In [34]:
x_clean.shape , y_clean.shape

((1084625, 76), (1084625,))

In [35]:
x_clean.to_csv(r'./model_data/features_clean.csv', index = False, header = True)
y_clean.to_csv(r'./model_data/labels_clean.csv', index = False, header = True)