# Cleaning data
**Handling missing values, outliers, duplicates, data integrity and standardizing data**

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [1]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

In [2]:
# Configurations

run_save = True

run_aminoCluster = True

numericiate = True
instancing = False
encoding = False

seed = 123

In [3]:
# Importing data

parent = 'data'
filename = 'raw.csv'

path = utils.get_path(parent,filename)

df = pd.read_csv(path)

df.head()

print(df.shape)

(13028, 69)


  df = pd.read_csv(path)


In [4]:
df = df.apply(pd.to_numeric, errors='ignore').fillna(df)

numerical_variables = df.select_dtypes(include=['number'])
numerical_variables = numerical_variables.columns

categorical_variables = df.select_dtypes(exclude=['number'])
categorical_variables = categorical_variables.columns

print(f'Numerical: \n{numerical_variables}\n')
print(f'Categorical: \n{categorical_variables}')

Numerical: 
Index(['DNAtype', 'SpeciesID', 'Ncodons', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA',
       'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU',
       'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC',
       'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC',
       'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC',
       'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG',
       'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG', 'UGA'],
      dtype='object')

Categorical: 
Index(['Kingdom', 'SpeciesName', 'UUU', 'UUC'], dtype='object')


In [5]:
missing_values = df.isnull().sum()

print(f'total missing values: {missing_values.sum()}')

if missing_values.sum()>0:
    print("Columns with missing values:")
    for column_name, num_missing in missing_values.items():
        if num_missing > 0:
            print(f"{column_name}\n\tNumber of missing values: {num_missing}")

total missing values: 0


In [6]:
if numericiate:
    from itertools import compress

    def input_numeric(series):

        numeric_series = pd.to_numeric(series, errors='coerce')

        non_numeric_indices = numeric_series.isna()

        if any(non_numeric_indices):
            numeric_sample = np.random.choice(numeric_series[~non_numeric_indices],
                                              sum(non_numeric_indices),
                                              replace=True)
            numeric_series[non_numeric_indices] = numeric_sample


        return numeric_series


    df["UUU"] = input_numeric(df["UUU"])
    df["UUC"] = input_numeric(df["UUC"])

    df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13028 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13028 non-null  object 
 1   DNAtype      13028 non-null  int64  
 2   SpeciesID    13028 non-null  int64  
 3   Ncodons      13028 non-null  int64  
 4   SpeciesName  13028 non-null  object 
 5   UUU          13028 non-null  float64
 6   UUC          13028 non-null  float64
 7   UUA          13028 non-null  float64
 8   UUG          13028 non-null  float64
 9   CUU          13028 non-null  float64
 10  CUC          13028 non-null  float64
 11  CUA          13028 non-null  float64
 12  CUG          13028 non-null  float64
 13  AUU          13028 non-null  float64
 14  AUC          13028 non-null  float64
 15  AUA          13028 non-null  float64
 16  AUG          13028 non-null  float64
 17  GUU          13028 non-null  float64
 18  GUC          13028 non-null  float64
 19  GUA 

In [7]:
if instancing:
    from sklearn.impute import SimpleImputer
    
    numeric_imputer = SimpleImputer(strategy='median')
    df[numerical_variables] = numeric_imputer.fit_transform(df[numerical_variables])

In [8]:
if encoding:
    from sklearn.preprocessing import LabelEncoder
    
    label_encoder = LabelEncoder()

    df[categorical_variables] = df[categorical_variables].apply(label_encoder.fit_transform)
    df[categorical_variables]

In [9]:
# Exporting data

cleaned = df # Add final df

if run_save:
    utils.save_df(cleaned, "cleaned.csv")

csv file saved on:  c:\Users\ptons\Code\repositories\codons\data\cleaned.csv


In [10]:
# amino_dict

dict_amino       = {'Phe': ['UUU','UUC'],
                  'Leu': ['UUA', 'UUG'],
                  'Ser': ['UCU', 'UCC', 'UCA', 'UCG'],
                  'Tyr': ['UAU', 'UAC'],
                  'Stop': ['UAA', 'UAG', 'UGA'],
                  'Cys': ['UGU', 'UGC'],
                  'Trip': ['UGG'],
                  'Leu': ['CUU', 'CUC', 'CUA', 'CUG'],
                  'Pro': ['CCU', 'CCC', 'CCA', 'CCG'],
                  'His': ['CAU', 'CAC'],
                  'Gin': ['CAA','CAG'],
                  'Arg': ['CGU', 'CGC', 'CGA', 'CGG'],
                  'Lle': ['AUU', 'AUC', 'AUA'],
                  'Met': ['AUG'],
                  'Thr': ['ACU', 'ACC', 'ACA', 'ACG'],
                  'Asn': ['AAU', 'AAC'],
                  'Lys': ['AAA', 'AAG'],
                  'Ser': ['AGU', 'AGC'],
                  'Arg': ['AGA', 'AGG'],
                  'Vai': ['GUU', 'GUC', 'GUA', 'GUG'],
                  'Ala': ['GCU', 'GCC', 'GCA', 'GCG'],
                  'Asp': ['GAU', 'GAC'],
                  'Glu': ['GAA', 'GAG'],
                  'Gly': ['GGU', 'GGC', 'GGA', 'GGG']}

aminos = list(dict_amino.keys())

df.head()

Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [11]:
if run_aminoCluster:
    prune_factor = 5

    amino_df = clean.run_aminoCluster(df, dict_amino)
    miscs = ["SpeciesID", "DNAtype","SpeciesName", "Kingdom"]
    complete_df = pd.concat([df[miscs], amino_df], axis=1)

In [12]:
amino_df.describe()

Unnamed: 0,Ncodons,Phe,Leu,Ser,Tyr,Stop,Cys,Trip,Pro,His,...,Lle,Met,Thr,Asn,Lys,Vai,Ala,Asp,Glu,Gly
count,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,...,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0,13028.0
mean,79605.76,0.048258,0.073602,0.020733,0.034382,0.008415,0.014885,0.011611,0.049887,0.023712,...,0.071687,0.021135,0.062682,0.044512,0.050033,0.062886,0.0745,0.045342,0.049974,0.065237
std,719701.0,0.017334,0.038413,0.008767,0.010522,0.011048,0.010463,0.006569,0.014672,0.007684,...,0.032314,0.008163,0.019284,0.014083,0.021846,0.015602,0.024852,0.017552,0.019863,0.017753
min,1000.0,0.0,0.00137,0.0,0.0,0.00017,0.0,0.0,0.0021,0.0,...,0.00338,0.0,0.01144,0.0,0.0,0.00264,0.00784,0.0,0.0,0.0
25%,1602.0,0.03714,0.047948,0.01513,0.02755,0.00206,0.00821,0.007108,0.039957,0.01901,...,0.049857,0.015787,0.051807,0.03565,0.02973,0.054277,0.05956,0.03348,0.036737,0.05477
50%,2927.5,0.04377,0.062945,0.02063,0.03358,0.00319,0.012495,0.01206,0.049625,0.02307,...,0.06125,0.022,0.05919,0.04303,0.051425,0.063905,0.07115,0.05018,0.053195,0.06489
75%,9120.0,0.05686,0.087652,0.0258,0.04014,0.005843,0.01943,0.015383,0.05789,0.02728,...,0.086013,0.02626,0.070123,0.0522,0.063622,0.072163,0.087853,0.05744,0.06322,0.07545
max,40662580.0,0.24251,0.25001,0.43672,0.16178,0.12456,0.16739,0.08382,0.18513,0.10079,...,0.28931,0.10169,0.2097,0.16826,0.18898,0.21558,0.39964,0.22895,0.24543,0.32968


In [13]:
if run_save:
    amino = complete_df
    utils.save_df(amino, "amino.csv")

csv file saved on:  c:\Users\ptons\Code\repositories\codons\data\amino.csv
