Check and clean the data for better usability
1. Parsed columns
* "period" into "period_name", "period_start", "period_end"
* "named_by" into "named", "named_year"
2. Moved some values out of the wrong columns
* from "type" to "length" (1 value)
* from "named_by" to "species" (5 values)
3. Replaced column "length" (type str) with column "length_meters" (type float)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

import os
# Walk everything under /kaggle/input. The loop body is intentionally a
# no-op: re-enable the print below to list the path of each file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# functions to parse columns

def parse_length(s):
    """Split a length value such as ``"1.0m"`` into ``(number, unit)``.

    The unit is the trailing run of characters that are not digits, dots or
    whitespace, so multi-character units (``"150cm"``) and stray whitespace
    (``"12.5 m "``) are handled, and a value with no unit at all (for example
    a cell pandas already read as a number) keeps every digit instead of
    having its last character mistaken for the unit.

    Args:
        s: Raw cell value; a string like ``"1.0m"``, a number, or a missing
            value (``None`` / ``NaN``).

    Returns:
        ``(np.nan, np.nan)`` for a missing value, otherwise a tuple
        ``(float, str)``. The unit is ``""`` when the value has none.

    Raises:
        ValueError: If the part before the unit is not a valid float.
    """
    if pd.isnull(s):
        return (np.nan, np.nan)
    text = str(s).strip()
    # The lazy first group takes as little as possible, leaving the longest
    # trailing run of unit characters to the second group. The pattern can
    # match the empty string, so fullmatch never returns None here.
    number, unit = re.fullmatch(
        r'(.*?)([^\d.\s]*)', text, flags=re.DOTALL
    ).groups()
    # float() tolerates surrounding whitespace left between number and unit.
    return (float(number), unit)

def parse_period(s):
    """Split a "period" value into ``[period_name, period_start, period_end]``.

    The value is expected to look like ``<name> <start>[-<end>] <trailing
    text>``, e.g. ``"Late Cretaceous 76-74 million years ago"``. When only one
    number is present it is used for both start and end.

    Args:
        s: Raw cell value. Anything that is not a string (such as ``NaN``) or
            that does not match the expected layout is treated as unparsed.

    Returns:
        A three-element list. On success: the stripped period name and the
        start and end as digit strings. Otherwise ``[s, np.nan, np.nan]``,
        with the input passed through untouched.
    """
    # Check the type explicitly rather than using a bare ``except``, which
    # would also swallow KeyboardInterrupt and genuine programming errors.
    match = None
    if isinstance(s, str):
        match = re.match(r'([\D]+)([\d]+)(-?)([\d]*)([\D]+)', s)
    if match is None:
        return [s, np.nan, np.nan]
    name, start, _dash, end, _rest = match.groups()
    # An empty end group means a single number was given: reuse the start.
    return [name.strip(), start, end or start]

def parse_named(s):
    """Split a "named_by" value into ``[named, year]``.

    The value is expected to be text followed by a number that may be
    preceded by an opening parenthesis, e.g. ``"Marsh (1877)"``.

    Args:
        s: Raw cell value. Anything that is not a string (such as ``NaN``) or
            that contains no number after the leading text is treated as
            unparsed.

    Returns:
        A two-element list. On success: the stripped leading text and the
        number as a digit string. Otherwise ``[s, np.nan]``, with the input
        passed through untouched.
    """
    # Check the type explicitly rather than using a bare ``except``, which
    # would also swallow KeyboardInterrupt and genuine programming errors.
    match = None
    if isinstance(s, str):
        match = re.match(r'([^\d\(]+)(\(?)(\d+)', s)
    if match is None:
        return [s, np.nan]
    return [match.group(1).strip(), match.group(3)]

In [None]:
# Load the raw CSV, report its shape, then use the "name" column as the index.
df = pd.read_csv('/kaggle/input/jurassic-park-the-exhaustive-dinosaur-dataset/data.csv')
print(df.shape)
df = df.set_index('name')
df.sample(3)

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Inspect the "type" column: number of distinct values, number of missing
# values, and how often each value occurs.
print(df['type'].nunique())
print(df['type'].isna().sum())
df['type'].value_counts()

In [None]:
# Manual fix: a length ('1.0m') was entered in the "type" column.
# Copy it into "length", clear it from "type", then list the rows whose
# "type" is missing.
print(df['type'].nunique())
df['type'].value_counts().head(10)
length_in_type = df['type'] == '1.0m'
df.loc[length_in_type, 'length'] = '1.0m'
df.loc[length_in_type, 'type'] = np.nan
df.loc[df['type'].isna()]

In [None]:
# Split "period" into three new columns using parse_period.
parsed_periods = df['period'].apply(parse_period)
df['period_name'], df['period_start'], df['period_end'] = zip(*parsed_periods)
df[['period', 'period_name', 'period_start', 'period_end']].sample(3)

In [None]:
# Split "named_by" into "named" and "named_year" using parse_named.
parsed_named = df['named_by'].apply(parse_named)
df['named'], df['named_year'] = zip(*parsed_named)
df[['named_by', 'named', 'named_year']].sample(3)

In [None]:
# Rows where no naming year could be parsed. The author found 5 such rows,
# with "named_by" holding the species name instead.
df.loc[df['named_year'].isnull(), ['named_by', 'named', 'named_year', 'species']]

In [None]:
# Among the rows with a missing "species", count those that still have a
# non-null "named" value.
df.loc[df['species'].isnull(), 'named'].count()

In [None]:
# Move species names that were entered in "named_by" into "species".
# A row qualifies only when BOTH conditions hold: its species is missing and
# no naming year could be parsed. One shared mask keeps the copy and the
# clean-up in step, so an author name is never copied into "species" for a
# row that has a valid year, and "named_by"/"named" are never wiped on a row
# whose species was already present.
misplaced_species = df['named_year'].isnull() & df['species'].isnull()
df.loc[misplaced_species, 'species'] = df.loc[misplaced_species, 'named']
df.loc[misplaced_species, ['named_by', 'named']] = np.nan
df.loc[df['named_year'].isnull(), ['named_by', 'named', 'named_year', 'species']]

In [None]:
# Split "length" into a numeric part and a unit using parse_length.
parsed_lengths = df['length'].apply(parse_length)
df['length_parsed'], df['measure'] = zip(*parsed_lengths)
df[['length', 'length_parsed', 'measure']].sample(3)

In [None]:
# Print how many distinct units occur (the author expects only one: meters),
# then rename the numeric column to "length_meters" and drop the unit column.
print(df['measure'].nunique())
df = df.rename(columns={'length_parsed': 'length_meters'}).drop(columns=['measure'])
df.sample(3)

In [None]:
# List the column labels currently in the frame.
df.columns

In [None]:
# Keep only the cleaned columns, as an independent copy of the data.
df_clean = df.loc[:, [
    'diet', 'lived_in', 'type', 'taxonomy',
    'species', 'period_name', 'period_start',
    'period_end', 'named', 'named_year',
    'length_meters',
]].copy()
print(df_clean.shape)
df_clean.sample(3)

In [None]:
# Write the cleaned frame to a pipe-delimited file in the working directory.
df_clean.to_csv('jurassic_park.csv', sep='|')