### Authors Data Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw author table from disk, then preview up to its first 30 rows.
authors_df = pd.read_csv(filepath_or_buffer='data/Author Data.csv')
authors_df.head(n=30)

Unnamed: 0,author_id,author_name
0,1,yaksh
1,2,XIT
2,3,Daniel Meyer
3,4,Seedify Fund
4,5,Ifedolapo Shiloh Olotu
5,6,Brian
6,7,Nick Babich
7,8,Xerxemi
8,9,Harry Alexander
9,10,E2Analyst


### Preprocessing author metadata

#### No empty entries

In [3]:
# Count missing values per column (column-wise sum of the null mask).
authors_df.isna().sum(axis=0)

author_id      0
author_name    0
dtype: int64

#### Use only ',' to separate author names, replacing any other separator characters

In [4]:
import re
# Replace known separators with commas (only when used between words)
def normalize_author_name(name):
    """Rewrite the separators between co-authors in ``name`` as ``', '``.

    Pipe, slash and ampersand are always treated as separators. A hyphen is
    treated as a separator only when it has whitespace on both sides, so
    hyphenated names such as "Jean-Luc" are left untouched.

    Parameters
    ----------
    name : str or null-like
        Raw author string. Null-like values (``None``, ``NaN``) are returned
        unchanged.

    Returns
    -------
    str or null-like
        The name with separators normalized and outer whitespace stripped.
    """
    if pd.isnull(name):
        return name
    # Pipe, slash and ampersand (plus any surrounding whitespace) become ', '.
    name = re.sub(r'\s*[\|/&]\s*', ', ', name)
    # Hyphen counts as a separator only when whitespace-delimited ("A - B");
    # this keeps hyphenated names like "Jean-Luc" intact.
    name = re.sub(r'\s+-\s+', ', ', name)
    return name.strip()

# Keep the separator-normalized names in a new column beside the raw one.
authors_df['author_name_cleaned'] = authors_df['author_name'].map(normalize_author_name)


In [5]:
# Spot-check a pipe-separated entry to see how its separator was rewritten.
authors_df.loc[authors_df['author_name'].eq('Render Network | RNDR Team')]

Unnamed: 0,author_id,author_name,author_name_cleaned
44,45,Render Network | RNDR Team,"Render Network, RNDR Team"


#### Confirm all authors are ',' separated

In [6]:
# Rows whose normalized name contains no comma. The check runs on the cleaned
# column, because the raw column still carries the original separators
# (e.g. '|'). The comma is matched literally rather than as a regex.
# Any previewed row that still shows a separator other than ',' was missed
# by the normalization step.
no_comma_authors = authors_df[
    ~authors_df['author_name_cleaned'].str.contains(',', regex=False, na=False)
]
no_comma_authors.head(10)


In [7]:
# Convert each name component to TitleCase
def to_camel_case(name):
    """Title-case every comma-separated component of ``name``.

    Despite the function's name, this applies ``str.title()`` to each
    component, so an all-caps token such as "XIT" becomes "Xit".
    Components are trimmed and re-joined with ``', '``. Null-like input is
    returned unchanged.
    """
    if pd.isnull(name):
        return name
    titled_components = []
    for component in name.split(','):
        titled_components.append(component.strip().title())
    return ', '.join(titled_components)

# Title-case the cleaned names, overwriting the cleaned column.
authors_df['author_name_cleaned'] = authors_df['author_name_cleaned'].map(to_camel_case)

In [8]:
# Compare raw and cleaned names side by side for the first ten rows.
authors_df.head(n=10)

Unnamed: 0,author_id,author_name,author_name_cleaned
0,1,yaksh,Yaksh
1,2,XIT,Xit
2,3,Daniel Meyer,Daniel Meyer
3,4,Seedify Fund,Seedify Fund
4,5,Ifedolapo Shiloh Olotu,Ifedolapo Shiloh Olotu
5,6,Brian,Brian
6,7,Nick Babich,Nick Babich
7,8,Xerxemi,Xerxemi
8,9,Harry Alexander,Harry Alexander
9,10,E2Analyst,E2Analyst


In [9]:
# Look at the same pipe-separated entry again, now after title-casing.
authors_df.loc[authors_df['author_name'].eq('Render Network | RNDR Team')]

Unnamed: 0,author_id,author_name,author_name_cleaned
44,45,Render Network | RNDR Team,"Render Network, Rndr Team"


In [10]:
def replace_names_in_parenthesis(name):
    """Remove every parenthesised segment from ``name``.

    Each ``(...)`` group is deleted together with the whitespace directly
    in front of it, and the result is stripped of outer whitespace.
    Null-like input is returned unchanged.
    """
    if not pd.isnull(name):
        without_parens = re.sub(r'\s*\([^)]*\)', '', name)
        return without_parens.strip()
    return name

    

In [11]:
# Preview names that contain an opening parenthesis, before they are stripped.
# The pattern is a raw string: '\(' in a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
authors_df[authors_df['author_name'].str.contains(r'\(', na=False)].head(30)

Unnamed: 0,author_id,author_name,author_name_cleaned
58,59,Kaoutar Tarik (高達烈),Kaoutar Tarik (高達烈)
232,233,Tarik Kaoutar (高達烈),Tarik Kaoutar (高達烈)
252,253,Alastair (Ali) Mitchell,Alastair (Ali) Mitchell
303,304,Dwayne Wong (Omowale),Dwayne Wong (Omowale)
398,399,Oluwatomilola (Cyberhawk),Oluwatomilola (Cyberhawk)
505,506,Gowthamaraj Rajendran (@fuffsec),Gowthamaraj Rajendran (@Fuffsec)
536,537,"(KJH) Kuan-Jung, Huang","(Kjh) Kuan-Jung, Huang"
548,549,Mateusz (meshcode),Mateusz (Meshcode)
586,587,Pundi X (writers),Pundi X (Writers)
637,638,LockTrip.com (LOC Token) Official Blog,Locktrip.Com (Loc Token) Official Blog


In [12]:
# Remove parenthesised parts from the cleaned names, overwriting the column.
authors_df['author_name_cleaned'] = authors_df['author_name_cleaned'].map(replace_names_in_parenthesis)

In [13]:
# Re-display the rows whose raw name contains an opening parenthesis, so the
# cleaned column can be checked against it. The pattern is a raw string
# because '\(' in a plain string literal is an invalid escape sequence.
authors_df[authors_df['author_name'].str.contains(r'\(', na=False)].head(30)

Unnamed: 0,author_id,author_name,author_name_cleaned
58,59,Kaoutar Tarik (高達烈),Kaoutar Tarik
232,233,Tarik Kaoutar (高達烈),Tarik Kaoutar
252,253,Alastair (Ali) Mitchell,Alastair Mitchell
303,304,Dwayne Wong (Omowale),Dwayne Wong
398,399,Oluwatomilola (Cyberhawk),Oluwatomilola
505,506,Gowthamaraj Rajendran (@fuffsec),Gowthamaraj Rajendran
536,537,"(KJH) Kuan-Jung, Huang","Kuan-Jung, Huang"
548,549,Mateusz (meshcode),Mateusz
586,587,Pundi X (writers),Pundi X
637,638,LockTrip.com (LOC Token) Official Blog,Locktrip.Com Official Blog


In [14]:
# Swap the cleaned names in for the raw ones. The guard makes re-running this
# cell a no-op: without it, a second run would drop the already renamed
# 'author_name' column and lose the cleaned names.
if 'author_name_cleaned' in authors_df.columns:
    authors_df = (
        authors_df
        .drop(columns='author_name')
        .rename(columns={'author_name_cleaned': 'author_name'})
    )

In [15]:
# Inspect the column labels left after the drop/rename in the previous cell.
authors_df.columns

Index(['author_id', 'author_name'], dtype='object')

In [16]:
# Write the cleaned table to CSV with a header row and without the index.
authors_df.to_csv(
    path_or_buf='data/Authors_Data_Cleaned.csv',
    index=False,
    header=True,
)