A dataset is considered “cleaned” once you have done things like:

- Handle missing values (drop or fill them)

- Fix column names

- Remove duplicates

- Change data types

- Filter out incorrect or irrelevant rows

In [4]:
#Importing required library
import pandas as pd

In [5]:
# Load the CSV into a DataFrame, using the file's first column as the row index
df = pd.read_csv(filepath_or_buffer="cumulative.csv", index_col=0)

In [6]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0_level_0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [7]:
# Summary of the dataset: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9564 entries, 1 to 9564
Data columns (total 49 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   kepoi_name         9564 non-null   object 
 2   kepler_name        2294 non-null   object 
 3   koi_disposition    9564 non-null   object 
 4   koi_pdisposition   9564 non-null   object 
 5   koi_score          8054 non-null   float64
 6   koi_fpflag_nt      9564 non-null   int64  
 7   koi_fpflag_ss      9564 non-null   int64  
 8   koi_fpflag_co      9564 non-null   int64  
 9   koi_fpflag_ec      9564 non-null   int64  
 10  koi_period         9564 non-null   float64
 11  koi_period_err1    9110 non-null   float64
 12  koi_period_err2    9110 non-null   float64
 13  koi_time0bk        9564 non-null   float64
 14  koi_time0bk_err1   9110 non-null   float64
 15  koi_time0bk_err2   9110 non-null   float64
 16  koi_impact         9201 non-n

In [None]:
# Count missing values per column and show the 15 columns with the most gaps
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(15)

koi_teq_err1        9564
koi_teq_err2        9564
kepler_name         7270
koi_score           1510
koi_steff_err2       483
koi_srad_err1        468
koi_steff_err1       468
koi_slogg_err2       468
koi_slogg_err1       468
koi_srad_err2        468
koi_time0bk_err1     454
koi_period_err1      454
koi_period_err2      454
koi_time0bk_err2     454
koi_impact_err1      454
dtype: int64

In [14]:
# Drop columns with too many missing values:
# a column is kept only if at least 70% of its rows are non-null.
threshold = 0.7 * len(df)
df = df.dropna(axis="columns", thresh=threshold)

In [16]:
# Re-check missing values per column after the sparse columns were dropped (top 15)
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(15)

koi_score            1510
koi_steff_err2        483
koi_slogg_err2        468
koi_srad_err1         468
koi_slogg_err1        468
koi_steff_err1        468
koi_srad_err2         468
koi_time0bk_err2      454
koi_impact_err1       454
koi_period_err2       454
koi_time0bk_err1      454
koi_period_err1       454
koi_depth_err1        454
koi_duration_err2     454
koi_depth_err2        454
dtype: int64

In [21]:
# Drop identifier columns that are irrelevant for the cleaned dataset
columns_to_drop = ["kepid", "kepoi_name", "kepler_name"]

# errors="ignore" skips any listed column that is no longer present in df
df = df.drop(columns=columns_to_drop, errors="ignore")



In [22]:
# Fill missing values in every numeric column with that column's median
numeric_cols = df.select_dtypes(include="number").columns
column_medians = df[numeric_cols].median()
df.loc[:, numeric_cols] = df[numeric_cols].fillna(column_medians)

In [23]:
# Dropping duplicates
# Remove rows that are exact copies of an earlier row; the first occurrence is kept.
# Rows are compared on all columns currently in df.
df = df.drop_duplicates()

In [24]:
# Final verification: print the frame summary again to check the remaining
# columns, their non-null counts and dtypes after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9564 entries, 1 to 9564
Data columns (total 44 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    9564 non-null   object 
 1   koi_pdisposition   9564 non-null   object 
 2   koi_score          9564 non-null   float64
 3   koi_fpflag_nt      9564 non-null   int64  
 4   koi_fpflag_ss      9564 non-null   int64  
 5   koi_fpflag_co      9564 non-null   int64  
 6   koi_fpflag_ec      9564 non-null   int64  
 7   koi_period         9564 non-null   float64
 8   koi_period_err1    9564 non-null   float64
 9   koi_period_err2    9564 non-null   float64
 10  koi_time0bk        9564 non-null   float64
 11  koi_time0bk_err1   9564 non-null   float64
 12  koi_time0bk_err2   9564 non-null   float64
 13  koi_impact         9564 non-null   float64
 14  koi_impact_err1    9564 non-null   float64
 15  koi_impact_err2    9564 non-null   float64
 16  koi_duration       9564 non-n

In [25]:
# Write the cleaned dataset to disk; index=False leaves the row index out of the file
df.to_csv(path_or_buf="cleaned_cumulative.csv", index=False)