In [1]:
import pandas as pd
import numpy as np

In [2]:
# sample dataset

data = {
    'hero' : ['iron man', 'thor', 'hulk', 'black widow', 'hawkeye', np.nan, 'spider-man'], 
    'power level': [85, 95, 100, 70, 65, np.nan, 80], 
    'speed': [np.nan, 90, 70, 85, 60, 75, 80], 
    'iq' : [95, 85, 70, 80, np.nan, 65, 90]
}

df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [4]:
df.to_csv('superheroes.csv', index=False) 

In [5]:
df = pd.read_csv('superheroes.csv')

In [6]:
df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [7]:
# listwise deletion (remove rows with missing)
listwise_deleted_df = df.dropna() 

In [8]:
df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [9]:
listwise_deleted_df

Unnamed: 0,hero,power level,speed,iq
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
6,spider-man,80.0,80.0,90.0


In [10]:
# drop specific columns with too many missing values 

pairwise_deleted_df = df.dropna(axis=1)

In [11]:
pairwise_deleted_df

0
1
2
3
4
5
6


In [12]:
listwise_deleted_df.reset_index(drop=True)

Unnamed: 0,hero,power level,speed,iq
0,thor,95.0,90.0,85.0
1,hulk,100.0,70.0,70.0
2,black widow,70.0,85.0,80.0
3,spider-man,80.0,80.0,90.0


In [13]:
listwise_deleted_df

Unnamed: 0,hero,power level,speed,iq
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
6,spider-man,80.0,80.0,90.0


In [14]:
help(listwise_deleted_df.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key: 'ValueKeyFunc' = None) method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools, must match the length of
         the by.
    inplace : bool, default False
         If True, 

In [15]:
df.isna()

Unnamed: 0,hero,power level,speed,iq
0,False,False,True,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,True
5,True,True,False,False
6,False,False,False,False


In [16]:
df.isna().sum()

hero           1
power level    1
speed          1
iq             1
dtype: int64

In [17]:
df.isna().sum(axis=1)

0    1
1    0
2    0
3    0
4    1
5    2
6    0
dtype: int64

In [18]:
df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [None]:
!pip install scikit-learn

In [19]:
from sklearn.impute import SimpleImputer 
from sklearn.impute import KNNImputer 

In [20]:
# mean imputation 

mean_imputer = SimpleImputer(strategy='mean') 

In [21]:
df['power level'] = mean_imputer.fit_transform(df[['power level']])

In [22]:
df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,82.5,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [23]:
# median impute 
median_imputer = SimpleImputer(strategy='median')

df['speed'] = median_imputer.fit_transform(df[['speed']])

df

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,77.5,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,
5,,82.5,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [24]:
df_a = pd.read_csv('superheroes.csv')

In [26]:
numerical_cols = df_a.select_dtypes(include=['number'])
string_cols = df.select_dtypes(exclude=['number']) 



In [28]:
numerical_cols

Unnamed: 0,power level,speed,iq
0,85.0,,95.0
1,95.0,90.0,85.0
2,100.0,70.0,70.0
3,70.0,85.0,80.0
4,65.0,60.0,
5,,75.0,65.0
6,80.0,80.0,90.0


In [33]:
# K nearest neighbour 
# imute entire dataframe
knn_imputer = KNNImputer(n_neighbors=2)

df_knn = pd.DataFrame(knn_imputer.fit_transform(numerical_cols), columns=numerical_cols.columns)

In [34]:
df_knn

Unnamed: 0,power level,speed,iq
0,85.0,85.0,95.0
1,95.0,90.0,85.0
2,100.0,70.0,70.0
3,70.0,85.0,80.0
4,65.0,60.0,77.5
5,85.0,75.0,65.0
6,80.0,80.0,90.0


In [35]:
df_final = pd.concat([string_cols, df_knn], axis = 1)

df_final

Unnamed: 0,hero,power level,speed,iq
0,iron man,85.0,85.0,95.0
1,thor,95.0,90.0,85.0
2,hulk,100.0,70.0,70.0
3,black widow,70.0,85.0,80.0
4,hawkeye,65.0,60.0,77.5
5,,85.0,75.0,65.0
6,spider-man,80.0,80.0,90.0


In [36]:
df_final['affiliation'] = ['Avengers', 'Avengers', 'Avengers', 'S.H.I.E.L.D.', 'S.H.I.E.L.D.', 'Avengers', 'Avengers']
df_final['rank'] = ['A', 'S', 'S', 'A', 'B', 'A', 'S']

In [37]:
df_final

Unnamed: 0,hero,power level,speed,iq,affiliation,rank
0,iron man,85.0,85.0,95.0,Avengers,A
1,thor,95.0,90.0,85.0,Avengers,S
2,hulk,100.0,70.0,70.0,Avengers,S
3,black widow,70.0,85.0,80.0,S.H.I.E.L.D.,A
4,hawkeye,65.0,60.0,77.5,S.H.I.E.L.D.,B
5,,85.0,75.0,65.0,Avengers,A
6,spider-man,80.0,80.0,90.0,Avengers,S


In [38]:
# one hot
df_final = pd.get_dummies(df_final, columns=['affiliation'])

In [39]:
# label 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_final['rank'] = le.fit_transform(df_final['rank'])

In [40]:
df_final

Unnamed: 0,hero,power level,speed,iq,rank,affiliation_Avengers,affiliation_S.H.I.E.L.D.
0,iron man,85.0,85.0,95.0,0,1,0
1,thor,95.0,90.0,85.0,2,1,0
2,hulk,100.0,70.0,70.0,2,1,0
3,black widow,70.0,85.0,80.0,0,0,1
4,hawkeye,65.0,60.0,77.5,1,0,1
5,,85.0,75.0,65.0,0,1,0
6,spider-man,80.0,80.0,90.0,2,1,0


In [42]:
df_final.columns

Index(['hero', 'power level', 'speed', 'iq', 'rank', 'affiliation_Avengers',
       'affiliation_S.H.I.E.L.D.'],
      dtype='object')

In [43]:
df_c = df_final.copy()

In [41]:
# normalization
from sklearn.preprocessing import MinMaxScaler

In [44]:
scaler = MinMaxScaler()
df_c [ ['power level', 'speed', 'iq']] = scaler.fit_transform(df_c [['power level', 'speed', 'iq']])


In [45]:
df_c

Unnamed: 0,hero,power level,speed,iq,rank,affiliation_Avengers,affiliation_S.H.I.E.L.D.
0,iron man,0.571429,0.833333,1.0,0,1,0
1,thor,0.857143,1.0,0.666667,2,1,0
2,hulk,1.0,0.333333,0.166667,2,1,0
3,black widow,0.142857,0.833333,0.5,0,0,1
4,hawkeye,0.0,0.0,0.416667,1,0,1
5,,0.571429,0.5,0.0,0,1,0
6,spider-man,0.428571,0.666667,0.833333,2,1,0


In [46]:
df_d = df_final.copy()

In [47]:
# standardization 

from sklearn.preprocessing import StandardScaler

scaler_s = StandardScaler()

df_d [ ['power level', 'speed', 'iq']] = scaler_s.fit_transform(df_d [['power level', 'speed', 'iq']])

In [48]:
df_d

Unnamed: 0,hero,power level,speed,iq,rank,affiliation_Avengers,affiliation_S.H.I.E.L.D.
0,iron man,0.184637,0.745356,1.485273,0,1,0
1,thor,1.046278,1.267105,0.47094,2,1,0
2,hulk,1.477098,-0.819892,-1.050559,2,1,0
3,black widow,-1.107823,0.745356,-0.036226,0,0,1
4,hawkeye,-1.538644,-1.86339,-0.289809,1,0,1
5,,0.184637,-0.298142,-1.557726,0,1,0
6,spider-man,-0.246183,0.223607,0.978107,2,1,0
