In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [4]:
# Load the dataset from a relative path (resolved against the kernel's
# working directory) into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer="data_science_job.csv")

In [5]:
# Show the loaded frame as this cell's output so the columns and a few
# sample rows can be eyeballed before the missing-value analysis.
df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,,,36.0,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,50-99,Pvt Ltd,47.0,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,,83.0,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,0.0,,Pvt Ltd,52.0,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20.0,50-99,Funded Startup,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14.0,,,42.0,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14.0,,,52.0,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,50-99,Pvt Ltd,44.0,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,0.0,500-999,Pvt Ltd,97.0,0.0


In [6]:
# Number of missing entries in each column (`isna` is the alias of `isnull`).
df.isna().sum()

enrollee_id                  0
city                         0
city_development_index     479
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
training_hours             766
target                       0
dtype: int64

In [7]:
# Share of missing entries in each column, expressed as a percentage:
# the mean of a boolean mask is the fraction of True values.
df.isna().mean().mul(100)

enrollee_id                0.000000
city                       0.000000
city_development_index     2.500261
gender                    23.530640
relevent_experience        0.000000
enrolled_university        2.014824
education_level            2.401086
major_discipline          14.683161
experience                 0.339284
company_size              30.994885
company_type              32.049274
training_hours             3.998330
target                     0.000000
dtype: float64

In [8]:
# Print "<column> <missing count> <missing percentage>" for every column.
for col in df.columns:
    null_flags = df[col].isnull()  # boolean mask, computed once per column
    print(f"{col} {null_flags.sum()} {null_flags.mean() * 100}")

enrollee_id 0 0.0
city 0 0.0
city_development_index 479 2.5002609875769912
gender 4508 23.530639941538784
relevent_experience 0 0.0
enrolled_university 386 2.0148240943731075
education_level 460 2.401085708320284
major_discipline 2813 14.68316108153252
experience 65 0.33928385008873574
company_size 5938 30.994884643490973
company_type 6140 32.04927445453596
training_hours 766 3.998329679507256
target 0 0.0


In [9]:
# Build a summary table with one row per column: name, missing count and
# missing percentage. The null mask is computed once for the whole frame
# and reduced column-wise instead of re-scanning every column twice.
null_mask = df.isnull()
missing_data = {
    "Column Name": df.columns,
    # list(ndarray) keeps the numpy scalar elements and avoids index
    # alignment, so the resulting table gets a plain 0..n-1 row index.
    "Missing Values": list(null_mask.sum().to_numpy()),
    "Missing Percentage (%)": list((null_mask.mean() * 100).to_numpy()),
}

missing_table = pd.DataFrame(missing_data)
print(missing_table)


               Column Name  Missing Values  Missing Percentage (%)
0              enrollee_id               0                0.000000
1                     city               0                0.000000
2   city_development_index             479                2.500261
3                   gender            4508               23.530640
4      relevent_experience               0                0.000000
5      enrolled_university             386                2.014824
6          education_level             460                2.401086
7         major_discipline            2813               14.683161
8               experience              65                0.339284
9             company_size            5938               30.994885
10            company_type            6140               32.049274
11          training_hours             766                3.998330
12                  target               0                0.000000


In [10]:
from tabulate import tabulate

# Render the per-column missing-value summary as a text grid.
# Each row is [column name, missing count, missing percentage].
null_mask = df.isnull()
summary_headers = ["Column Name", "Missing Values", "Missing Percentage (%)"]
table_data = [
    [name, n_missing, pct_missing]
    for name, n_missing, pct_missing in zip(
        df.columns,
        null_mask.sum().to_numpy(),
        (null_mask.mean() * 100).to_numpy(),
    )
]
print(tabulate(table_data, headers=summary_headers, tablefmt="grid"))


+------------------------+------------------+--------------------------+
| Column Name            |   Missing Values |   Missing Percentage (%) |
| enrollee_id            |                0 |                 0        |
+------------------------+------------------+--------------------------+
| city                   |                0 |                 0        |
+------------------------+------------------+--------------------------+
| city_development_index |              479 |                 2.50026  |
+------------------------+------------------+--------------------------+
| gender                 |             4508 |                23.5306   |
+------------------------+------------------+--------------------------+
| relevent_experience    |                0 |                 0        |
+------------------------+------------------+--------------------------+
| enrolled_university    |              386 |                 2.01482  |
+------------------------+------------------+------

In [11]:
pip install import-ipynb

Note: you may need to restart the kernel to use updated packages.


In [20]:
import import_ipynb
from common import display_missing_values_table

In [21]:
# Re-read the CSV so `df` starts from the file contents again, then print
# the missing-value summary through the helper imported from the `common`
# notebook.
# NOTE(review): the helper's implementation is not visible here; the saved
# output matches the tabulate grid printed by the earlier cell.
df = pd.read_csv("data_science_job.csv")
display_missing_values_table(df)

+------------------------+------------------+--------------------------+
| Column Name            |   Missing Values |   Missing Percentage (%) |
| enrollee_id            |                0 |                 0        |
+------------------------+------------------+--------------------------+
| city                   |                0 |                 0        |
+------------------------+------------------+--------------------------+
| city_development_index |              479 |                 2.50026  |
+------------------------+------------------+--------------------------+
| gender                 |             4508 |                23.5306   |
+------------------------+------------------+--------------------------+
| relevent_experience    |                0 |                 0        |
+------------------------+------------------+--------------------------+
| enrolled_university    |              386 |                 2.01482  |
+------------------------+------------------+------

In [66]:
# Select the columns whose share of missing values is strictly between
# 0% and 5%. The percentage is computed once per column and filtered with
# a boolean mask; column order is preserved.
missing_pct = df.isnull().mean() * 100
cols = missing_pct[(missing_pct < 5.0) & (missing_pct > 0.0)].index.tolist()

In [64]:
# Inspect the column names selected by the missing-percentage filter.
# NOTE(review): the saved output of this cell lists columns with no missing
# values (e.g. enrollee_id, city, target), which the `> 0.0` condition in
# the cell defining `cols` excludes. This cell's execution count (64) is
# lower than that cell's (66), so the output looks stale; re-run to refresh.
cols

['enrollee_id',
 'city',
 'city_development_index',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'experience',
 'training_hours',
 'target']

In [40]:
# Exploratory check: show the type of the object that holds the column
# labels (a pandas Index, per the saved output).
type(df.columns)

pandas.core.indexes.base.Index