### Data Preprocessing

In [1]:
# importing packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option ("display.max_columns", None)


In [2]:
df = pd.read_csv("Visadataset.csv")

In [3]:
df.shape

(25480, 12)

## Data Cleaning




- Handling Missing values
- Handling Duplicates
- Check data type
- Understand the dataset

## Check Null Values


In [4]:
features_with_na = [feature for feature in df.columns if df[feature].isnull().sum()>=1]
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean()*100, 5), "% missing values")





In [5]:
features_with_na

[]

- There are no null values in the dataset

## Handling Duplicates

In [8]:
df.duplicated().sum()

np.int64(0)

- There are no duplicate values in the dataset

**Remove case_id from the dataset as it cannot used in Model Training**

In [11]:
df.drop('case_id', inplace=True, axis =1)

### Feature Engineering

In [12]:
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [24]:
# finding the age of the company
from datetime import date

today_date  = date.today()
current_year = today_date.year
current_year

2025

**Subtract current year with year of estab to get company's age**

In [25]:
df['company_age'] = current_year - df['yr_of_estab']

In [26]:
df['company_age']

0         18
1         23
2         17
3        128
4         20
        ... 
25475     17
25476     19
25477    115
25478    138
25479     65
Name: company_age, Length: 25480, dtype: int64

In [27]:
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,comapny_age,company_age
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,18,18
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,23,23
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,17,17
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,128,128
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,20,20


In [28]:
#drop the yr_of_estab column

df.drop('yr_of_estab',inplace = True, axis =1)

In [29]:
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,comapny_age,company_age
0,Asia,High School,N,N,14513,West,592.2029,Hour,Y,Denied,18,18
1,Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,Certified,23,23
2,Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,Denied,17,17
3,Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,Denied,128,128
4,Africa,Master's,Y,N,1082,South,149907.39,Year,Y,Certified,20,20


### Types of feature

**Numeric Features**

In [31]:
num_features = [feature for feature in df.columns if df[feature].dtype !="O"]
print("Numerical features : ", len(num_features))



Numerical features :  4



**Categorical Features**


In [32]:
cat_features = [feature for feature in df. columns if df[feature].dtype =="O"]

print("Num of categorical Features: ", len(cat_features))

Num of categorical Features:  8



**Discrete features**
