In [2]:
import os
from datetime import timedelta

import numpy as np
import pandas as pd

In [4]:
# Location of the raw dataset. The original notebook hard-coded an absolute,
# user-specific Windows path; it is kept as the default so existing runs behave
# the same, but it can now be overridden without editing the notebook by
# setting the VISA_RAW_DATA_PATH environment variable.
RAW_DATA_PATH = os.environ.get(
    "VISA_RAW_DATA_PATH",
    r"C:\Users\sachinpc\Desktop\visa_prediction\datasets\raw_data.csv",
)

df = pd.read_csv(RAW_DATA_PATH)
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [5]:
# Print each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [3]:
# Count missing values per column.
df.isnull().sum()

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

Generating a synthetic application date (sampled at random between 2016-01-01 and 2024-12-31)

In [None]:
# Seed NumPy's global RNG so the synthetic columns are reproducible on
# "Restart Kernel -> Run All". The same global RNG is used by the
# np.random.randint call inside generate_processing_time, so seeding here
# also fixes those draws as long as the cells are run in order.
SEED = 42
np.random.seed(SEED)

# Candidate application days: every calendar day in the range (both ends included).
application_days = pd.date_range("2016-01-01", "2024-12-31")

# Draw one day per row, with replacement, and store it as a datetime column.
df["application_date"] = pd.to_datetime(
    np.random.choice(application_days, size=len(df))
)


Generating a synthetic processing time: a random baseline reduced according to case attributes

In [5]:
def generate_processing_time(row, wage_median=None):
    """Simulate a processing time, in days, for a single application row.

    A baseline is drawn from NumPy's global RNG with ``np.random.randint(30, 180)``
    (30 to 179 inclusive) and then reduced by fixed amounts:

    * 10 days when ``full_time_position`` is ``"Y"``
    * 15 days when ``prevailing_wage`` is above ``wage_median``
    * 10 days when ``no_of_employees`` is greater than 50
    * 5 days when ``case_status`` is ``"Certified"``

    The result is never lower than 15 days.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``full_time_position``, ``prevailing_wage``,
        ``no_of_employees`` and ``case_status``.
    wage_median : float, optional
        Threshold for the wage reduction. When omitted, it falls back to
        ``df["prevailing_wage"].median()`` from the notebook's global ``df``,
        which is recomputed on every call. Pass it once to avoid that cost and
        the dependency on global state, e.g.
        ``df.apply(generate_processing_time, axis=1, wage_median=median)``.

    Returns
    -------
    int
        Simulated processing time in days (at least 15).

    Notes
    -----
    NOTE(review): ``prevailing_wage`` is compared as-is. In this notebook the
    wage is only converted to an annual figure in a later cell, so hourly,
    weekly, monthly and yearly amounts are compared against one median here.
    Confirm that this is intended.
    """
    if wage_median is None:
        # Backward-compatible fallback: same threshold the original code used.
        wage_median = df["prevailing_wage"].median()

    days = np.random.randint(30, 180)

    if row["full_time_position"] == "Y":
        days -= 10

    if row["prevailing_wage"] > wage_median:
        days -= 15

    if row["no_of_employees"] > 50:
        days -= 10

    if row["case_status"] == "Certified":
        days -= 5

    # Floor the result so stacked reductions cannot produce very small values.
    return max(days, 15)

In [6]:
# Call generate_processing_time once per row (axis=1). Each call draws from
# NumPy's global RNG, so the values change between runs unless it is seeded.
df["processing_time_days"] = df.apply(generate_processing_time, axis=1)

Calculating the decision date (application date + processing time)

In [7]:
# Turn the day counts into timedeltas, then shift each application date by them.
processing_delta = pd.to_timedelta(df["processing_time_days"], unit="D")
df["decision_date"] = df["application_date"] + processing_delta


In [8]:
# Preview the first rows with the newly added date and processing-time columns.
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,application_date,processing_time_days,decision_date
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,2016-12-08,19,2016-12-27
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,2023-09-21,72,2023-12-02
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2022-04-29,95,2022-08-02
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2020-02-08,140,2020-06-27
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,2016-09-25,116,2017-01-19


In [9]:
# Columns holding string labels; used below for the null check and for
# filling missing values with "Unknown".
categorical_cols = [
    "continent",
    "education_of_employee",
    "region_of_employment",
    "unit_of_wage",
    "full_time_position",
    "has_job_experience",
    "requires_job_training"
]

# Numeric columns; used below for the null check and for median imputation.
# Note that processing_time_days is the column generated earlier in this notebook.
numerical_cols = [
    "no_of_employees",
    "yr_of_estab",
    "prevailing_wage",
    "processing_time_days"
]


In [11]:
# Count missing values in each categorical column.
df[categorical_cols].isnull().sum()

continent                0
education_of_employee    0
region_of_employment     0
unit_of_wage             0
full_time_position       0
has_job_experience       0
requires_job_training    0
dtype: int64

In [12]:
# Count missing values in each numerical column.
df[numerical_cols].isnull().sum()

no_of_employees         0
yr_of_estab             0
prevailing_wage         0
processing_time_days    0
dtype: int64

In [13]:
# Impute missing values (a safeguard: the null checks above report zero nulls).
#
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df[col]. That chained in-place form operates on
# an intermediate Series: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves df unchanged.

# Categorical columns: mark missing labels explicitly.
for col in categorical_cols:
    df[col] = df[col].fillna("Unknown")

# Numerical columns: replace missing values with the column median.
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())


Handling data types

In [15]:
# Enforce the expected dtypes: establishment year as an integer, and wage as a
# number (values that cannot be parsed become NaN because of errors="coerce").
df = df.assign(
    yr_of_estab=df["yr_of_estab"].astype(int),
    prevailing_wage=pd.to_numeric(df["prevailing_wage"], errors="coerce"),
)


Calculating the age of the company at application time

In [18]:
# Company age = application year minus establishment year, floored at zero so a
# company established after the (synthetic) application year does not get a
# negative age. The establishment year itself is dropped afterwards.
df = df.assign(
    company_age=(df["application_date"].dt.year - df["yr_of_estab"]).clip(lower=0)
).drop(columns=["yr_of_estab"])

Normalizing wages to an annual amount

In [19]:
def normalize_wage(row):
    """Convert a row's prevailing wage to an annual amount.

    The wage is scaled according to ``row["unit_of_wage"]``:

    * ``"Hour"``  -> wage * 40 * 52
    * ``"Week"``  -> wage * 52
    * ``"Month"`` -> wage * 12

    Any other unit leaves the wage unchanged.
    """
    # Factors are applied one after another, left to right.
    scale_factors = {
        "Hour": (40, 52),
        "Week": (52,),
        "Month": (12,),
    }

    annual = row["prevailing_wage"]
    for factor in scale_factors.get(row["unit_of_wage"], ()):
        annual = annual * factor
    return annual

# Compute the annual wage row by row, then drop the two source columns it replaces.
df = df.assign(annual_wage=df.apply(normalize_wage, axis=1)).drop(
    columns=["prevailing_wage", "unit_of_wage"]
)

In [20]:
# Class balance of the outcome column.
df["case_status"].value_counts()

case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

In [21]:
# Relabel the outcome: "Certified" becomes "Approved"; "Denied" maps to itself.
status_labels = {"Certified": "Approved", "Denied": "Denied"}
df["case_status"] = df["case_status"].replace(status_labels)

In [23]:
# Remove the case identifier column.
df = df.drop(columns=["case_id"])

In [25]:
# Re-check dtypes and non-null counts after the transformations above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   continent              25480 non-null  object        
 1   education_of_employee  25480 non-null  object        
 2   has_job_experience     25480 non-null  object        
 3   requires_job_training  25480 non-null  object        
 4   no_of_employees        25480 non-null  int64         
 5   region_of_employment   25480 non-null  object        
 6   full_time_position     25480 non-null  object        
 7   case_status            25480 non-null  object        
 8   application_date       25480 non-null  datetime64[ns]
 9   processing_time_days   25480 non-null  int64         
 10  decision_date          25480 non-null  datetime64[ns]
 11  company_age            25480 non-null  int32         
 12  annual_wage            25480 non-null  float64       
dtypes

In [26]:
# Keep only the year and month of the application date; the full date is dropped.
application_dates = df["application_date"]
df = df.assign(
    application_year=application_dates.dt.year,
    application_month=application_dates.dt.month,
).drop(columns=["application_date"])


In [27]:
# Preview the frame after replacing application_date with its year and month.
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,full_time_position,case_status,processing_time_days,decision_date,company_age,annual_wage,application_year,application_month
0,Asia,High School,N,N,14513,West,Y,Denied,19,2016-12-27,9,1231782.032,2016,12
1,Asia,Master's,Y,N,2412,Northeast,Y,Approved,72,2023-12-02,21,83425.65,2023,9
2,Asia,Bachelor's,N,Y,44444,West,Y,Denied,95,2022-08-02,14,122996.86,2022,4
3,Asia,Bachelor's,N,N,98,West,Y,Denied,140,2020-06-27,123,83434.03,2020,2
4,Africa,Master's,Y,N,1082,South,Y,Approved,116,2017-01-19,11,149907.39,2016,9


In [28]:
# Strip the possessive suffix from the two degree labels that carry one.
education_labels = {"Bachelor's": "Bachelor", "Master's": "Master"}
df["education_of_employee"] = df["education_of_employee"].replace(education_labels)

Arranging the columns in their final order

In [29]:
# Final column layout: applicant/job descriptors, employer attributes, wage,
# date fields, then the outcome and the processing time.
# NOTE(review): decision_date was computed as application_date +
# processing_time_days, so it carries information about processing_time_days.
# Confirm whether it should be kept if that column is a prediction target.
column_order = [
    "continent",
    "region_of_employment",
    "education_of_employee",
    "has_job_experience",
    "requires_job_training",
    "full_time_position",
    "company_age",
    "no_of_employees",
    "annual_wage",
    "application_year",
    "application_month",
    "decision_date",
    "case_status",
    "processing_time_days",
]
df = df[column_order]


In [30]:
# Preview the frame in its final column order.
df.head()

Unnamed: 0,continent,region_of_employment,education_of_employee,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,processing_time_days
0,Asia,West,High School,N,N,Y,9,14513,1231782.032,2016,12,2016-12-27,Denied,19
1,Asia,Northeast,Master,Y,N,Y,21,2412,83425.65,2023,9,2023-12-02,Approved,72
2,Asia,West,Bachelor,N,Y,Y,14,44444,122996.86,2022,4,2022-08-02,Denied,95
3,Asia,West,Bachelor,N,N,Y,123,98,83434.03,2020,2,2020-06-27,Denied,140
4,Africa,South,Master,Y,N,Y,11,1082,149907.39,2016,9,2017-01-19,Approved,116


In [33]:
# Write the final frame to Final_Cleaned.csv (a path relative to the current
# working directory), without the index column.
df.to_csv("Final_Cleaned.csv", index=False)