In [24]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [29]:
# Load the raw visa-application export and preview the first rows.
raw_csv_path = r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\raw_data.csv"
df = pd.read_csv(filepath_or_buffer=raw_csv_path)
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [30]:
# Show column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [31]:
# Count missing values per column.
df.isnull().sum()

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

Generating a synthetic application date for each case (random day between 2016-01-01 and 2024-12-31)

In [32]:
# Synthetic feature: draw one application date per row, uniformly and with
# replacement, from the daily range 2016-01-01 .. 2024-12-31.
# A dedicated, seeded generator makes the draw reproducible under
# "Restart Kernel -> Run All"; the previous unseeded np.random.choice call
# produced different dates (and different downstream features) on every run.
APPLICATION_DATE_SEED = 42
application_date_pool = pd.date_range("2016-01-01", "2024-12-31").to_numpy()
application_date_rng = np.random.default_rng(APPLICATION_DATE_SEED)

df["application_date"] = pd.to_datetime(
    application_date_rng.choice(application_date_pool, size=len(df))
)


Generating a synthetic processing time (in days): a random base value reduced according to case attributes

In [33]:
def generate_processing_time(row, wage_median=None, rng=None):
    """Return a synthetic processing time in days for one application row.

    A base duration is drawn uniformly from 30..179 days (the upper bound 180
    is exclusive) and then shortened by fixed amounts:

    * 10 days if ``full_time_position`` is ``"Y"``
    * 15 days if ``prevailing_wage`` is above ``wage_median``
    * 10 days if ``no_of_employees`` is greater than 50
    * 5 days if ``case_status`` is ``"Certified"``

    The result is never lower than 15 days.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``full_time_position``, ``prevailing_wage``,
        ``no_of_employees`` and ``case_status``.
    wage_median : float, optional
        Threshold for the wage reduction. When omitted, the median of the
        module-level ``df["prevailing_wage"]`` is computed on every call
        (the original behaviour). Pass a precomputed value when applying this
        function row by row, e.g.
        ``df.apply(generate_processing_time, axis=1,
        wage_median=df["prevailing_wage"].median())``, to avoid recomputing
        the same median for each row.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the legacy global
        ``np.random.randint`` is used (the original behaviour). Pass a seeded
        generator for reproducible output.

    Returns
    -------
    int
        Processing time in days, at least 15.
    """
    if wage_median is None:
        # Backward-compatible fallback; loop-invariant, so callers applying
        # this row-wise should pass wage_median explicitly.
        wage_median = df["prevailing_wage"].median()

    if rng is None:
        days = np.random.randint(30, 180)
    else:
        days = int(rng.integers(30, 180))

    if row["full_time_position"] == "Y":
        days -= 10

    # NOTE(review): prevailing_wage is compared as-is; unit_of_wage is not
    # consulted here, so hourly and yearly figures share one threshold.
    if row["prevailing_wage"] > wage_median:
        days -= 15

    if row["no_of_employees"] > 50:
        days -= 10

    if row["case_status"] == "Certified":
        days -= 5

    return max(days, 15)

In [34]:
# Evaluate the generator once per row (axis="columns" is the same as axis=1).
df["processing_time_days"] = df.apply(generate_processing_time, axis="columns")

Calculating the decision date (application date + processing time)

In [35]:
# Decision date = application date shifted forward by the processing time.
processing_delta = pd.to_timedelta(df["processing_time_days"], unit="D")
df["decision_date"] = df["application_date"] + processing_delta


In [36]:
# Preview the frame with the three synthetic columns appended.
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,application_date,processing_time_days,decision_date
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,2017-08-12,49,2017-09-30
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,2023-07-05,28,2023-08-02
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,2022-01-06,129,2022-05-15
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,2024-11-09,110,2025-02-27
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,2021-02-06,73,2021-04-20


In [37]:
# Column groups used by the missing-value checks and imputation that follow.
categorical_cols = [
    "continent", "education_of_employee", "region_of_employment",
    "unit_of_wage", "full_time_position", "has_job_experience",
    "requires_job_training",
]

numerical_cols = [
    "no_of_employees", "yr_of_estab", "prevailing_wage", "processing_time_days",
]


In [38]:
# Missing-value count for each categorical column.
df[categorical_cols].isnull().sum()

continent                0
education_of_employee    0
region_of_employment     0
unit_of_wage             0
full_time_position       0
has_job_experience       0
requires_job_training    0
dtype: int64

In [39]:
# Missing-value count for each numerical column.
df[numerical_cols].isnull().sum()

no_of_employees         0
yr_of_estab             0
prevailing_wage         0
processing_time_days    0
dtype: int64

In [41]:
# Defensive imputation: categorical gaps become "Unknown", numerical gaps
# take the column median.
df = df.assign(
    **{name: df[name].fillna("Unknown") for name in categorical_cols},
    **{name: df[name].fillna(df[name].median()) for name in numerical_cols},
)



Handling data types

In [42]:
# Force the establishment year to an integer type and the wage to a numeric
# type; wage values that cannot be parsed become NaN (errors="coerce").
df = df.assign(
    yr_of_estab=df["yr_of_estab"].astype(int),
    prevailing_wage=pd.to_numeric(df["prevailing_wage"], errors="coerce"),
)


Calculating the age of the company at application time

In [43]:
# Company age = application year minus establishment year, floored at 0;
# the establishment year is dropped once the age has been derived.
application_years = df["application_date"].dt.year
df["company_age"] = (application_years - df["yr_of_estab"]).clip(lower=0)
df = df.drop(columns=["yr_of_estab"])

Normalizing wages to an annual figure

In [44]:
def normalize_wage(row):
    """Convert a row's prevailing wage to an annual amount.

    "Hour" wages are scaled by 40 * 52, "Week" wages by 52 and "Month" wages
    by 12; any other unit is returned unchanged.
    """
    unit = row["unit_of_wage"]
    wage = row["prevailing_wage"]

    if unit == "Hour":
        return wage * 40 * 52
    if unit == "Week":
        return wage * 52
    if unit == "Month":
        return wage * 12
    return wage

# Derive the annual wage row by row, then discard the two source columns.
df["annual_wage"] = df.apply(normalize_wage, axis="columns")
df = df.drop(columns=["prevailing_wage", "unit_of_wage"])

In [45]:
# Class balance of the target before relabelling.
df.case_status.value_counts()

case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

In [46]:
# Rename the positive class: "Certified" becomes "Approved"; "Denied" is kept.
status_renames = {"Certified": "Approved", "Denied": "Denied"}
df["case_status"] = df["case_status"].replace(to_replace=status_renames)

In [47]:
# Remove the case identifier column.
df = df.drop(columns="case_id")

In [48]:
# Re-inspect dtypes and non-null counts after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   continent              25480 non-null  object        
 1   education_of_employee  25480 non-null  object        
 2   has_job_experience     25480 non-null  object        
 3   requires_job_training  25480 non-null  object        
 4   no_of_employees        25480 non-null  int64         
 5   region_of_employment   25480 non-null  object        
 6   full_time_position     25480 non-null  object        
 7   case_status            25480 non-null  object        
 8   application_date       25480 non-null  datetime64[ns]
 9   processing_time_days   25480 non-null  int64         
 10  decision_date          25480 non-null  datetime64[ns]
 11  company_age            25480 non-null  int32         
 12  annual_wage            25480 non-null  float64       
dtypes

In [49]:
# Split the application date into year and month features, then drop the
# original datetime column.
application_dates = df["application_date"]
df["application_year"] = application_dates.dt.year
df["application_month"] = application_dates.dt.month
df = df.drop(columns=["application_date"])


In [50]:
# Preview the frame with application_year / application_month in place.
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,full_time_position,case_status,processing_time_days,decision_date,company_age,annual_wage,application_year,application_month
0,Asia,High School,N,N,14513,West,Y,Denied,49,2017-09-30,10,1231782.032,2017,8
1,Asia,Master's,Y,N,2412,Northeast,Y,Approved,28,2023-08-02,21,83425.65,2023,7
2,Asia,Bachelor's,N,Y,44444,West,Y,Denied,129,2022-05-15,14,122996.86,2022,1
3,Asia,Bachelor's,N,N,98,West,Y,Denied,110,2025-02-27,127,83434.03,2024,11
4,Africa,Master's,Y,N,1082,South,Y,Approved,73,2021-04-20,16,149907.39,2021,2


In [51]:
# Strip the possessive suffix from the degree labels.
degree_renames = {"Bachelor's": "Bachelor", "Master's": "Master"}
df["education_of_employee"] = df["education_of_employee"].replace(
    to_replace=degree_renames
)

Arranging the columns in their final order

In [52]:
# Final column layout: categorical descriptors first, then numeric features,
# then the decision date, the case status and the processing time.
column_order = [
    "continent", "region_of_employment", "education_of_employee",
    "has_job_experience", "requires_job_training", "full_time_position",
    "company_age", "no_of_employees", "annual_wage",
    "application_year", "application_month", "decision_date",
    "case_status", "processing_time_days",
]
df = df[column_order]


In [53]:
# Preview the reordered frame.
df.head()

Unnamed: 0,continent,region_of_employment,education_of_employee,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,processing_time_days
0,Asia,West,High School,N,N,Y,10,14513,1231782.032,2017,8,2017-09-30,Denied,49
1,Asia,Northeast,Master,Y,N,Y,21,2412,83425.65,2023,7,2023-08-02,Approved,28
2,Asia,West,Bachelor,N,Y,Y,14,44444,122996.86,2022,1,2022-05-15,Denied,129
3,Asia,West,Bachelor,N,N,Y,127,98,83434.03,2024,11,2025-02-27,Denied,110
4,Africa,South,Master,Y,N,Y,16,1082,149907.39,2021,2,2021-04-20,Approved,73


In [54]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned.csv", index=False)

In [55]:
# Read the saved file back and preview it to confirm the round-trip.
# No parse_dates argument is passed, so decision_date is not parsed as a datetime here.
df = pd.read_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned.csv")
df.head()

Unnamed: 0,continent,region_of_employment,education_of_employee,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,processing_time_days
0,Asia,West,High School,N,N,Y,10,14513,1231782.032,2017,8,2017-09-30,Denied,49
1,Asia,Northeast,Master,Y,N,Y,21,2412,83425.65,2023,7,2023-08-02,Approved,28
2,Asia,West,Bachelor,N,Y,Y,14,44444,122996.86,2022,1,2022-05-15,Denied,129
3,Asia,West,Bachelor,N,N,Y,127,98,83434.03,2024,11,2025-02-27,Denied,110
4,Africa,South,Master,Y,N,Y,16,1082,149907.39,2021,2,2021-04-20,Approved,73


Label encoding of the binary (Y/N) columns

In [56]:
# Start the encoding stage from the cleaned CSV on disk.
df=pd.read_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned.csv")

In [57]:
# Multi-valued categorical columns; list the distinct values of each.
categorical_cols = ["continent", "region_of_employment", "education_of_employee"]

for column_name in categorical_cols:
    distinct_values = df[column_name].unique()
    print(f"\n{column_name} ({df[column_name].nunique()} unique values):")
    print(distinct_values)



continent (6 unique values):
['Asia' 'Africa' 'North America' 'Europe' 'South America' 'Oceania']

region_of_employment (5 unique values):
['West' 'Northeast' 'South' 'Midwest' 'Island']

education_of_employee (4 unique values):
['High School' 'Master' 'Bachelor' 'Doctorate']


In [58]:
# Encode the Y/N flag columns as 1/0.
binary_cols = ["has_job_experience", "requires_job_training", "full_time_position"]
yes_no_codes = {"Y": 1, "N": 0}

df = df.assign(**{name: df[name].map(yes_no_codes) for name in binary_cols})


In [59]:
# Preview the frame after the Y/N columns were mapped to 1/0.
df.head()

Unnamed: 0,continent,region_of_employment,education_of_employee,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,processing_time_days
0,Asia,West,High School,0,0,1,10,14513,1231782.032,2017,8,2017-09-30,Denied,49
1,Asia,Northeast,Master,1,0,1,21,2412,83425.65,2023,7,2023-08-02,Approved,28
2,Asia,West,Bachelor,0,1,1,14,44444,122996.86,2022,1,2022-05-15,Denied,129
3,Asia,West,Bachelor,0,0,1,127,98,83434.03,2024,11,2025-02-27,Denied,110
4,Africa,South,Master,1,0,1,16,1082,149907.39,2021,2,2021-04-20,Approved,73


In [60]:
# Check dtypes after binary encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   continent              25480 non-null  object 
 1   region_of_employment   25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  int64  
 4   requires_job_training  25480 non-null  int64  
 5   full_time_position     25480 non-null  int64  
 6   company_age            25480 non-null  int64  
 7   no_of_employees        25480 non-null  int64  
 8   annual_wage            25480 non-null  float64
 9   application_year       25480 non-null  int64  
 10  application_month      25480 non-null  int64  
 11  decision_date          25480 non-null  object 
 12  case_status            25480 non-null  object 
 13  processing_time_days   25480 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 2.7+ 

One-hot encoding of the multi-category columns

In [61]:
# Expand each multi-category column into one indicator column per level
# (drop_first=False keeps every level).
one_hot_columns = ["continent", "region_of_employment", "education_of_employee"]
df = pd.get_dummies(data=df, columns=one_hot_columns, drop_first=False)

In [62]:
# Preview the one-hot encoded frame (indicator columns are still boolean).
df.head()

Unnamed: 0,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,...,continent_South America,region_of_employment_Island,region_of_employment_Midwest,region_of_employment_Northeast,region_of_employment_South,region_of_employment_West,education_of_employee_Bachelor,education_of_employee_Doctorate,education_of_employee_High School,education_of_employee_Master
0,0,0,1,10,14513,1231782.032,2017,8,2017-09-30,Denied,...,False,False,False,False,False,True,False,False,True,False
1,1,0,1,21,2412,83425.65,2023,7,2023-08-02,Approved,...,False,False,False,True,False,False,False,False,False,True
2,0,1,1,14,44444,122996.86,2022,1,2022-05-15,Denied,...,False,False,False,False,False,True,True,False,False,False
3,0,0,1,127,98,83434.03,2024,11,2025-02-27,Denied,...,False,False,False,False,False,True,True,False,False,False
4,1,0,1,16,1082,149907.39,2021,2,2021-04-20,Approved,...,False,False,False,False,True,False,False,False,False,True


In [63]:
# Convert every boolean indicator column to integer 0/1.
dummy_cols = df.select_dtypes(include="bool").columns
df = df.astype({name: int for name in dummy_cols})


In [64]:
# Preview the frame with the indicator columns as integers.
df.head()

Unnamed: 0,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,...,continent_South America,region_of_employment_Island,region_of_employment_Midwest,region_of_employment_Northeast,region_of_employment_South,region_of_employment_West,education_of_employee_Bachelor,education_of_employee_Doctorate,education_of_employee_High School,education_of_employee_Master
0,0,0,1,10,14513,1231782.032,2017,8,2017-09-30,Denied,...,0,0,0,0,0,1,0,0,1,0
1,1,0,1,21,2412,83425.65,2023,7,2023-08-02,Approved,...,0,0,0,1,0,0,0,0,0,1
2,0,1,1,14,44444,122996.86,2022,1,2022-05-15,Denied,...,0,0,0,0,0,1,1,0,0,0
3,0,0,1,127,98,83434.03,2024,11,2025-02-27,Denied,...,0,0,0,0,0,1,1,0,0,0
4,1,0,1,16,1082,149907.39,2021,2,2021-04-20,Approved,...,0,0,0,0,1,0,0,0,0,1


In [65]:
# Write the one-hot encoded frame to disk without the row index.
df.to_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned_encoded.csv", index=False)

In [66]:
# Read the encoded file back and preview it.
df=pd.read_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned_encoded.csv")
df.head()

Unnamed: 0,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,...,continent_South America,region_of_employment_Island,region_of_employment_Midwest,region_of_employment_Northeast,region_of_employment_South,region_of_employment_West,education_of_employee_Bachelor,education_of_employee_Doctorate,education_of_employee_High School,education_of_employee_Master
0,0,0,1,10,14513,1231782.032,2017,8,2017-09-30,Denied,...,0,0,0,0,0,1,0,0,1,0
1,1,0,1,21,2412,83425.65,2023,7,2023-08-02,Approved,...,0,0,0,1,0,0,0,0,0,1
2,0,1,1,14,44444,122996.86,2022,1,2022-05-15,Denied,...,0,0,0,0,0,1,1,0,0,0
3,0,0,1,127,98,83434.03,2024,11,2025-02-27,Denied,...,0,0,0,0,0,1,1,0,0,0
4,1,0,1,16,1082,149907.39,2021,2,2021-04-20,Approved,...,0,0,0,0,1,0,0,0,0,1


In [67]:
# Encode the target: Approved -> 1, Denied -> 0.
status_to_label = {"Approved": 1, "Denied": 0}
df["case_status"] = df["case_status"].map(status_to_label)

In [68]:
# Preview the frame with the numeric target.
df.head()

Unnamed: 0,has_job_experience,requires_job_training,full_time_position,company_age,no_of_employees,annual_wage,application_year,application_month,decision_date,case_status,...,continent_South America,region_of_employment_Island,region_of_employment_Midwest,region_of_employment_Northeast,region_of_employment_South,region_of_employment_West,education_of_employee_Bachelor,education_of_employee_Doctorate,education_of_employee_High School,education_of_employee_Master
0,0,0,1,10,14513,1231782.032,2017,8,2017-09-30,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0,1,21,2412,83425.65,2023,7,2023-08-02,1,...,0,0,0,1,0,0,0,0,0,1
2,0,1,1,14,44444,122996.86,2022,1,2022-05-15,0,...,0,0,0,0,0,1,1,0,0,0
3,0,0,1,127,98,83434.03,2024,11,2025-02-27,0,...,0,0,0,0,0,1,1,0,0,0
4,1,0,1,16,1082,149907.39,2021,2,2021-04-20,1,...,0,0,0,0,1,0,0,0,0,1


In [69]:
# Class balance of the encoded target.
df["case_status"].value_counts()

case_status
1    17018
0     8462
Name: count, dtype: int64

In [70]:
# Any label other than "Approved"/"Denied" would have been mapped to NaN; count them.
df["case_status"].isnull().sum()

0

In [71]:
# Overwrite the encoded CSV now that case_status is numeric.
df.to_csv(r"C:\Users\sachin\visa\visa-status-prediction-1\datasets\Final_Cleaned_encoded.csv", index=False)