### STEP I: DATA CLEANING

### Step 1 - Import Libraries

In [1]:
import pandas as pd
import numpy as np


### Step 2 - Load Dataset

In [2]:
# Location of the raw resume-screening CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only loads the data on a machine where this exact location exists.
raw_csv_path = r"C:\Users\Priya\Desktop\Speedup\Data Science\All_Data_Science_Projects\Python_Packages\AI_based_Resume_Screening\AI_Resume_Screening.csv"
df = pd.read_csv(raw_csv_path)


In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


### Step 3 - Inspect Dataset

In [4]:
# Check structure: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Resume_ID               1000 non-null   int64 
 1   Name                    1000 non-null   object
 2   Skills                  1000 non-null   object
 3   Experience (Years)      1000 non-null   int64 
 4   Education               1000 non-null   object
 5   Certifications          726 non-null    object
 6   Job Role                1000 non-null   object
 7   Recruiter Decision      1000 non-null   object
 8   Salary Expectation ($)  1000 non-null   int64 
 9   Projects Count          1000 non-null   int64 
 10  AI Score (0-100)        1000 non-null   int64 
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [5]:
# Count missing values per column (isna is the method isnull aliases).
df.isna().sum()

Resume_ID                   0
Name                        0
Skills                      0
Experience (Years)          0
Education                   0
Certifications            274
Job Role                    0
Recruiter Decision          0
Salary Expectation ($)      0
Projects Count              0
AI Score (0-100)            0
dtype: int64

In [6]:
# Check shape: (number of rows, number of columns).
df.shape

(1000, 11)

### Step 4 - Fix Column Names

In [7]:
# Map the raw headers, which contain spaces, parentheses and symbols, to
# underscore-separated names that are easier to reference in code.
column_renames = {
    "Experience (Years)": "Experience_Years",
    "Job Role": "Job_Role",
    "Recruiter Decision": "Recruiter_Decision",
    "Salary Expectation ($)": "Salary_Expectation_Dollars",
    "Projects Count": "Projects_Count",
    "AI Score (0-100)": "AI_Score",
}

# Reassign the result rather than using inplace=True: the outcome for `df` is
# the same, but the statement stays chainable and follows the recommended
# pandas style.
df = df.rename(columns=column_renames)

In [8]:
# Display the column labels to confirm the rename took effect.
df.columns

Index(['Resume_ID', 'Name', 'Skills', 'Experience_Years', 'Education',
       'Certifications', 'Job_Role', 'Recruiter_Decision',
       'Salary_Expectation_Dollars', 'Projects_Count', 'AI_Score'],
      dtype='object')

### Step 5 - Handle Missing Values

In [9]:
# Count missing values per column before filling them.
df.isna().sum()

Resume_ID                       0
Name                            0
Skills                          0
Experience_Years                0
Education                       0
Certifications                274
Job_Role                        0
Recruiter_Decision              0
Salary_Expectation_Dollars      0
Projects_Count                  0
AI_Score                        0
dtype: int64

In [10]:
# Replace missing certifications with the placeholder string "None".
# The filled column is assigned back to `df` instead of calling
# fillna(inplace=True) on the df["Certifications"] selection: that chained
# in-place form raises a FutureWarning in recent pandas 2.x releases and does
# not update `df` once Copy-on-Write is enabled.
# NOTE(review): pandas.read_csv treats the literal text "None" as missing by
# default, so this placeholder is parsed back as NaN if the saved CSV is
# reloaded without keep_default_na=False -- confirm how downstream code reads it.
df["Certifications"] = df["Certifications"].fillna("None")

In [11]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

Resume_ID                     0
Name                          0
Skills                        0
Experience_Years              0
Education                     0
Certifications                0
Job_Role                      0
Recruiter_Decision            0
Salary_Expectation_Dollars    0
Projects_Count                0
AI_Score                      0
dtype: int64

In [12]:
# Show the dtype of each column.
df.dtypes

Resume_ID                      int64
Name                          object
Skills                        object
Experience_Years               int64
Education                     object
Certifications                object
Job_Role                      object
Recruiter_Decision            object
Salary_Expectation_Dollars     int64
Projects_Count                 int64
AI_Score                       int64
dtype: object

### Step 6 - Check Duplicate Records

In [13]:
# Count rows that are exact duplicates (all columns) of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


### Step 7 - Save Clean Data

In [14]:
# Write the cleaned frame to the current working directory, without the
# row index.
# NOTE(review): the "None" placeholder stored in Certifications is one of
# pandas.read_csv's default NA tokens, so it is read back as NaN unless the
# file is loaded with keep_default_na=False -- confirm downstream loaders.
cleaned_csv_path = "cleaned_resumes.csv"
df.to_csv(cleaned_csv_path, index=False)