In [None]:
# Internship: Data Analysis
# Project 1 : Data Collection and Cleaning 
# Developer:  Muhammad Mohsin


In [1]:
# The first step is going on we are importing required libraries

import pandas as pd
import numpy as np


In [30]:
# Now we are loading or uploading our dataset which we are going to clean
df = pd.read_json("data_secinetis_job.json")

# Exploring dataSet
df.head()


Unnamed: 0,job_title,seniority_level,status,company,location,post_date,headquarter,industry,ownership,company_size,revenue,salary,skills
0,data scientist,senior,hybrid,company_003,"Grapevine, TX . Hybrid",17 days ago,"Bentonville, AR, US",Retail,Public,€352.44B,Public,"€100,472 - €200,938","['spark', 'r', 'python', 'scala', 'machine lea..."
1,data scientist,lead,hybrid,company_005,"Fort Worth, TX . Hybrid",15 days ago,"Detroit, MI, US",Manufacturing,Public,155030,€51.10B,"€118,733","['spark', 'r', 'python', 'sql', 'machine learn..."
2,data scientist,senior,on-site,company_007,"Austin, TX . Toronto, Ontario, Canada . Kirkla...",a month ago,"Redwood City, CA, US",Technology,Public,25930,€33.80B,"€94,987 - €159,559","['aws', 'git', 'python', 'docker', 'sql', 'mac..."
3,data scientist,senior,hybrid,company_008,"Chicago, IL . Scottsdale, AZ . Austin, TX . Hy...",8 days ago,"San Jose, CA, US",Technology,Public,34690,€81.71B,"€112,797 - €194,402","['sql', 'r', 'python']"
4,data scientist,,on-site,company_009,On-site,3 days ago,"Stamford, CT, US",Finance,Private,1800,Private,"€114,172 - €228,337",[]


In [20]:
# Now we are gaining information about the dataSet
print("\nInfo About DataSet")
df.info()


Info About DataSet
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944 entries, 0 to 943
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        941 non-null    object
 1   seniority_level  884 non-null    object
 2   status           688 non-null    object
 3   company          944 non-null    object
 4   location         942 non-null    object
 5   post_date        944 non-null    object
 6   headquarter      944 non-null    object
 7   industry         944 non-null    object
 8   ownership        897 non-null    object
 9   company_size     944 non-null    object
 10  revenue          929 non-null    object
 11  salary           944 non-null    object
 12  skills           944 non-null    object
dtypes: object(13)
memory usage: 96.0+ KB


In [21]:
# Now we can check order like rows and column
print("rows,columns:",df.shape)

rows,columns: (944, 13)


In [5]:
df.describe(include='all')

Unnamed: 0,job_title,seniority_level,status,company,location,post_date,headquarter,industry,ownership,company_size,revenue,salary,skills
count,941,884,688,944,942,944,944,944,897,944,929,944,944
unique,4,4,3,420,431,42,197,8,2,510,312,896,400
top,data scientist,senior,on-site,company_134,"Bengaluru, Karnataka, India",a month ago,"San Francisco, CA, US",Technology,Public,900,Private,"€25,214",[]
freq,856,630,363,30,52,167,91,582,579,18,247,3,201


In [22]:
#NOW start , Edit missing values

print("Missing values per row:",df.isnull().sum())


Missing values per row: job_title            3
seniority_level     60
status             256
company              0
location             2
post_date            0
headquarter          0
industry             0
ownership           47
company_size         0
revenue             15
salary               0
skills               0
dtype: int64


In [23]:
# Now we can check duplication of data like rows and column
print("Duplicated rows:",df.duplicated().sum())

Duplicated rows: 0


In [24]:
# df.dropna()

In [16]:
# df.shape

(599, 13)

In [18]:
# df.duplicated().sum()

# It is not a valid possible solution because our too much data is to be lost so we replace missing values using fillna()

np.int64(0)

In [27]:
# 1. Check again to confirm missing values
df.isnull().sum()

# 2. Fill categorical columns with mode (most frequent value)
df['job_title'].fillna(df['job_title'].mode()[0], inplace=True)
df['seniority_level'].fillna(df['seniority_level'].mode()[0], inplace=True)
df['status'].fillna(df['status'].mode()[0], inplace=True)
df['location'].fillna(df['location'].mode()[0], inplace=True)
df['ownership'].fillna(df['ownership'].mode()[0], inplace=True)

# 3. Fill numeric or mixed columns with a suitable placeholder or mean if numeric
df['revenue'].fillna(df['revenue'].mode()[0], inplace=True)

# 4. Verify again if all missing values are handled
# print("Missing values per row after cleaning:")
print(df.isnull().sum())


job_title          0
seniority_level    0
status             0
company            0
location           0
post_date          0
headquarter        0
industry           0
ownership          0
company_size       0
revenue            0
salary             0
skills             0
dtype: int64


In [29]:
# 1. Remove leading/trailing spaces (applymap → map fix)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)

# 2. Convert text columns to lowercase
text_columns = ['job_title', 'seniority_level', 'status', 'company', 
                'location', 'headquarter', 'industry', 'ownership', 'skills']
for col in text_columns:
    df[col] = df[col].astype(str).str.lower()

# 3. Fix date format explicitly to avoid warning
df['post_date'] = pd.to_datetime(df['post_date'], format='%Y-%m-%d', errors='coerce')

# 4. Clean non-standard characters
df['company_size'] = df['company_size'].str.replace(r'[^0-9\-+]', '', regex=True)
df['revenue'] = df['revenue'].str.replace(r'[^0-9a-zA-Z\-\s]', '', regex=True)

# 5. Preview
df.head()



Unnamed: 0,job_title,seniority_level,status,company,location,post_date,headquarter,industry,ownership,company_size,revenue,salary,skills
0,data scientist,senior,hybrid,company_003,"grapevine, tx . hybrid",NaT,"bentonville, ar, us",retail,public,35244,Public,"€100,472 - €200,938","['spark', 'r', 'python', 'scala', 'machine lea..."
1,data scientist,lead,hybrid,company_005,"fort worth, tx . hybrid",NaT,"detroit, mi, us",manufacturing,public,155030,5110B,"€118,733","['spark', 'r', 'python', 'sql', 'machine learn..."
2,data scientist,senior,on-site,company_007,"austin, tx . toronto, ontario, canada . kirkla...",NaT,"redwood city, ca, us",technology,public,25930,3380B,"€94,987 - €159,559","['aws', 'git', 'python', 'docker', 'sql', 'mac..."
3,data scientist,senior,hybrid,company_008,"chicago, il . scottsdale, az . austin, tx . hy...",NaT,"san jose, ca, us",technology,public,34690,8171B,"€112,797 - €194,402","['sql', 'r', 'python']"
4,data scientist,senior,on-site,company_009,on-site,NaT,"stamford, ct, us",finance,private,1800,Private,"€114,172 - €228,337",[]


In [31]:
df

Unnamed: 0,job_title,seniority_level,status,company,location,post_date,headquarter,industry,ownership,company_size,revenue,salary,skills
0,data scientist,senior,hybrid,company_003,"Grapevine, TX . Hybrid",17 days ago,"Bentonville, AR, US",Retail,Public,€352.44B,Public,"€100,472 - €200,938","['spark', 'r', 'python', 'scala', 'machine lea..."
1,data scientist,lead,hybrid,company_005,"Fort Worth, TX . Hybrid",15 days ago,"Detroit, MI, US",Manufacturing,Public,155030,€51.10B,"€118,733","['spark', 'r', 'python', 'sql', 'machine learn..."
2,data scientist,senior,on-site,company_007,"Austin, TX . Toronto, Ontario, Canada . Kirkla...",a month ago,"Redwood City, CA, US",Technology,Public,25930,€33.80B,"€94,987 - €159,559","['aws', 'git', 'python', 'docker', 'sql', 'mac..."
3,data scientist,senior,hybrid,company_008,"Chicago, IL . Scottsdale, AZ . Austin, TX . Hy...",8 days ago,"San Jose, CA, US",Technology,Public,34690,€81.71B,"€112,797 - €194,402","['sql', 'r', 'python']"
4,data scientist,,on-site,company_009,On-site,3 days ago,"Stamford, CT, US",Finance,Private,1800,Private,"€114,172 - €228,337",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,data scientist,senior,,company_171,"Bengaluru, Karnataka, India",a day ago,"Armonk, NY, US",Technology,Public,524598,€120.29B,"€33,288 - €53,080","['pytorch', 'python', 'sql', 'machine learning..."
940,machine learning engineer,senior,,company_134,"Melbourne, Victoria, Australia",a month ago,"Seattle, WA, US",Retail,Public,865456,€838.78B,"€64,290","['amazon', 'machine learning']"
941,data scientist,midlevel,on-site,company_395,"McLean, VA",11 days ago,"McLean, VA, US",Retail,Public,55150,€36.29B,"€145,904 - €166,510","['spark', 'aws', 'r', 'python', 'scala', 'sql'..."
942,data scientist,midlevel,on-site,company_395,"New York, NY",17 days ago,"McLean, VA, US",Retail,Public,55110,€36.34B,"€159,149 - €181,595","['spark', 'aws', 'r', 'python', 'scala', 'sql'..."


In [32]:
df.to_csv('cleaned_data.csv', index=False)
