# **Exploratory Data Analysis (EDA)**

## **Step-1: *Importing Libraries***

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## **Step-2: *Loading Data***

In [59]:
df = pd.read_csv('C:/Users/lenovo/Desktop/Data-Scientist/Projects/4-Job-Market-Trends-India/Data/synthetic_jobs.csv')
print(df.head(5))

                   Job Title   Company        Location  \
0             Data Scientist    Amazon   Mumbai, India   
1             Data Scientist    Google  Chennai, India   
2             Data Scientist  Flipkart  Chennai, India   
3  Machine Learning Engineer   Infosys     Pune, India   
4  Machine Learning Engineer  Deloitte     Pune, India   

                            Skills Experience Required          Salary  \
0                Tableau, Excel, R            6+ years  ₹10L per annum   
1    Data Wrangling, Pandas, NumPy            6+ years  ₹17L per annum   
2  Machine Learning, Deep Learning            9+ years   ₹9L per annum   
3  Machine Learning, Deep Learning            4+ years  ₹19L per annum   
4            Python, SQL, Power BI            3+ years   ₹6L per annum   

          Date Posted  
0   Posted 9 days ago  
1  Posted 13 days ago  
2   Posted 7 days ago  
3   Posted 5 days ago  
4   Posted 9 days ago  


***Basic data examination***

In [60]:
print(f'Dataset Shape: {df.shape}\n')                       
print(f'Columns: {list(df.columns)}\n')
print(f'Top 5 Rows: {df.head()}, {df.tail()} \n')
print(f'Data Info: {df.info()}\n')

Dataset Shape: (50, 7)

Columns: ['Job Title', 'Company', 'Location', 'Skills', 'Experience Required', 'Salary', 'Date Posted']

Top 5 Rows:                    Job Title   Company        Location  \
0             Data Scientist    Amazon   Mumbai, India   
1             Data Scientist    Google  Chennai, India   
2             Data Scientist  Flipkart  Chennai, India   
3  Machine Learning Engineer   Infosys     Pune, India   
4  Machine Learning Engineer  Deloitte     Pune, India   

                            Skills Experience Required          Salary  \
0                Tableau, Excel, R            6+ years  ₹10L per annum   
1    Data Wrangling, Pandas, NumPy            6+ years  ₹17L per annum   
2  Machine Learning, Deep Learning            9+ years   ₹9L per annum   
3  Machine Learning, Deep Learning            4+ years  ₹19L per annum   
4            Python, SQL, Power BI            3+ years   ₹6L per annum   

          Date Posted  
0   Posted 9 days ago  
1  Posted 13 days

## **Step-3: *Data Cleaning***

In [61]:
df.head(3)

Unnamed: 0,Job Title,Company,Location,Skills,Experience Required,Salary,Date Posted
0,Data Scientist,Amazon,"Mumbai, India","Tableau, Excel, R",6+ years,₹10L per annum,Posted 9 days ago
1,Data Scientist,Google,"Chennai, India","Data Wrangling, Pandas, NumPy",6+ years,₹17L per annum,Posted 13 days ago
2,Data Scientist,Flipkart,"Chennai, India","Machine Learning, Deep Learning",9+ years,₹9L per annum,Posted 7 days ago


> ***Changing Column Name:***

In [62]:
df = df.rename(columns = {'Experience Required': 'Experience'})

> ***Checking missing Values:***

In [63]:
# Check for missing values
print(f'Missing Values: \n{df.isnull().sum()}')

Missing Values: 
Job Title      0
Company        0
Location       0
Skills         0
Experience     0
Salary         0
Date Posted    0
dtype: int64


> ***Data Cleaning & Transformation***

In [64]:
df.head(5)

Unnamed: 0,Job Title,Company,Location,Skills,Experience,Salary,Date Posted
0,Data Scientist,Amazon,"Mumbai, India","Tableau, Excel, R",6+ years,₹10L per annum,Posted 9 days ago
1,Data Scientist,Google,"Chennai, India","Data Wrangling, Pandas, NumPy",6+ years,₹17L per annum,Posted 13 days ago
2,Data Scientist,Flipkart,"Chennai, India","Machine Learning, Deep Learning",9+ years,₹9L per annum,Posted 7 days ago
3,Machine Learning Engineer,Infosys,"Pune, India","Machine Learning, Deep Learning",4+ years,₹19L per annum,Posted 5 days ago
4,Machine Learning Engineer,Deloitte,"Pune, India","Python, SQL, Power BI",3+ years,₹6L per annum,Posted 9 days ago


In [71]:
# Checking Datatypes of columns
print(df.dtypes)

Job Title      object
Company        object
Location       object
Skills         object
Experience     object
Salary          int64
Date Posted    object
dtype: object


> ***Cleaning Data***

In [66]:
# Location: Removing country from column
df['Location'] = df['Location'].str.split(',').str[0].astype(str)
df['Location']

0        Mumbai
1       Chennai
2       Chennai
3          Pune
4          Pune
5       Chennai
6         Delhi
7        Mumbai
8     Hyderabad
9     Bangalore
10    Hyderabad
11         Pune
12    Hyderabad
13        Delhi
14        Delhi
15         Pune
16      Chennai
17      Chennai
18    Bangalore
19       Mumbai
20        Delhi
21    Bangalore
22       Mumbai
23      Chennai
24      Chennai
25       Mumbai
26        Delhi
27    Bangalore
28        Delhi
29       Mumbai
30        Delhi
31         Pune
32    Hyderabad
33         Pune
34        Delhi
35        Delhi
36    Hyderabad
37       Mumbai
38    Bangalore
39      Chennai
40      Chennai
41    Hyderabad
42    Hyderabad
43        Delhi
44         Pune
45    Bangalore
46      Chennai
47    Bangalore
48    Bangalore
49         Pune
Name: Location, dtype: object

In [None]:
# Cleaning 'Experience' Column:
df['Experience'] = df['Experience'].str.extract('(\d+)')
df['Experience']

# str.extract('(\d+)'): This uses a regular expression \d+ to find one or more digits. The parentheses () create a capturing group, so only the digits are extracted.

# .astype(int): Converts the extracted string to an integer. This is essential for numerical analysis or comparisons.


# Converting 'Experience Required' column to int and removing '+ years'
# df['Experience Required'] = df['Experience Required'].str.split('+').str[0].astype(int)

  df['Experience'] = df['Experience'].str.extract('(\d+)')


0      6
1      6
2      9
3      4
4      3
5      1
6      5
7      3
8      1
9      9
10     9
11     1
12     6
13     2
14     3
15     5
16     6
17     2
18     5
19     5
20     1
21     8
22     1
23     1
24     8
25     1
26     6
27     3
28     7
29     9
30    10
31    10
32     9
33     1
34     5
35     8
36     9
37    10
38     9
39     7
40     4
41     5
42     2
43     5
44     6
45     7
46    10
47    10
48     2
49     5
Name: Experience, dtype: object

In [70]:
# Cleaning Salary column:
df['Salary'] = df['Salary'].str.extract('(\d)').astype(int)
df['Salary']

  df['Salary'] = df['Salary'].str.extract('(\d)').astype(int)


0     1
1     1
2     9
3     1
4     6
5     2
6     1
7     1
8     1
9     1
10    6
11    2
12    1
13    1
14    2
15    1
16    8
17    2
18    1
19    9
20    1
21    1
22    7
23    2
24    1
25    7
26    1
27    2
28    1
29    1
30    1
31    2
32    1
33    1
34    1
35    6
36    1
37    6
38    1
39    2
40    1
41    1
42    2
43    8
44    1
45    1
46    1
47    2
48    1
49    1
Name: Salary, dtype: int64

> ***Standardize Text Formatting***

In [74]:
df["Job Title"] = df["Job Title"].str.title()
df["Company"] = df["Company"].str.title()
df["Skills"] = df["Skills"].str.title()

## **Step-4: *Final Cleaned Dataset***

In [75]:
print(df.head())
df.to_csv("../data/cleaned_jobs.csv", index=False)
print("Data Cleaning Complete! Saved as 'cleaned_jobs.csv'. \n✅")

                   Job Title   Company Location  \
0             Data Scientist    Amazon   Mumbai   
1             Data Scientist    Google  Chennai   
2             Data Scientist  Flipkart  Chennai   
3  Machine Learning Engineer   Infosys     Pune   
4  Machine Learning Engineer  Deloitte     Pune   

                            Skills Experience  Salary         Date Posted  
0                Tableau, Excel, R          6       1   Posted 9 days ago  
1    Data Wrangling, Pandas, Numpy          6       1  Posted 13 days ago  
2  Machine Learning, Deep Learning          9       9   Posted 7 days ago  
3  Machine Learning, Deep Learning          4       1   Posted 5 days ago  
4            Python, Sql, Power Bi          3       6   Posted 9 days ago  
Data Cleaning Complete! Saved as 'cleaned_jobs.csv'. 
✅


***Basic Statistics:***

In [None]:
# Basic Statistics
print("\nSummary Statistics:\n", df.describe(include='all'))


Summary Statistics:
              Job Title Company Location                         Skills  \
count               50      50       50                             50   
unique               4       7        6                              4   
top     Data Scientist  Google  Chennai  Data Wrangling, Pandas, Numpy   
freq                14       9       10                             14   
mean               NaN     NaN      NaN                            NaN   
std                NaN     NaN      NaN                            NaN   
min                NaN     NaN      NaN                            NaN   
25%                NaN     NaN      NaN                            NaN   
50%                NaN     NaN      NaN                            NaN   
75%                NaN     NaN      NaN                            NaN   
max                NaN     NaN      NaN                            NaN   

        Experience Required          Salary        Date Posted  
count             50.000