# **Exploratory Data Analysis (EDA)**

## **Step-1: *Importing Libraries***

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## **Step-2: *Loading Data***

In [6]:
# Read the job-postings CSV into a DataFrame and preview the first five rows.
# NOTE(review): this is an absolute path on the author's machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
csv_path = 'C:/Users/lenovo/Desktop/Data-Scientist/Projects/4-Job-Market-Trends-India/Data/synthetic_jobs.csv'
df = pd.read_csv(csv_path)
preview = df.head(5)
print(preview)

                   Job Title   Company        Location  \
0             Data Scientist    Amazon   Mumbai, India   
1             Data Scientist    Google  Chennai, India   
2             Data Scientist  Flipkart  Chennai, India   
3  Machine Learning Engineer   Infosys     Pune, India   
4  Machine Learning Engineer  Deloitte     Pune, India   

                            Skills Experience Required          Salary  \
0                Tableau, Excel, R            6+ years  ₹10L per annum   
1    Data Wrangling, Pandas, NumPy            6+ years  ₹17L per annum   
2  Machine Learning, Deep Learning            9+ years   ₹9L per annum   
3  Machine Learning, Deep Learning            4+ years  ₹19L per annum   
4            Python, SQL, Power BI            3+ years   ₹6L per annum   

          Date Posted  
0   Posted 9 days ago  
1  Posted 13 days ago  
2   Posted 7 days ago  
3   Posted 5 days ago  
4   Posted 9 days ago  


***Basic Data Examination***

In [15]:
# Structural overview of the loaded frame: size, column names,
# first/last rows, and the per-column dtype / non-null report.
print(f'Dataset Shape: {df.shape}\n')
print(f'Columns: {list(df.columns)}\n')
print(f'First 5 Rows:\n{df.head()}\n')
print(f'Last 5 Rows:\n{df.tail()}\n')
# df.info() writes its report directly to stdout and returns None, so it must
# be called on its own; interpolating it into an f-string prints "Data Info: None".
print('Data Info:')
df.info()

Dataset Shape: (50, 7)

Columns: ['Job Title', 'Company', 'Location', 'Skills', 'Experience Required', 'Salary', 'Date Posted']

Top 5 Rows:                    Job Title   Company        Location  \
0             Data Scientist    Amazon   Mumbai, India   
1             Data Scientist    Google  Chennai, India   
2             Data Scientist  Flipkart  Chennai, India   
3  Machine Learning Engineer   Infosys     Pune, India   
4  Machine Learning Engineer  Deloitte     Pune, India   

                            Skills Experience Required          Salary  \
0                Tableau, Excel, R            6+ years  ₹10L per annum   
1    Data Wrangling, Pandas, NumPy            6+ years  ₹17L per annum   
2  Machine Learning, Deep Learning            9+ years   ₹9L per annum   
3  Machine Learning, Deep Learning            4+ years  ₹19L per annum   
4            Python, SQL, Power BI            3+ years   ₹6L per annum   

          Date Posted  
0   Posted 9 days ago  
1  Posted 13 days

## **Step-3: *Data Cleaning***

***Checking Missing Values:***

In [23]:
# Count the null entries in each column
missing_per_column = df.isna().sum()
print(f'Missing Values: \n{missing_per_column}')

Missing Values: 
Job Title              0
Company                0
Location               0
Skills                 0
Experience Required    0
Salary                 0
Date Posted            0
dtype: int64


***Data Cleaning & Transformation***

In [26]:
# Preview the first five rows before transforming any columns
df.head()

Unnamed: 0,Job Title,Company,Location,Skills,Experience Required,Salary,Date Posted
0,Data Scientist,Amazon,"Mumbai, India","Tableau, Excel, R",6+ years,₹10L per annum,Posted 9 days ago
1,Data Scientist,Google,"Chennai, India","Data Wrangling, Pandas, NumPy",6+ years,₹17L per annum,Posted 13 days ago
2,Data Scientist,Flipkart,"Chennai, India","Machine Learning, Deep Learning",9+ years,₹9L per annum,Posted 7 days ago
3,Machine Learning Engineer,Infosys,"Pune, India","Machine Learning, Deep Learning",4+ years,₹19L per annum,Posted 5 days ago
4,Machine Learning Engineer,Deloitte,"Pune, India","Python, SQL, Power BI",3+ years,₹6L per annum,Posted 9 days ago


In [34]:
# Inspect the dtype pandas assigned to each column
column_dtypes = df.dtypes
print(column_dtypes)

Job Title              object
Company                object
Location               object
Skills                 object
Experience Required     int64
Salary                 object
Date Posted            object
dtype: object


***Standardize Column Formats***

In [None]:
# Convert 'Experience Required' from strings such as '6+ years' to integer years
# by keeping the text before the '+'.
# The column is cast to str first so the cell is safe to re-run: after the first
# run the column is already int64, and the .str accessor would otherwise raise
# AttributeError on non-string values.
experience_years = (
    df['Experience Required']
    .astype(str)
    .str.split('+')
    .str[0]
    .astype(int)
)
df['Experience Required'] = experience_years
print(df['Experience Required'])

0      6
1      6
2      9
3      4
4      3
5      1
6      5
7      3
8      1
9      9
10     9
11     1
12     6
13     2
14     3
15     5
16     6
17     2
18     5
19     5
20     1
21     8
22     1
23     1
24     8
25     1
26     6
27     3
28     7
29     9
30    10
31    10
32     9
33     1
34     5
35     8
36     9
37    10
38     9
39     7
40     4
41     5
42     2
43     5
44     6
45     7
46    10
47    10
48     2
49     5
Name: Experience Required, dtype: int64


***Basic Statistics:***

In [25]:
# Summary statistics across all columns (include='all' covers non-numeric ones too)
summary_stats = df.describe(include='all')
print("\nSummary Statistics:\n", summary_stats)


Summary Statistics:
              Job Title Company        Location                         Skills  \
count               50      50              50                             50   
unique               4       7               6                              4   
top     Data Scientist  Google  Chennai, India  Data Wrangling, Pandas, NumPy   
freq                14       9              10                             14   

       Experience Required          Salary        Date Posted  
count                   50              50                 50  
unique                  10              15                 14  
top               1+ years  ₹20L per annum  Posted 9 days ago  
freq                     8              10                  6  
