**Task:**- Generated a Synthetic Dataset with columns of age, salary, department, years_experience, and is_manager.

In [2]:
import numpy as np
import pandas as pd

In [3]:
np.random.seed(42) # For reproducibility
n_samples = 100 # Number of samples

data = {
    'age': np.random.randint(18, 60,size=n_samples),
    'salary': np.random.randint(30000, 120000, size=n_samples),
    'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing'], size=n_samples),
    'years_experience': np.round(np.random.normal(5, 2, size=n_samples), 1),
    'is_manager': np.random.choice([0, 1], size=n_samples)
}
df = pd.DataFrame(data)

Q1. View data structure

In [4]:
print(df.head())
print(df.tail())
print(df.shape)
print(df.columns)

   age  salary department  years_experience  is_manager
0   56   38392         IT              -0.8           0
1   46   60535  Marketing               3.4           1
2   32  108603         HR               5.0           1
3   25   82256         HR               4.2           1
4   38  119135         HR               4.1           1
    age  salary department  years_experience  is_manager
95   59   82662         IT               4.0           0
96   56   42688         HR               4.4           1
97   58   55342  Marketing               6.0           0
98   45   67157         HR              11.4           0
99   24   97863         HR               5.2           1
(100, 5)
Index(['age', 'salary', 'department', 'years_experience', 'is_manager'], dtype='object')
age                   int64
salary                int64
department           object
years_experience    float64
is_manager            int64
dtype: object


Q2. Get DataFrame Info and Summary Stats

In [5]:
df.info()
print(df.describe())
print(df.describe(include='object'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               100 non-null    int64  
 1   salary            100 non-null    int64  
 2   department        100 non-null    object 
 3   years_experience  100 non-null    float64
 4   is_manager        100 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.0+ KB
              age         salary  years_experience  is_manager
count  100.000000     100.000000        100.000000  100.000000
mean    37.910000   77809.160000          4.823000    0.470000
std     12.219454   26058.643576          2.237822    0.501614
min     18.000000   30206.000000         -0.800000    0.000000
25%     26.750000   55141.000000          3.475000    0.000000
50%     38.000000   80932.000000          4.700000    0.000000
75%     46.250000   98107.250000          6.000000    1.000000
ma

Q3. Do Simple Numpy Operations

In [6]:
# Mean and median of salary
mean_salary = np.mean(df['salary'])
median_salary = np.median(df['salary'])
print("Mean Salary:", mean_salary)
print("Median Salary:", median_salary)

# Maximum and minimum of age
print("Max Age:", np.max(df['age']))
print("Min Age:", np.min(df['age']))

# Standard deviation of years of experience
print("Std of Years Experience:", np.std(df['years_experience']))

Mean Salary: 77809.16
Median Salary: 80932.0
Max Age: 59
Min Age: 18
Std of Years Experience: 2.2266052636244265


Q4. Filtering and Indexing Rows

In [7]:
# Filter employees with salary > 80,000
high_salary = df[df['salary'] > 80000]
print(high_salary.head())

# Employees in IT department with more than 5 years of experience
experienced_it = df[(df['department'] == 'IT') & (df['years_experience'] > 5)]
print(experienced_it.head())

# Select specific columns
print(df[['age', 'salary']].head())


   age  salary department  years_experience  is_manager
2   32  108603         HR               5.0           1
3   25   82256         HR               4.2           1
4   38  119135         HR               4.1           1
6   36  107373    Finance               6.7           1
7   40  109575  Marketing               6.9           0
    age  salary department  years_experience  is_manager
10   41   40965         IT               9.5           1
15   39   57266         IT               5.8           1
31   20   97214         IT               5.7           1
41   26   79080         IT               7.8           0
53   21   74262         IT               5.4           0
   age  salary
0   56   38392
1   46   60535
2   32  108603
3   25   82256
4   38  119135


Q5. Adding a Column

In [8]:
df['performance_score'] = np.random.randint(1, 11, size=n_samples)
df['salary_in_lakhs'] = np.round(df['salary'] / 100000, 2)
print(df.head())


   age  salary department  years_experience  is_manager  performance_score  \
0   56   38392         IT              -0.8           0                  4   
1   46   60535  Marketing               3.4           1                  9   
2   32  108603         HR               5.0           1                  6   
3   25   82256         HR               4.2           1                  3   
4   38  119135         HR               4.1           1                  1   

   salary_in_lakhs  
0             0.38  
1             0.61  
2             1.09  
3             0.82  
4             1.19  


Q6. Grouping and Aggregation

In [10]:
# Group by department and get mean salary
dept_salary = df.groupby('department')['salary'].mean()
print(dept_salary)


# Average salary for managers vs non-managers
manager_salary = df.groupby('is_manager')['salary'].mean()
print(manager_salary)


department
Finance      83124.708333
HR           73523.052632
IT           75825.476190
Marketing    77684.722222
Name: salary, dtype: float64
is_manager
0    76500.867925
1    79284.468085
Name: salary, dtype: float64
