In [2]:
import pandas as pd
import numpy as np
#Creating a dummy dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Department': ['HR', 'IT"', 'Finance', 'HR', 'IT'],
    'Start_Date': pd.to_datetime(['2020-01-01', '2019-03-15', '2021-05-20', '2018-09-10', '2022-02-28']),
    'Experience': [5, 10, 3, 15, 2],
    'Rating': [4.2, 3.8, 4.5, 4.0, 4.7]
}
df= pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating
0,Alice,25,50000,HR,2020-01-01,5,4.2
1,Bob,30,60000,"IT""",2019-03-15,10,3.8
2,Charlie,35,70000,Finance,2021-05-20,3,4.5
3,David,40,80000,HR,2018-09-10,15,4.0
4,Eve,45,90000,IT,2022-02-28,2,4.7


In [3]:
#Question 1: employees who are older than 30:

subset_df = df[df['Age'] > 30]
print(subset_df)

      Name  Age  Salary Department Start_Date  Experience  Rating
2  Charlie   35   70000    Finance 2021-05-20           3     4.5
3    David   40   80000         HR 2018-09-10          15     4.0
4      Eve   45   90000         IT 2022-02-28           2     4.7


In [5]:
#Question 3: Calculating Summary Statistics
#Let's calculate summary statistics for the numerical columns in the DataFrame

summary_stats = df.describe()
print(summary_stats)


             Age        Salary           Start_Date  Experience    Rating
count   5.000000      5.000000                    5     5.00000  5.000000
mean   35.000000  70000.000000  2020-04-14 19:12:00     7.00000  4.240000
min    25.000000  50000.000000  2018-09-10 00:00:00     2.00000  3.800000
25%    30.000000  60000.000000  2019-03-15 00:00:00     3.00000  4.000000
50%    35.000000  70000.000000  2020-01-01 00:00:00     5.00000  4.200000
75%    40.000000  80000.000000  2021-05-20 00:00:00    10.00000  4.500000
max    45.000000  90000.000000  2022-02-28 00:00:00    15.00000  4.700000
std     7.905694  15811.388301                  NaN     5.43139  0.364692


In [6]:
#Question 4: Reshaping the Layout of Tables
#let's reshape the DataFrame to have "Name" as the index and "Department" as columns, witl"Salary" as values:

reshape_df = df.pivot_table(index='Name', columns='Department', values='Salary', aggfunc='first')
print(reshape_df)

Department  Finance       HR       IT      IT"
Name                                          
Alice           NaN  50000.0      NaN      NaN
Bob             NaN      NaN      NaN  60000.0
Charlie     70000.0      NaN      NaN      NaN
David           NaN  80000.0      NaN      NaN
Eve             NaN      NaN  90000.0      NaN


In [7]:
#Question 5: Combining Data from Multiple Tables
#Let's create another DataFrame with bonus information and merge it with the original DataFrame:

bonus_info = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Bonus': [2000, 3000, 2500, 3500, 4000]
}

bonus_df = pd.DataFrame(bonus_info)
merge_df = pd.merge(df, bonus_df, on='Name')
print(merge_df)


      Name  Age  Salary Department Start_Date  Experience  Rating Age_Group  \
0    Alice   25   50000         HR 2020-01-01           5     4.2       <30   
1      Bob   30   60000        IT" 2019-03-15          10     3.8     30-39   
2  Charlie   35   70000    Finance 2021-05-20           3     4.5     30-39   
3    David   40   80000         HR 2018-09-10          15     4.0     40-49   
4      Eve   45   90000         IT 2022-02-28           2     4.7     40-49   

   Bonus  
0   2000  
1   3000  
2   2500  
3   3500  
4   4000  


In [8]:
#Question 6: Manipulating Textual Data
#let's create a new column based on the length of the employee's name

df['Name_Length'] = df['Name'].str.len()
print(df)


      Name  Age  Salary Department Start_Date  Experience  Rating Age_Group  \
0    Alice   25   50000         HR 2020-01-01           5     4.2       <30   
1      Bob   30   60000        IT" 2019-03-15          10     3.8     30-39   
2  Charlie   35   70000    Finance 2021-05-20           3     4.5     30-39   
3    David   40   80000         HR 2018-09-10          15     4.0     40-49   
4      Eve   45   90000         IT 2022-02-28           2     4.7     40-49   

   Name_Length  
0            5  
1            3  
2            7  
3            5  
4            3  


In [9]:
#Question 7: Filtering Data Based on Multiple Conditions
#let's filter the DataFrame to include only employees from the IT department who are olde than 30:

filter_df = df[(df['Department'] == 'IT') & (df['Age'] > 30)]
print(filter_df)

  Name  Age  Salary Department Start_Date  Experience  Rating Age_Group  \
4  Eve   45   90000         IT 2022-02-28           2     4.7     40-49   

   Name_Length  
4            3  


In [10]:
#Question 9: Calculating Group-Wise Summary Statistics
#Let's calculate the mean salary and experience for each department:

group_stats = df.groupby('Department').agg({'Salary': 'mean', 'Experience': 'mean'})
print(group_stats)


             Salary  Experience
Department                     
Finance     70000.0         3.0
HR          65000.0        10.0
IT          90000.0         2.0
IT"         60000.0        10.0


In [11]:
#Question 10: Sorting Data
#Let's sort the DataFrame by age in descending order:

sort_df = df.sort_values(by='Age')
print(sort_df)

      Name  Age  Salary Department Start_Date  Experience  Rating Age_Group  \
0    Alice   25   50000         HR 2020-01-01           5     4.2       <30   
1      Bob   30   60000        IT" 2019-03-15          10     3.8     30-39   
2  Charlie   35   70000    Finance 2021-05-20           3     4.5     30-39   
3    David   40   80000         HR 2018-09-10          15     4.0     40-49   
4      Eve   45   90000         IT 2022-02-28           2     4.7     40-49   

   Name_Length  
0            5  
1            3  
2            7  
3            5  
4            3  
