In [2]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
data={
    'Name':['Alice','Bob','Charlie','David','Eve'],
    'Age':[25,30,35,40,45],
    'Salary':[50000,60000,70000,80000,90000],
    'Department':['HR','IT','Finance','HR','IT'],
    'Start_Date':pd.to_datetime(['2020-01-01','2019-03-15','2021-05-20','2018-09-10','2022-02-28']),
    'Experience':[5,10,3,15,2],
    'Rating':[4.2,3.8,4.5,4.0,4.7]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating
0,Alice,25,50000,HR,2020-01-01,5,4.2
1,Bob,30,60000,IT,2019-03-15,10,3.8
2,Charlie,35,70000,Finance,2021-05-20,3,4.5
3,David,40,80000,HR,2018-09-10,15,4.0
4,Eve,45,90000,IT,2022-02-28,2,4.7


In [4]:
# Q.1 Selecting a subset of dataframe 
# let's select employee who are older than 30

older_age=df[df["Age"]>30]
older_age

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating
2,Charlie,35,70000,Finance,2021-05-20,3,4.5
3,David,40,80000,HR,2018-09-10,15,4.0
4,Eve,45,90000,IT,2022-02-28,2,4.7


In [5]:
'''Q.2 Creating new column derived from existing column
    lets create a new column called "Age_Group" based on age of employees.'''
def age_group(age):
    if age<30:
        return "Young People"
    elif age>30 and age<40:
        return "Uncle type People"
    else:
        return "Old people"



In [6]:
df["Age_Group"]=df["Age"].apply(age_group)
df

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People
3,David,40,80000,HR,2018-09-10,15,4.0,Old people
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people


In [7]:
'''Q.3 Calculating Summary Statistics
    let's calculate summary statistics for numerical columns in the Dataframe'''
df.describe()

Unnamed: 0,Age,Salary,Start_Date,Experience,Rating
count,5.0,5.0,5,5.0,5.0
mean,35.0,70000.0,2020-04-14 19:12:00,7.0,4.24
min,25.0,50000.0,2018-09-10 00:00:00,2.0,3.8
25%,30.0,60000.0,2019-03-15 00:00:00,3.0,4.0
50%,35.0,70000.0,2020-01-01 00:00:00,5.0,4.2
75%,40.0,80000.0,2021-05-20 00:00:00,10.0,4.5
max,45.0,90000.0,2022-02-28 00:00:00,15.0,4.7
std,7.905694,15811.388301,,5.43139,0.364692


In [8]:
'''Q.4 Reshapig the Layout of Tables
    Let"s reshape the Dataframe to have "name" as index and"Department" as columns, with 
    "Salary" as values'''

reshape_data=df.melt(id_vars=["Name"],
                     var_name="Department",value_name="Salary")
reshape_data

ValueError: value_name (Salary) cannot match an element in the DataFrame columns.

In [11]:
'''Q.5 combining Data from multiple tables
    let"s create another dataframe with bonus information and merge it with original dataframe '''
bonus_data={
    'Name':['Alice','Bob','Charlie','David','Eve'],
    'Bonus':[1500,3000,45000,6000,7500]
}
df_bonus_data=pd.DataFrame(bonus_data)
df_bonus_data

Unnamed: 0,Name,Bonus
0,Alice,1500
1,Bob,3000
2,Charlie,45000
3,David,6000
4,Eve,7500


In [12]:
merged_data=pd.merge(df,df_bonus_data,on="Name")
merged_data

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,Bonus
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,1500
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3000
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,45000
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,6000
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,7500


In [19]:
"""Q.6 Manipulating textual data
    Let's create a new column based on length of employee's name:"""
df["lenght_of_name"]=(df["Name"]).str.len()
df

# As I stuck so I get the reference from google,not chatGpt. Sorry for that.

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,5
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,7
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,5
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3


In [20]:
"""Q.7 Filtering the data based on multiple conditions
    Let's filter the dataframe to include only employees from IT department who are older than 30"""
filter_data=df[(df["Department"]=="IT")&(df["Age"]>30)]
filter_data

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3


In [21]:
"""Q.8 creating a new column based on condition
    Let's create a new column called 'Performance' based on employeed rating"""

def performance_data(rating):
    if rating<4:
        return "Bad"
    elif rating>=4.5:
        return "Good"
    else:
        return "Excellent"

In [22]:
df["Performance"]=df["Rating"].apply(performance_data)
df

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name,Performance
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,5,Excellent
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3,Bad
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,7,Good
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,5,Excellent
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3,Good


In [25]:
"""Q.9 Calculate group wise summary Statistics
    Let's calculate mean salary and experience for each department"""

group_stats=df.groupby("Department").agg({"Salary":"mean",
                                         "Experience":"mean"})
group_stats

Unnamed: 0_level_0,Salary,Experience
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,70000.0,3.0
HR,65000.0,10.0
IT,75000.0,6.0


In [30]:
"""Q.10 Sorting Data
    Let's sort dataframe by age in descending order"""
sort_data=df.sort_values(by="Age",ascending=False)
sort_data

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name,Performance
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3,Good
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,5,Excellent
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,7,Good
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3,Bad
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,5,Excellent


In [32]:
"""Q.11 Concatenating Dataframe
    Let's create a new dataframe with additional employees information and concatenate it with original dataframe"""

concat_data={
     'Name':['Papya','Gotya'],
    'Age':[30,33],
    'Salary':[25000,37000],
    'Department':['IT','Finance'],
    'Start_Date':pd.to_datetime(['2019-03-15','2021-05-20']),
    'Experience':[11,13],
    'Rating':[4.9,5.0]
}
df_concat_data=pd.DataFrame(concat_data)
df_concat_data

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating
0,Papya,30,25000,IT,2019-03-15,11,4.9
1,Gotya,33,37000,Finance,2021-05-20,13,5.0


In [48]:
additional_data=pd.concat([df,df_concat_data],ignore_index=True)
additional_data

Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name,Performance
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,5.0,Excellent
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3.0,Bad
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,7.0,Good
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,5.0,Excellent
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3.0,Good
5,Papya,30,25000,IT,2019-03-15,11,4.9,70000,,70000
6,Gotya,33,70000,Finance,2021-05-20,13,5.0,70000,,70000
7,70000,70000,70000,70000,NaT,70000,70000.0,70000,,70000


In [49]:
"""Q.12 Handling missing data
    let's introduce some missing data and fill it with mean salary"""
additional_data.loc[2,"Salary"]=np.nan
additional_data

# I got the wrong output as u can see so I have a doubt in this question.



Unnamed: 0,Name,Age,Salary,Department,Start_Date,Experience,Rating,Age_Group,lenght_of_name,Performance
0,Alice,25,50000,HR,2020-01-01,5,4.2,Young People,5.0,Excellent
1,Bob,30,60000,IT,2019-03-15,10,3.8,Old people,3.0,Bad
2,Charlie,35,70000,Finance,2021-05-20,3,4.5,Uncle type People,7.0,Good
3,David,40,80000,HR,2018-09-10,15,4.0,Old people,5.0,Excellent
4,Eve,45,90000,IT,2022-02-28,2,4.7,Old people,3.0,Good
5,Papya,30,25000,IT,2019-03-15,11,4.9,70000,,70000
6,Gotya,33,70000,Finance,2021-05-20,13,5.0,70000,,70000
7,70000,70000,70000,70000,NaT,70000,70000.0,70000,,70000
