In [1]:
# Pandas is a data manipulation and analysis library built in Python. 
# It provides powerful data structures like Series and DataFrame to work with structured data efficiently.
# Data can be accessed by labels using loc[] or by integer-based positions using iloc[].
# Grouping and aggregation can be done using groupby() and DataFrames can be merged using merge(), concat(), join().
# Read/write data from/to various file formats (CSV, Excel, SQL, JSON) using read_csv(), to_csv(), etc.

# Questions on Pandas

In [2]:
import pandas as pd

In [3]:
# 1. Create a Pandas DataFrame from a dictionary of lists.

data = {'Name': ['Ram', 'Rohan', 'Nikhil', 'Poonam', 'Riya'],
        'Age': [25, 30, 35, 19, 28],
        'City': ['Delhi', 'Kolkata', 'Pune', 'Gurgaon', 'Kolkata']}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Ram,25,Delhi
1,Rohan,30,Kolkata
2,Nikhil,35,Pune
3,Poonam,19,Gurgaon
4,Riya,28,Kolkata


In [4]:
# 2. Load a CSV file into a DataFrame and display the first 5 rows.

d = pd.read_csv('iris_data.csv')
d.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# 3. Add a new column to an existing DataFrame.

df['Salary'] = [50000, 60000, 70000, 45000, 56000]
df

Unnamed: 0,Name,Age,City,Salary
0,Ram,25,Delhi,50000
1,Rohan,30,Kolkata,60000
2,Nikhil,35,Pune,70000
3,Poonam,19,Gurgaon,45000
4,Riya,28,Kolkata,56000


In [6]:
# 4. Filter rows where the age is greater than 25.

filt = df[df['Age'] > 25]
filt

Unnamed: 0,Name,Age,City,Salary
1,Rohan,30,Kolkata,60000
2,Nikhil,35,Pune,70000
4,Riya,28,Kolkata,56000


In [7]:
# 5. Sort a DataFrame by a specific column in descending order.

srt = df.sort_values(by='Age', ascending=False)
srt

Unnamed: 0,Name,Age,City,Salary
2,Nikhil,35,Pune,70000
1,Rohan,30,Kolkata,60000
4,Riya,28,Kolkata,56000
0,Ram,25,Delhi,50000
3,Poonam,19,Gurgaon,45000


In [8]:
# 6. Group the data by a categorical column and find the mean of each group.

grp = df.groupby('City').mean(numeric_only=True)
grp

Unnamed: 0_level_0,Age,Salary
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Delhi,25.0,50000.0
Gurgaon,19.0,45000.0
Kolkata,29.0,58000.0
Pune,35.0,70000.0


In [9]:
# 7. Merge two DataFrames on a common column.

df1 = pd.DataFrame({'Name': ['Ram', 'Rohan', 'Nikhil', 'Poonam', 'Riya'],
        'Age': [25, 30, 35, 19, 28]})
df2 = pd.DataFrame({'Name': ['Ram', 'Rohan', 'Nikhil', 'Poonam', 'Riya'],
         'City': ['New Delhi', 'Lucknow', 'Pune', 'Gurgaon', 'Kolkata']})
mrg = pd.merge(df1, df2, on='Name')
mrg

Unnamed: 0,Name,Age,City
0,Ram,25,New Delhi
1,Rohan,30,Lucknow
2,Nikhil,35,Pune
3,Poonam,19,Gurgaon
4,Riya,28,Kolkata


In [10]:
# 8. Create a pivot table from a DataFrame.

pivot = pd.pivot_table(df, values='Salary', index='City', aggfunc='sum')
pivot

Unnamed: 0_level_0,Salary
City,Unnamed: 1_level_1
Delhi,50000
Gurgaon,45000
Kolkata,116000
Pune,70000


In [11]:
# 9. Drop rows that contain any missing values.

df.loc[2, 'Age'] = None
val = df.dropna()
val

Unnamed: 0,Name,Age,City,Salary
0,Ram,25.0,Delhi,50000
1,Rohan,30.0,Kolkata,60000
3,Poonam,19.0,Gurgaon,45000
4,Riya,28.0,Kolkata,56000


In [12]:
# 10. Fill missing values in a DataFrame with the mean of the column.

m = df['Age'].mean()
df['Age'] = df['Age'].fillna(m)
df

Unnamed: 0,Name,Age,City,Salary
0,Ram,25.0,Delhi,50000
1,Rohan,30.0,Kolkata,60000
2,Nikhil,25.5,Pune,70000
3,Poonam,19.0,Gurgaon,45000
4,Riya,28.0,Kolkata,56000


In [13]:
# 11. Find the unique values in a specific column.

un = df['City'].unique()
un

array(['Delhi', 'Kolkata', 'Pune', 'Gurgaon'], dtype=object)

In [14]:
# 12. Extract a specific row using iloc and loc.

r1 = df.iloc[4,3]
r2 = df.loc[4,'City']
r1, r2

(56000, 'Kolkata')

In [15]:
# 13. Apply a custom function to every element of a DataFrame column.

df['Age_in_Days'] = df['Age'].apply(lambda x: x * 365)
df

Unnamed: 0,Name,Age,City,Salary,Age_in_Days
0,Ram,25.0,Delhi,50000,9125.0
1,Rohan,30.0,Kolkata,60000,10950.0
2,Nikhil,25.5,Pune,70000,9307.5
3,Poonam,19.0,Gurgaon,45000,6935.0
4,Riya,28.0,Kolkata,56000,10220.0


In [16]:
# 14. Convert a column of strings to datetime objects.

df['Date_of_Birth'] = ['05/01/1990', '08/17/1985', '12/15/2000', '03/10/1999', '11/05/1992']
df['Date_of_Birth'] = pd.to_datetime(df['Date_of_Birth'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           5 non-null      object        
 1   Age            5 non-null      float64       
 2   City           5 non-null      object        
 3   Salary         5 non-null      int64         
 4   Age_in_Days    5 non-null      float64       
 5   Date_of_Birth  5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 368.0+ bytes


In [17]:
# 15. Select rows based on multiple conditions (e.g., age > 25 and city is ‘New Delhi’).

r = df[(df['Age'] > 25) & (df['City'] == 'Kolkata')]
r

Unnamed: 0,Name,Age,City,Salary,Age_in_Days,Date_of_Birth
1,Rohan,30.0,Kolkata,60000,10950.0,1985-08-17
4,Riya,28.0,Kolkata,56000,10220.0,1992-11-05


In [18]:
# 16. Rename the columns of a DataFrame.

re = df.rename(columns={'Name': 'Full_Name', 'Age': 'Years', 'City': 'Location'})
re

Unnamed: 0,Full_Name,Years,Location,Salary,Age_in_Days,Date_of_Birth
0,Ram,25.0,Delhi,50000,9125.0,1990-05-01
1,Rohan,30.0,Kolkata,60000,10950.0,1985-08-17
2,Nikhil,25.5,Pune,70000,9307.5,2000-12-15
3,Poonam,19.0,Gurgaon,45000,6935.0,1999-03-10
4,Riya,28.0,Kolkata,56000,10220.0,1992-11-05


In [19]:
# 17. Write a DataFrame to a CSV file.

df.to_csv('output.csv', index=False)

In [20]:
# 18. How do you remove a column from a DataFrame?

df.drop('Age_in_Days', axis=1, inplace=True)
df

Unnamed: 0,Name,Age,City,Salary,Date_of_Birth
0,Ram,25.0,Delhi,50000,1990-05-01
1,Rohan,30.0,Kolkata,60000,1985-08-17
2,Nikhil,25.5,Pune,70000,2000-12-15
3,Poonam,19.0,Gurgaon,45000,1999-03-10
4,Riya,28.0,Kolkata,56000,1992-11-05


In [21]:
# 19. How do you check for missing values in a DataFrame?

df.isnull().sum()

Name             0
Age              0
City             0
Salary           0
Date_of_Birth    0
dtype: int64

In [22]:
# 20. How do you get a summary of the data in a DataFrame?

df.describe()

Unnamed: 0,Age,Salary,Date_of_Birth
count,5.0,5.0,5
mean,25.5,56200.0,1993-10-15 09:36:00
min,19.0,45000.0,1985-08-17 00:00:00
25%,25.0,50000.0,1990-05-01 00:00:00
50%,25.5,56000.0,1992-11-05 00:00:00
75%,28.0,60000.0,1999-03-10 00:00:00
max,30.0,70000.0,2000-12-15 00:00:00
std,4.153312,9602.083107,
