In [1]:
import pandas as pd
import numpy as np

# Task 1: Create the DataFrame
# Seven student records. The name, age and marks columns each contain
# np.nan entries; subject is fully populated.
data = {
    "name": ["aman", "Saurav", "karan", "Shubham", "abhishek", np.nan, "Sameer"],
    "age": [22, 23, np.nan, 22, 25, 24, 26],
    "marks": [85, np.nan, 88, 90, 56, 72, np.nan],
    "subject": ["math", "physics", "math", "chemistry", "math", "physics", "chemistry"]
}

df = pd.DataFrame(data)

# Print the heading and the object with separate calls.
# print("label:\n", obj) inserts print's default separator (a single space)
# after the newline, which pushes the first line of the rendered object one
# column to the right and misaligns the header with the rows below it.
print("DataFrame:")
print(df)

# Task 1.2: Basic Inspection

# Shape of dataset as a (rows, columns) tuple
print("\nShape of dataset:", df.shape)

# Datatypes of columns
print("\nDatatypes:")
print(df.dtypes)

# Statistical summary for numerical columns (age and marks)
print("\nStatistical Summary:")
print(df.describe())

DataFrame:
        name   age  marks    subject
0      aman  22.0   85.0       math
1    Saurav  23.0    NaN    physics
2     karan   NaN   88.0       math
3   Shubham  22.0   90.0  chemistry
4  abhishek  25.0   56.0       math
5       NaN  24.0   72.0    physics
6    Sameer  26.0    NaN  chemistry

Shape of dataset: (7, 4)

Datatypes:
 name           str
age        float64
marks      float64
subject        str
dtype: object

Statistical Summary:
              age      marks
count   6.000000   5.000000
mean   23.666667  78.200000
std     1.632993  14.254824
min    22.000000  56.000000
25%    22.250000  72.000000
50%    23.500000  85.000000
75%    24.750000  88.000000
max    26.000000  90.000000


In [2]:
# Every result is printed under its own heading so the four outputs of this
# cell can be told apart (an empty result would otherwise appear as an
# unexplained "Empty DataFrame" in the middle of the output).

# Selection: view only name and marks columns
print("Name and Marks columns:")
print(df[['name', 'marks']])

# Filtering: students who scored more than 80 marks
# Rows whose marks are NaN fail the comparison and are therefore excluded.
print("\nStudents with marks > 80:")
print(df[df['marks'] > 80])

# Complex Filtering: students older than 22 AND marks > 80
# Each comparison is parenthesised because & binds tighter than >.
print("\nStudents older than 22 with marks > 80:")
print(df[(df['age'] > 22) & (df['marks'] > 80)])

# Aggregation: count students in each subject
print("\nNumber of students per subject:")
print(df['subject'].value_counts())

       name  marks
0      aman   85.0
1    Saurav    NaN
2     karan   88.0
3   Shubham   90.0
4  abhishek   56.0
5       NaN   72.0
6    Sameer    NaN
      name   age  marks    subject
0     aman  22.0   85.0       math
2    karan   NaN   88.0       math
3  Shubham  22.0   90.0  chemistry
Empty DataFrame
Columns: [name, age, marks, subject]
Index: []
subject
math         3
physics      2
chemistry    2
Name: count, dtype: int64


In [3]:
# Build the three ranked views up front; each is derived from `df` and
# leaves `df` itself unchanged.
#   sorted_df - every row, highest marks first
#   top2      - the two highest scores (scholarship candidates)
#   lowest    - the single lowest score (remedial candidate)
sorted_df = df.sort_values("marks", ascending=False)
top2 = df.nlargest(n=2, columns="marks")
lowest = df.nsmallest(n=1, columns="marks")

# Report each view under its heading, in the order the tasks are numbered.
for heading, frame in (
    ("Sorted by Marks (Descending):", sorted_df),
    ("\nTop 2 Highest Scores:", top2),
    ("\nStudent with Lowest Marks:", lowest),
):
    print(heading)
    print(frame)

Sorted by Marks (Descending):
       name   age  marks    subject
3   Shubham  22.0   90.0  chemistry
2     karan   NaN   88.0       math
0      aman  22.0   85.0       math
5       NaN  24.0   72.0    physics
4  abhishek  25.0   56.0       math
1    Saurav  23.0    NaN    physics
6    Sameer  26.0    NaN  chemistry

Top 2 Highest Scores:
      name   age  marks    subject
3  Shubham  22.0   90.0  chemistry
2    karan   NaN   88.0       math

Student with Lowest Marks:
       name   age  marks subject
4  abhishek  25.0   56.0    math


In [4]:
# 1. Per-column count of missing values
missing_per_column = df.isna().sum()
print("Missing values in each column:")
print(missing_per_column)

# 2. Rows whose marks entry is missing, selected with a boolean mask
marks_missing = df["marks"].isna()
print("\nRows where marks are missing:")
print(df.loc[marks_missing])

# 3. Cleaned copy: drop every row that has at least one missing value.
#    `df` itself is not modified.
cleaned_df = df.dropna(axis=0, how="any")

print("\nCleaned DataFrame:")
print(cleaned_df)

# 4. Shape of the original frame (df) next to the cleaned one
print("\nOriginal DataFrame Shape:", df.shape)
print("Cleaned DataFrame Shape:", cleaned_df.shape)

Missing values in each column:
name       1
age        1
marks      2
subject    0
dtype: int64

Rows where marks are missing:
     name   age  marks    subject
1  Saurav  23.0    NaN    physics
6  Sameer  26.0    NaN  chemistry

Cleaned DataFrame:
       name   age  marks    subject
0      aman  22.0   85.0       math
3   Shubham  22.0   90.0  chemistry
4  abhishek  25.0   56.0       math

Original DataFrame Shape: (7, 4)
Cleaned DataFrame Shape: (3, 4)
