In [2]:
import pandas as pd
import numpy as np

# Walk-through of basic pandas inspection and cleaning on a tiny employee table:
# build the frame, inject a duplicate row, inspect it, select/drop columns,
# impute the missing salary, and finally remove the duplicate.
#
# Every step writes to a new name (selected_columns, df_dropped, df_filled,
# df_cleaned) so `df` itself is only reassigned once, when the duplicate is added.

# Build a 10-row table. Jack's salary is deliberately NaN so that the
# imputation step (4) has a value to fill.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eva', 'Frank', 'Grace', 'Henry', 'Ivy', 'Jack'],
    'Age': [25, 32, 28, 45, 29, 38, 27, 41, 33, 36],
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Berlin', 'London', 'Paris', 'New York', 'Tokyo', 'Berlin'],
    'Salary': [50000, 65000, 55000, 80000, 60000, 72000, 58000, 75000, 68000, np.nan]  # One missing value
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print()

# Append a copy of row 2 (Charlie) so the de-duplication step (5) has work to do.
# df.iloc[[2]] uses a list so the result is a one-row DataFrame, not a Series;
# ignore_index=True renumbers the combined frame 0..10.
df = pd.concat([df, df.iloc[[2]]], ignore_index=True)
print("DataFrame after adding a duplicate row:")
print(df)
print()

# 1. Use .info() and .describe()
print("1. DataFrame Information:")
print("-" * 30)
print("Info:")
# info() writes its report straight to stdout and returns None, so it is
# called bare rather than wrapped in print().
df.info()
print()

print("Description:")
# describe() summarises the numeric columns only (Age, Salary).
print(df.describe())
print()

# 2. Select 'Name' and 'City' columns
print("2. Selected Columns (Name and City):")
print("-" * 40)
# A list of labels inside [] returns a DataFrame with just those columns.
selected_columns = df[['Name', 'City']]
print(selected_columns)
print()

# 3. Drop 'City' column
print("3. DataFrame after dropping 'City' column:")
print("-" * 45)
# drop() returns a new frame; `df` keeps its 'City' column for later steps.
df_dropped = df.drop('City', axis=1)
print(df_dropped)
print()

# 4. Fill missing values in 'Salary' column with the mean
print("4. Handling Missing Values in Salary:")
print("-" * 40)
print("Before filling missing values:")
print(f"Missing values in Salary: {df['Salary'].isnull().sum()}")

# Compute the mean over de-duplicated rows. The appended copy of Charlie is an
# artifact of this demo; averaging over `df` directly would count his salary
# twice and bias the value imputed for Jack. Series.mean() skips NaN by
# default, so the missing entry itself does not affect the result.
salary_mean = df.drop_duplicates()['Salary'].mean()
print(f"Mean Salary: ₹{salary_mean:.2f}")

# Fill missing values on a copy so `df` still shows the NaN if re-inspected.
df_filled = df.copy()
df_filled['Salary'] = df_filled['Salary'].fillna(salary_mean)
print("\nAfter filling missing values:")
print(f"Missing values in Salary: {df_filled['Salary'].isnull().sum()}")
print(df_filled[['Name', 'Salary']])
print()

# 5. Remove duplicate rows
print("5. Removing Duplicate Rows:")
print("-" * 30)
print(f"Before removing duplicates: {len(df_filled)} rows")
# drop_duplicates() keeps the first occurrence of each fully identical row.
# Resetting the index keeps it contiguous (0..n-1) regardless of where the
# dropped duplicate was located.
df_cleaned = df_filled.drop_duplicates().reset_index(drop=True)
print(f"After removing duplicates: {len(df_cleaned)} rows")
print("\nCleaned DataFrame:")
print(df_cleaned)

Original DataFrame:
      Name  Age      City   Salary
0    Alice   25  New York  50000.0
1      Bob   32    London  65000.0
2  Charlie   28     Paris  55000.0
3    Diana   45     Tokyo  80000.0
4      Eva   29    Berlin  60000.0
5    Frank   38    London  72000.0
6    Grace   27     Paris  58000.0
7    Henry   41  New York  75000.0
8      Ivy   33     Tokyo  68000.0
9     Jack   36    Berlin      NaN

DataFrame after adding a duplicate row:
       Name  Age      City   Salary
0     Alice   25  New York  50000.0
1       Bob   32    London  65000.0
2   Charlie   28     Paris  55000.0
3     Diana   45     Tokyo  80000.0
4       Eva   29    Berlin  60000.0
5     Frank   38    London  72000.0
6     Grace   27     Paris  58000.0
7     Henry   41  New York  75000.0
8       Ivy   33     Tokyo  68000.0
9      Jack   36    Berlin      NaN
10  Charlie   28     Paris  55000.0

1. DataFrame Information:
------------------------------
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entri