# Pandas Practice Questions (Instructor Solutions)

This notebook contains **12 basic Python pandas practice problems** focused on **data loading, exploration, cleaning, and basic operations**.

Each question includes:
- Function definition with correct return statements
- Clear explanation of what the function should return
- Test cases using small DataFrames

In [2]:
import pandas as pd
import numpy as np
from io import StringIO

In [3]:
# Student identity fields.
# NOTE(review): these values look like placeholders to be replaced with the
# student's real details — confirm how they are consumed.
name = 'student name'
roll_number = 'student roll number'

### 1. Load a CSV string into a DataFrame
**Return:** A pandas DataFrame from the CSV string

**Choose the correct line:**
- (a) `return pd.read_excel(StringIO(csv_string))`
- (b) `return pd.read_csv(StringIO(csv_string))`
- (c) `return pd.DataFrame(csv_string.split('\n'))`
- (d) `return csv_string.to_dataframe()`

In [4]:
def load_csv_string(csv_string: str) -> pd.DataFrame:
    """Parse CSV text held in memory into a DataFrame.

    The text is wrapped in a ``StringIO`` buffer so that ``pd.read_csv`` can
    consume it the same way it would consume an open file.
    """
    buffer = StringIO(csv_string)
    return pd.read_csv(buffer)

# Test data: a header line (name, age, score) followed by three records.
csv_data = "name,age,score\nAlice,25,85\nBob,30,90\nCharlie,22,78"
# Example usage:
# df = load_csv_string(csv_data)

In [5]:
# Assertions: load_csv_string must turn a header line plus three records
# into a 3x3 DataFrame whose columns come from the header.
csv_data = "name,age,score\nAlice,25,85\nBob,30,90\nCharlie,22,78"
df = load_csv_string(csv_data)
assert isinstance(df, pd.DataFrame)
assert list(df.columns) == ['name', 'age', 'score']
assert df.shape == (3, 3)

### 2. Get shape and column names
**Return:** A tuple of (number of rows, number of columns, list of column names)

**Choose the correct code:**
- (a) `return (df.size, df.ndim, df.columns)`
- (b) `return (df.shape[0], df.shape[1], list(df.columns))`
- (c) `return df.info()`
- (d) `return (len(df), len(df.index), df.to_list())`

In [6]:
def get_dataframe_info(df: pd.DataFrame) -> tuple:
    """Summarise the dimensions of ``df``.

    Returns:
        A tuple ``(row_count, column_count, column_names)`` where
        ``column_names`` is a plain list.
    """
    # df.shape is already (rows, cols), so unpack it straight into the result.
    return (*df.shape, list(df.columns))

# get_dataframe_info(df)

In [7]:
# Assertions: a CSV with two records and three fields should be reported
# as 2 rows, 3 columns and the column names ['x', 'y', 'z'].
csv_data = "x,y,z\n1,2,3\n4,5,6"
df = load_csv_string(csv_data)
rows, cols, columns = get_dataframe_info(df)
assert rows == 2
assert cols == 3
assert columns == ['x', 'y', 'z']

### 3. Get the first n rows of a DataFrame
**Return:** DataFrame containing first n rows

**Choose the most idiomatic code** (more than one option returns the first n rows):
- (a) `return df.iloc[:n]`
- (b) `return df.head(n)`
- (c) `return df.nlargest(n, axis=0)`
- (d) `return df[:n:1]`

In [8]:
def get_first_n_rows(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the leading ``n`` rows of ``df`` via ``DataFrame.head``.

    If ``df`` has fewer than ``n`` rows, all of them are returned.
    """
    leading_rows = df.head(n=n)
    return leading_rows

# get_first_n_rows(df, 2)

In [9]:
# Assertions: requesting 2 of the 4 rows keeps both columns and the first
# two values of column 'a'.
csv_data = "a,b\n1,10\n2,20\n3,30\n4,40"
df = load_csv_string(csv_data)
first_two = get_first_n_rows(df, 2)
assert first_two.shape == (2, 2)
assert first_two['a'].tolist() == [1, 2]

### 4. Get basic statistics for numeric columns
**Return:** A pandas DataFrame with descriptive statistics (using .describe())


In [None]:
def describe_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Return the summary table produced by ``DataFrame.describe``.

    ``describe`` is called with its default arguments; for numeric input the
    result is indexed by statistic name (``count``, ``mean``, ``std``, ...).
    """
    summary = df.describe()
    return summary

# describe_numeric(df)

In [None]:
# Assertions: the result must be a DataFrame whose index contains the
# 'count', 'mean' and 'std' statistic labels.
csv_data = "val1,val2\n10,100\n20,200\n30,300"
df = load_csv_string(csv_data)
stats = describe_numeric(df)
assert isinstance(stats, pd.DataFrame)
assert 'count' in stats.index
assert 'mean' in stats.index
assert 'std' in stats.index

### 5. Select a single column as a Series
**Return:** A pandas Series for the specified column

In [10]:
def select_column(df: pd.DataFrame, col_name: str) -> pd.Series:
    """Look up the column labelled ``col_name`` in ``df`` and return it.

    Raises:
        KeyError: if ``df`` has no column called ``col_name``.
    """
    column = df[col_name]
    return column

# select_column(df, 'age')

In [11]:
# Assertions: selecting 'age' must yield a Series holding the three ages
# in their original order.
csv_data = "name,age\nAlice,25\nBob,30\nCharlie,22"
df = load_csv_string(csv_data)
age_series = select_column(df, 'age')
assert isinstance(age_series, pd.Series)
assert age_series.tolist() == [25, 30, 22]

### 6. Filter rows where a column value exceeds a threshold
**Return:** A DataFrame containing only rows where column > threshold, with the row index reset to start at 0

**Hint:** Use boolean indexing `df[df[col_name] > threshold]` and `.reset_index(drop=True)` to reset row indices.

**Choose the correct code:**
- (a) `return df.filter(column=col_name, value=threshold)`
- (b) `return df.loc[df[col_name] > threshold]`
- (c) `return df[df[col_name] > threshold].reset_index(drop=True)`
- (d) `return df.query(f'{col_name} > {threshold}')`

In [None]:
def filter_by_threshold(df: pd.DataFrame, col_name: str, threshold: float) -> pd.DataFrame:
    """Keep the rows of ``df`` whose ``col_name`` value is strictly above ``threshold``.

    The surviving rows are renumbered from 0; the old index is discarded.
    """
    above_threshold = df[col_name] > threshold  # boolean mask, one entry per row
    kept_rows = df[above_threshold]
    return kept_rows.reset_index(drop=True)

# filter_by_threshold(df, 'age', 25)

In [None]:
# Assertions: three of the four scores (85, 92, 88) exceed 80; the 78 is
# filtered out, so every remaining score is strictly above the threshold.
csv_data = "name,score\nAlice,85\nBob,92\nCharlie,78\nDiana,88"
df = load_csv_string(csv_data)
filtered = filter_by_threshold(df, 'score', 80)
assert filtered.shape[0] == 3
assert filtered['score'].min() > 80

### 7. Count missing (NaN) values in each column
**Return:** A pandas Series with column names as index and count of NaN as values

**Hint:** Use `.isnull().sum()` to count missing values in each column.

In [None]:
def count_missing_values(df: pd.DataFrame) -> pd.Series:
    """Count the missing entries in every column of ``df``.

    Returns:
        A Series indexed by column name whose values are the per-column
        number of null cells.
    """
    null_mask = df.isnull()  # True wherever a cell is missing
    return null_mask.sum()

# count_missing_values(df)

In [None]:
# Assertions: column 'a' holds one NaN, 'b' holds two and 'c' holds none.
data = {'a': [1, 2, np.nan], 'b': [4, np.nan, np.nan], 'c': [7, 8, 9]}
df = pd.DataFrame(data)
missing = count_missing_values(df)
assert missing['a'] == 1
assert missing['b'] == 2
assert missing['c'] == 0

### 8. Drop rows containing any NaN values
**Return:** A DataFrame with all rows containing NaN removed

**Hint:** Use `.dropna()` to remove rows with missing values, then `.reset_index(drop=True)` to renumber rows.

In [None]:
def drop_rows_with_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without any row that contains a missing value.

    The remaining rows are renumbered from 0; the old index is discarded.
    """
    complete_rows = df.dropna()
    return complete_rows.reset_index(drop=True)

# drop_rows_with_nan(df)

In [None]:
# Assertions: only the first row (x=1, y=10) has no NaN, so it is the single
# row left after dropping.
data = {'x': [1, 2, np.nan], 'y': [10, np.nan, 30]}
df = pd.DataFrame(data)
clean_df = drop_rows_with_nan(df)
assert clean_df.shape[0] == 1
assert clean_df['x'].iloc[0] == 1
assert clean_df['y'].iloc[0] == 10

### 9. Fill missing values with the mean of the column
**Return:** A DataFrame where NaN values in numeric columns are replaced by the column mean (non-numeric columns are left unchanged)

**Hint:** Get numeric columns using `.select_dtypes()`, then use `.fillna()` with the column mean.

In [None]:
def fill_missing_with_mean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with NaNs in numeric columns replaced by the column mean.

    Non-numeric columns are left as they are, and ``df`` itself is not
    modified.
    """
    df_copy = df.copy()
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # Assign the filled column back rather than calling
        # ``fillna(..., inplace=True)`` on the selection. Under pandas
        # Copy-on-Write the in-place call only changes a temporary object,
        # leaving the NaNs in ``df_copy``; pandas 2.2 warns about it.
        df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
    return df_copy

# fill_missing_with_mean(df)

In [None]:
# Assertions: the mean of 10, 20 and 30 is 20, so the NaN at position 2 must
# become 20 and no missing values may remain.
data = {'val': [10, 20, np.nan, 30]}
df = pd.DataFrame(data)
filled = fill_missing_with_mean(df)
assert filled['val'].isnull().sum() == 0
assert filled['val'].iloc[2] == 20

### 10. Group by a column and calculate the mean of another column
**Return:** A DataFrame with two columns: the group column and the per-group mean, named `<agg_col>_mean` (e.g. `value_mean`)

**Hint:** Use `.groupby(group_col)[agg_col].mean()` and `.reset_index()` to convert to DataFrame.

In [None]:
def group_by_mean(df: pd.DataFrame, group_col: str, agg_col: str) -> pd.DataFrame:
    """Average ``agg_col`` within each distinct value of ``group_col``.

    Returns:
        A two-column DataFrame: ``group_col`` and ``<agg_col>_mean``, with one
        row per group.
    """
    per_group_mean = df.groupby(group_col)[agg_col].mean()
    # reset_index turns the group labels from the index into a regular column.
    table = per_group_mean.reset_index()
    table.columns = [group_col, f'{agg_col}_mean']
    return table

# group_by_mean(df, 'category', 'value')

In [None]:
# Assertions: group A averages (10 + 30) / 2 = 20 and group B averages
# (20 + 40) / 2 = 30; the mean column must be named 'value_mean'.
data = {'category': ['A', 'B', 'A', 'B'], 'value': [10, 20, 30, 40]}
df = pd.DataFrame(data)
grouped = group_by_mean(df, 'category', 'value')
assert grouped.shape[0] == 2
assert grouped.loc[grouped['category'] == 'A', 'value_mean'].iloc[0] == 20
assert grouped.loc[grouped['category'] == 'B', 'value_mean'].iloc[0] == 30

### 11. Merge two DataFrames on a common column
**Return:** A merged DataFrame (inner join on the specified key)

**Hint:** Use `pd.merge(left, right, on=on, how='inner')` to combine DataFrames on a common key.

**Choose the correct code:**
- (a) `return left.join(right, on=on)`
- (b) `return pd.concat([left, right])`
- (c) `return pd.merge(left, right, on=on, how='inner')`
- (d) `return left.combine(right)`

In [None]:
def merge_dataframes(left: pd.DataFrame, right: pd.DataFrame, on: str) -> pd.DataFrame:
    """Inner-join ``left`` and ``right`` on the column named ``on``.

    Only key values present in both frames appear in the result.
    """
    joined = pd.merge(left, right, how='inner', on=on)
    return joined

# merge_dataframes(left_df, right_df, 'id')

In [None]:
# Assertions: ids 2 and 3 occur in both frames, so the inner join keeps two
# rows and carries the value columns from both sides.
left = pd.DataFrame({'id': [1, 2, 3], 'value_left': [10, 20, 30]})
right = pd.DataFrame({'id': [2, 3, 4], 'value_right': [200, 300, 400]})
merged = merge_dataframes(left, right, 'id')
assert merged.shape[0] == 2
assert set(merged.columns) == {'id', 'value_left', 'value_right'}

### 12. Convert a column to datetime format
**Return:** A DataFrame where the specified column has been converted to datetime

**Hint:** Use `pd.to_datetime()` to convert a column from string to datetime format.

**Choose the correct code:**
- (a) `df_copy[col_name] = df_copy[col_name].astype(datetime)`
- (b) `df_copy[col_name] = pd.to_datetime(df_copy[col_name])`
- (c) `df_copy[col_name].convert_to_datetime()`
- (d) `df_copy[col_name] = datetime.strptime(df_copy[col_name], '%Y-%m-%d')`

In [None]:
def convert_to_datetime(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Return a copy of ``df`` in which ``col_name`` has been parsed with ``pd.to_datetime``.

    The input frame is left untouched; every other column is carried over
    as-is.
    """
    parsed = pd.to_datetime(df[col_name])
    converted = df.copy()
    converted[col_name] = parsed
    return converted

# convert_to_datetime(df, 'date_column')

In [None]:
# Assertions: the string column must end up with a datetime dtype, and its
# first value ('2023-01-15') must parse to year 2023, month 1.
data = {'date': ['2023-01-15', '2023-02-20', '2023-03-25']}
df = pd.DataFrame(data)
converted = convert_to_datetime(df, 'date')
assert pd.api.types.is_datetime64_any_dtype(converted['date'])
assert converted['date'].iloc[0].year == 2023
assert converted['date'].iloc[0].month == 1