In [None]:
#Q1
#1. pd.DataFrame() - Create a DataFrame

# Function: Creates a pandas DataFrame (a two-dimensional labeled data structure whose columns may hold different types, like a spreadsheet or SQL table).
#Example
import pandas as pd

# Each dictionary key becomes a column name; each list holds that column's values.
data = {'col1': [1, 2, 3, 4], 'col2': ['A', 'B', 'C', 'D']}

# Build the DataFrame from the dictionary of columns.
df = pd.DataFrame(data)

# Show the resulting table.
print("DataFrame created:")
print(df)

#2. pd.read_csv() - Read data from a CSV file

# Function: Reads data from a comma-separated values (CSV) file into a DataFrame.
#Example
import pandas as pd
# Load the CSV file into a DataFrame. The result is bound to df_from_csv
# because the .head() and .describe() examples later in this cell read
# from that name.
df_from_csv = pd.read_csv("File.csv")
print(df_from_csv)

#3. .head() - View Top Rows

# Function: Returns the first n rows of a DataFrame (n defaults to 5).
# Useful for quickly inspecting the data.
#Example
# Ask for two rows explicitly instead of relying on the default row count.
# df_from_csv is the DataFrame loaded from the CSV file; it must already
# exist when this line runs.
print("\nFirst 2 rows of the DataFrame:")
print(df_from_csv.head(2))

#4. .describe() - Get Summary Statistics

# Function: Generates descriptive statistics for numerical columns (count, mean, standard deviation, min, max, and quartiles).
#Example

# Summarise the CSV-backed DataFrame; df_from_csv must already exist
# when this line runs.
print("\nDescriptive statistics:")
print(df_from_csv.describe())

#5. .groupby() - Group Data

# Function: Groups DataFrame rows by the values in one or more columns. Usually followed by an aggregation such as sum, mean, or count.
#Example
import pandas as pd

# Five items split across two categories ('Fruit' and 'Veg'), each with a price.
data_group = {'Category': ['Fruit', 'Veg', 'Fruit', 'Veg', 'Fruit'],
              'Item': ['Apple', 'Carrot', 'Banana', 'Broccoli', 'Orange'],
              'Price': [0.5, 0.3, 0.4, 0.6, 0.7]}
df_group = pd.DataFrame(data_group)

print("\nOriginal DataFrame for grouping:")
print(df_group)

# Group the rows by 'Category', keep only the 'Price' column, and average
# it within each group; the result is indexed by category.
avg_price_by_category = df_group.groupby('Category')['Price'].mean()

print("\nAverage price per category:")
print(avg_price_by_category)

In [1]:
#Q2
import pandas as pd
import numpy as np 

def reindex_dataframe_custom_step(df: pd.DataFrame, start: int = 1, step: int = 2) -> pd.DataFrame:
  """Return a copy of ``df`` indexed by an evenly spaced integer sequence.

  The new index is ``start, start + step, start + 2 * step, ...`` with one
  label per row. With the defaults this is 1, 3, 5, ... The row order and
  the column contents are unchanged, and the input frame is not modified.

  Args:
    df: The DataFrame to re-index.
    start: Label given to the first row (default 1).
    step: Difference between consecutive labels (default 2). May be
      negative; a step of zero is rejected by ``pd.RangeIndex``.

  Returns:
    A re-indexed copy of ``df``. An empty ``df`` is returned as-is (the
    same object, not a copy) after printing a notice.
  """
  num_rows = len(df)

  # Nothing to label: keep the original behaviour of handing the frame back.
  if num_rows == 0:
    print("Input DataFrame is empty. Returning original.")
    return df

  # The stop value is exclusive, so start + num_rows * step yields exactly
  # num_rows labels for both positive and negative steps.
  new_index = pd.RangeIndex(start=start, stop=start + num_rows * step, step=step)

  # Work on a copy so the caller's frame keeps its own index.
  df_reindexed = df.copy()
  df_reindexed.index = new_index

  return df_reindexed


# Four-row sample frame with an integer, a string and a boolean column.
data = {'A': [10, 20, 30, 40],
        'B': ['apple', 'banana', 'cherry', 'date'],
        'C': [True, False, True, False]}
df_original = pd.DataFrame(data)

print("Original DataFrame:")
print(df_original)
print("-" * 20)  # visual separator between the printed sections

# Re-index the sample frame; the result is kept under a separate name.
df_new_index = reindex_dataframe_custom_step(df_original)

print("DataFrame with new index (starts 1, steps 2):")
print(df_new_index)
print("-" * 20)

# Edge case: a frame that has the columns but no rows.
df_empty = pd.DataFrame({'A': [], 'B': [], 'C': []})
print("Original Empty DataFrame:")
print(df_empty)
print("-" * 20)
# The function prints its own notice for empty input before returning.
df_empty_reindexed = reindex_dataframe_custom_step(df_empty)
print("Re-indexed Empty DataFrame:")
print(df_empty_reindexed)

Original DataFrame:
    A       B      C
0  10   apple   True
1  20  banana  False
2  30  cherry   True
3  40    date  False
--------------------
DataFrame with new index (starts 1, steps 2):
    A       B      C
1  10   apple   True
3  20  banana  False
5  30  cherry   True
7  40    date  False
--------------------
Original Empty DataFrame:
Empty DataFrame
Columns: [A, B, C]
Index: []
--------------------
Input DataFrame is empty. Returning original.
Re-indexed Empty DataFrame:
Empty DataFrame
Columns: [A, B, C]
Index: []


In [3]:
#Q3
import pandas as pd

def sum_first_three_values(df: pd.DataFrame) -> None:
  """Print the sum of the first three entries of the 'Values' column.

  Rows are taken by position, so a frame with fewer than three rows simply
  contributes the rows it has. Nothing is returned; the total is written
  to standard output.

  Args:
    df: DataFrame that contains a 'Values' column.
  """
  leading_values = df['Values'].head(3)
  print(leading_values.sum())

# Example usage: five values, of which only the first three (1 + 2 + 3) are summed.
data = {'Values': [1, 2, 3, 4, 5]}
df = pd.DataFrame(data)
sum_first_three_values(df)  # Output: 6

6


In [5]:
#Q4
import pandas as pd
import numpy as np 

def add_word_count_column(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with a 'Word_Count' column derived from 'Text'.

  Words are whitespace-separated tokens. Missing text is treated as an
  empty string, so both missing and empty entries get a count of 0.

  Args:
    df: DataFrame expected to contain a 'Text' column.

  Returns:
    A new DataFrame with the extra 'Word_Count' column. If 'Text' is
    absent, an error message is printed and ``df`` itself is returned.
  """
  has_text = 'Text' in df.columns
  if not has_text:
      print("Error: DataFrame must have a 'Text' column.")
      return df

  # Replace missing entries first so every row can be split into tokens.
  tokens = df['Text'].fillna('').str.split()

  result = df.copy()
  result['Word_Count'] = tokens.str.len()
  return result

# Sample texts, including an empty string and a missing value as edge cases.
data = {
    'Text': [
        "This is the first sentence.",
        "Here is another one.",
        "Short",
        "",  # empty string
        None,  # missing value
        "One more sentence here."
    ]
}
df_original = pd.DataFrame(data)

print("Original DataFrame:")
print(df_original)
print("-" * 30)  # visual separator between the printed sections

# The result is kept under a separate name from the input frame.
df_with_counts = add_word_count_column(df_original)

print("DataFrame with Word_Count column:")
print(df_with_counts)

Original DataFrame:
                          Text
0  This is the first sentence.
1         Here is another one.
2                        Short
3                             
4                         None
5      One more sentence here.
------------------------------
DataFrame with Word_Count column:
                          Text  Word_Count
0  This is the first sentence.           5
1         Here is another one.           4
2                        Short           1
3                                        0
4                         None           0
5      One more sentence here.           4


In [None]:
#Q5
# df.shape:
# Returns a tuple giving the dimensions: (number_of_rows, number_of_columns).
#
# df.size:
# Returns an integer giving the total number of elements (cells) in the DataFrame (rows * columns).
# Example: a DataFrame with 3 rows and 2 columns has shape (3, 2) and size 6.


In [None]:
#Q6
# We use pd.read_excel() to read an Excel file.
#example
import pandas as pd
# The function name uses an underscore: pd.read_excel. Written with a hyphen,
# the expression is parsed as a subtraction (pd.read - excel(...)) and fails.
data = pd.read_excel("File.xlsx")
print(data)

In [9]:
#Q7
import pandas as pd
import numpy as np

def add_username_column(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with a 'Username' column taken from 'Email'.

  The username is the part of each address before the first '@'. Rows
  whose email is missing get a missing username as well.

  Args:
    df: DataFrame expected to contain an 'Email' column.

  Returns:
    A new DataFrame with the extra 'Username' column. If 'Email' is
    absent, an error message is printed and ``df`` itself is returned.
  """
  if 'Email' in df.columns:
    result = df.copy()
    # Split each address on '@' and keep the first piece.
    address_parts = result['Email'].str.split('@')
    result['Username'] = address_parts.str[0]
    return result

  print("Error: DataFrame does not contain an 'Email' column.")
  return df

# Sample addresses with different username punctuation, plus one missing
# email (None) as an edge case.
data = {
    'Email': ['john.doe@example.com', 'jane_smith@domain.org', 'support@company.net', None, 'test-user@sub.domain.co'],
    'ID': [101, 102, 103, 104, 105]
}
df_original = pd.DataFrame(data)

print("Original DataFrame:")
print(df_original)
print("-" * 30)  # visual separator between the printed sections

# The result is kept under a separate name from the input frame.
df_with_username = add_username_column(df_original)

print("DataFrame with 'Username' column added:")
print(df_with_username)


Original DataFrame:
                     Email   ID
0     john.doe@example.com  101
1    jane_smith@domain.org  102
2      support@company.net  103
3                     None  104
4  test-user@sub.domain.co  105
------------------------------
DataFrame with 'Username' column added:
                     Email   ID    Username
0     john.doe@example.com  101    john.doe
1    jane_smith@domain.org  102  jane_smith
2      support@company.net  103     support
3                     None  104        None
4  test-user@sub.domain.co  105   test-user


In [11]:
#Q8
import pandas as pd

def select_rows_by_conditions(df: pd.DataFrame) -> pd.DataFrame:
  """Return the rows of ``df`` where column 'A' > 5 and column 'B' < 10.

  Args:
    df: DataFrame expected to contain columns 'A' and 'B'.

  Returns:
    A copy of the matching rows with their original index labels. If 'A'
    or 'B' is missing, an error message is printed and an empty DataFrame
    with the same columns as ``df`` is returned.
  """
  missing = {'A', 'B'}.difference(df.columns)
  if missing:
      print("Error: DataFrame must contain columns 'A' and 'B'.")
      return pd.DataFrame(columns=df.columns)

  # Both comparisons are element-wise; '&' combines them row by row.
  keep = (df['A'] > 5) & (df['B'] < 10)
  return df[keep].copy()

# Sample frame: rows 0 and 3 have A <= 5 and are filtered out; every B value is below 10.
data = {'A': [3, 8, 6, 2, 9],
        'B': [5, 2, 9, 3, 1],
        'C': [1, 7, 4, 5, 2]}
df_original = pd.DataFrame(data)

print("Original DataFrame:")
print(df_original)
print("-" * 20)  # visual separator between the printed sections

# The selection is kept under a separate name from the input frame.
df_selected = select_rows_by_conditions(df_original)

print("Selected Rows (A > 5 and B < 10):")
print(df_selected)

Original DataFrame:
   A  B  C
0  3  5  1
1  8  2  7
2  6  9  4
3  2  3  5
4  9  1  2
--------------------
Selected Rows (A > 5 and B < 10):
   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [13]:
#Q9
import pandas as pd
from pandas.api.types import is_numeric_dtype 

def calculate_column_stats(df: pd.DataFrame) -> dict:
  """Compute the mean, median and standard deviation of the 'Values' column.

  Args:
    df: DataFrame expected to contain a numeric 'Values' column.

  Returns:
    A dict with the keys 'mean', 'median' and 'standard_deviation'. If the
    column is missing or not numeric, an error message is printed and an
    empty dict is returned.
  """
  column_name = 'Values'

  # Guard 1: the column has to exist.
  if column_name not in df.columns:
    print(f"Error: DataFrame does not contain a '{column_name}' column.")
    return {}

  series = df[column_name]

  # Guard 2: the statistics below only make sense for numeric data.
  if not is_numeric_dtype(series):
      print(f"Error: Column '{column_name}' is not numeric.")
      return {}

  return dict(
      mean=series.mean(),
      median=series.median(),
      standard_deviation=series.std(),
  )

# Sample frame: a numeric 'Values' column plus an unrelated text column.
data = {'Values': [10, 15, 12, 18, 20, 15, 22, 11],
        'Other': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']}
df_example = pd.DataFrame(data)

print("Original DataFrame:")
print(df_example)
print("-" * 30)  # visual separator between the printed sections


statistics = calculate_column_stats(df_example)

if statistics:  # an empty dict signals that the function reported an error
  print("Calculated Statistics for 'Values' column:")
  # Each value is shown rounded to two decimal places.
  print(f"  Mean: {statistics['mean']:.2f}")
  print(f"  Median: {statistics['median']:.2f}")
  print(f"  Standard Deviation: {statistics['standard_deviation']:.2f}")

print("-" * 30)


# Error path 1: the 'Values' column exists but holds strings.
df_non_numeric = pd.DataFrame({'Values': ['apple', 'banana', 'cherry']})
print("Attempting calculation on non-numeric DataFrame:")
calculate_column_stats(df_non_numeric)

print("-" * 30)

# Error path 2: there is no 'Values' column at all.
df_missing_col = pd.DataFrame({'Data': [1, 2, 3]})
print("Attempting calculation on DataFrame with missing column:")
# This call is the cell's last expression, so its return value is also
# displayed as the cell result.
calculate_column_stats(df_missing_col)

Original DataFrame:
   Values Other
0      10     A
1      15     B
2      12     C
3      18     D
4      20     E
5      15     F
6      22     G
7      11     H
------------------------------
Calculated Statistics for 'Values' column:
  Mean: 15.38
  Median: 15.00
  Standard Deviation: 4.34
------------------------------
Attempting calculation on non-numeric DataFrame:
Error: Column 'Values' is not numeric.
------------------------------
Attempting calculation on DataFrame with missing column:
Error: DataFrame does not contain a 'Values' column.


{}

In [15]:
#Q10
import pandas as pd

def calculate_moving_average(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
  """Return a copy of ``df`` with a 'MovingAverage' column computed from 'Sales'.

  The average covers the current row and up to ``window - 1`` preceding
  rows. The window counts rows, not calendar days: rows are used in their
  existing order and the 'Date' column is not consulted. Early rows with
  fewer than ``window`` observations are averaged over what is available
  (``min_periods=1``), so no value is missing because of a short window.

  The input frame is left untouched; the new column is added to a copy.

  Args:
    df: DataFrame containing a numeric 'Sales' column.
    window: Number of rows in the rolling window (default 7).

  Returns:
    A new DataFrame with the extra 'MovingAverage' column.
  """
  moving_average = df['Sales'].rolling(window=window, min_periods=1).mean()

  df_with_ma = df.copy()
  # Assign positionally so the values line up row for row regardless of
  # the index labels.
  df_with_ma['MovingAverage'] = moving_average.to_numpy()
  return df_with_ma

# Example usage: nine consecutive daily sales figures, so the first six rows
# have fewer than seven observations available.
data = {'Date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09']),
        'Sales': [10, 20, 15, 25, 30, 22, 18, 28, 35]}
df = pd.DataFrame(data)
# Display the frame returned by the function.
df_with_ma = calculate_moving_average(df)
print(df_with_ma)

        Date  Sales  MovingAverage
0 2024-01-01     10      10.000000
1 2024-01-02     20      15.000000
2 2024-01-03     15      15.000000
3 2024-01-04     25      17.500000
4 2024-01-05     30      20.000000
5 2024-01-06     22      20.333333
6 2024-01-07     18      20.000000
7 2024-01-08     28      22.571429
8 2024-01-09     35      24.714286


In [17]:
#Q11
import pandas as pd

def add_weekday_column(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with a 'Weekday' column holding each date's day name.

  The 'Date' column is parsed with ``pd.to_datetime`` for the lookup only;
  the stored 'Date' values keep their original type. The input frame is
  left untouched; the new column is added to a copy.

  Args:
    df: DataFrame containing a 'Date' column of datetimes or date strings.

  Returns:
    A new DataFrame with the extra 'Weekday' column (e.g. 'Sunday').
  """
  df_with_weekday = df.copy()
  df_with_weekday['Weekday'] = pd.to_datetime(df_with_weekday['Date']).dt.day_name()
  return df_with_weekday

# Example usage: five consecutive dates supplied as plain strings.
data = {'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']}
df = pd.DataFrame(data)
# Display the frame returned by the function.
df_with_weekday = add_weekday_column(df)
print(df_with_weekday)

         Date    Weekday
0  2023-01-01     Sunday
1  2023-01-02     Monday
2  2023-01-03    Tuesday
3  2023-01-04  Wednesday
4  2023-01-05   Thursday


In [19]:
#Q12
import pandas as pd

def select_dates_in_range(df: pd.DataFrame,
                          start_date='2023-01-01',
                          end_date='2023-01-31') -> pd.DataFrame:
  """Return the rows of ``df`` whose 'Date' lies between two bounds, inclusive.

  With the default bounds the selection is January 2023. The bounds are
  compared directly against the 'Date' column, so they must be comparable
  with its values (for example ISO date strings against a datetime column).

  NOTE(review): with a datetime column, a date-only upper bound presumably
  means midnight at the start of that day, so later times on ``end_date``
  would be excluded - confirm if the data carries times of day.

  Args:
    df: DataFrame containing a 'Date' column.
    start_date: Lower bound, inclusive (default '2023-01-01').
    end_date: Upper bound, inclusive (default '2023-01-31').

  Returns:
    A copy of the matching rows with their original index labels, so the
    result can be modified without affecting ``df``.
  """
  mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
  return df.loc[mask].copy()

# Example usage: one date before January 2023, two inside it, one after it.
data = {'Date': pd.to_datetime(['2022-12-25', '2023-01-10', '2023-01-20', '2023-02-15']),
        'Value': [1, 2, 3, 4]}
df = pd.DataFrame(data)
# Display the rows returned by the function.
df_selected = select_dates_in_range(df)
print(df_selected)

        Date  Value
1 2023-01-10      2
2 2023-01-20      3


In [None]:
#Q13
# The pandas library itself.