# **Day 04 Practice**

In [152]:
# import libraries
import pandas as pd
import numpy as np

# Load the data
data = {
    "money_str": ["10,000", "15,500", "2,300", "7,800"],  # string numbers with commas
    "duration_str": ["5 days", "10 days", "3 hours", "45 minutes"],  # strings for timedelta
    "gender": ["M", "F", "O", "M"],  # categorical gender
    "month_str": ["2021-12", "2022-01", "2022-02", "2022-03"],  # year-month strings
    "numeric_and_str": ["100", "200", "abc", "300"],  # mixed type column
    "yn_str": ["1", "0", "Yes", "No"],  # boolean-like strings
    "numbers_as_object": pd.Series([1, 2, 3, 4], dtype="object"),  # numbers stored as object
    "mixed_type": ["10", "20.5", "xyz", None],  # mixed type column for coercion
    "random_num": [5, 10, 15, 20],  # numeric column
    "random_text": ["apple", "banana", "cherry", "date"],  # text column
}

df = pd.DataFrame(data)
df

Unnamed: 0,money_str,duration_str,gender,month_str,numeric_and_str,yn_str,numbers_as_object,mixed_type,random_num,random_text
0,10000,5 days,M,2021-12,100,1,1,10,5,apple
1,15500,10 days,F,2022-01,200,0,2,20.5,10,banana
2,2300,3 hours,O,2022-02,abc,Yes,3,xyz,15,cherry
3,7800,45 minutes,M,2022-03,300,No,4,,20,date


### Exercises (Intermediate)

In [153]:
# Convert a column with strings "10,000", "15,500" into numeric integers.
# df['money_str'] = df['money_str'].str.replace(',', '')
df['money_str'] = df['money_str'].str.replace(',', '', regex=False)
col1 = df['money_str'].astype('Int64') # 1st way
col1 = pd.to_numeric(df['money_str'], errors='coerce') # 2nd way
col1.dtypes

dtype('int64')

In [154]:
# Convert duration_str column into timedelta using pd.to_timedelta().
df['duration_str'] = pd.to_timedelta(df['duration_str'])
df.dtypes

money_str                     object
duration_str         timedelta64[ns]
gender                        object
month_str                     object
numeric_and_str               object
yn_str                        object
numbers_as_object             object
mixed_type                    object
random_num                     int64
random_text                   object
dtype: object

In [155]:
# Convert gender column into a categorical dtype.
df['gender'] = df['gender'].astype('category')
df.dtypes

# for explicitly setting categories
df['gender'] = pd.Categorical(df['gender'], categories=['M', 'F', 'O'])
df.dtypes


money_str                     object
duration_str         timedelta64[ns]
gender                      category
month_str                     object
numeric_and_str               object
yn_str                        object
numbers_as_object             object
mixed_type                    object
random_num                     int64
random_text                   object
dtype: object

In [156]:
# Convert month_str to datetime (first day of month).
# df['month_str'] = df['month_str'].astype('datetime64[ns]') # works
# but this is safer
df['month_str'] = pd.to_datetime(df['month_str'], format='%Y-%m')
df.dtypes

money_str                     object
duration_str         timedelta64[ns]
gender                      category
month_str             datetime64[ns]
numeric_and_str               object
yn_str                        object
numbers_as_object             object
mixed_type                    object
random_num                     int64
random_text                   object
dtype: object

In [157]:
# Select only numeric columns from a DataFrame using .select_dtypes().
df.select_dtypes(include='number') # 1st way
df.select_dtypes(include='number', exclude='object') # 2nd way

Unnamed: 0,duration_str,random_num
0,5 days 00:00:00,5
1,10 days 00:00:00,10
2,0 days 03:00:00,15
3,0 days 00:45:00,20


In [158]:
# Use .infer_objects() on a DataFrame where numbers are stored as object.
df.infer_objects()

Unnamed: 0,money_str,duration_str,gender,month_str,numeric_and_str,yn_str,numbers_as_object,mixed_type,random_num,random_text
0,10000,5 days 00:00:00,M,2021-12-01,100,1,1,10,5,apple
1,15500,10 days 00:00:00,F,2022-01-01,200,0,2,20.5,10,banana
2,2300,0 days 03:00:00,O,2022-02-01,abc,Yes,3,xyz,15,cherry
3,7800,0 days 00:45:00,M,2022-03-01,300,No,4,,20,date


In [159]:
# Convert yn_str column into a boolean column.
# 💡 Tip: Always build a mapping dictionary for this kind of case.
df['yn_str'] = df['yn_str'].astype(str).str.lower()
df['yn_str'] = df['yn_str'].map({
    "1": True, "0": False, "yes": True, "no": False
})
df['yn_str'] = df['yn_str'].astype('boolean')

df.dtypes

money_str                     object
duration_str         timedelta64[ns]
gender                      category
month_str             datetime64[ns]
numeric_and_str               object
yn_str                       boolean
numbers_as_object             object
mixed_type                    object
random_num                     int64
random_text                   object
dtype: object

In [160]:
# Try converting a mixed-type column into numeric with errors='coerce' and analyze what turns into NaN.
df['mixed_type'] = pd.to_numeric(df['mixed_type'], errors='coerce')
df.dtypes

money_str                     object
duration_str         timedelta64[ns]
gender                      category
month_str             datetime64[ns]
numeric_and_str               object
yn_str                       boolean
numbers_as_object             object
mixed_type                   float64
random_num                     int64
random_text                   object
dtype: object

In [None]:
import pandas as pd
import numpy as np
import ast

df1 = pd.DataFrame({
    "mixed": ["[1, 2, 3]", "{'a': 10}", "42", "3.14", "NaN"]
})

def parse_mixed(val):
    try:
        if val in ["NaN", "nan", None]:
            return np.nan
        return ast.literal_eval(val)  # safely parse lists, dicts, numbers
    except Exception:
        try:
            return pd.to_numeric(val)
        except Exception:
            return val

df1["parsed"] = df1["mixed"].apply(parse_mixed)
print(df1)


In [None]:
import re

df2 = pd.DataFrame({
    "currency": ["$1,200.50", "€850", "1 000 JPY", "GBP 500", None]
})

# Extract currency symbol or code
df2["currency_symbol"] = df2["currency"].str.extract(r"([^\d\s.,]+)")

# Extract numeric part, remove commas/spaces
df2["numeric_value"] = (
    df2["currency"]
      .str.replace(r"[^\d.,]", "", regex=True)  # keep only digits and separators
      .str.replace(",", "", regex=False)        # remove commas
      .str.replace(" ", "", regex=False)        # remove spaces
)

# Convert to float
df2["numeric_value"] = pd.to_numeric(df2["numeric_value"], errors="coerce")

print(df2)


In [None]:
df3 = pd.DataFrame({
    "dates": ["2021-12-01", "01/13/2022", "March 5, 2023", "2024.07.20", None]
})

# Convert all to datetime
df3["datetime"] = pd.to_datetime(df3["dates"], errors="coerce", infer_datetime_format=True)

# Extract month names
df3["month_name"] = df3["datetime"].dt.month_name()

print(df3)
