# Data Transformation

<!--
Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Description: Guide to data transformation techniques in Pandas
-->

## Introduction

This notebook walks through common data transformation techniques in pandas: string operations, applying functions to columns and rows, date/time transformations, and reshaping data with pivot and melt.



In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

import pandas as pd
import numpy as np

# Sample data, one tuple per employee: (Name, Age, Salary, Email, Join_Date).
# Name values use mixed letter case and Join_Date values are plain strings.
employee_rows = [
    ('alice smith', 25, 50000, 'alice@email.com', '2020-01-15'),
    ('BOB JONES', 30, 60000, 'bob@email.com', '2019-03-20'),
    ('Charlie Brown', 35, 70000, 'charlie@email.com', '2018-06-10'),
    ('david wilson', 28, 55000, 'david@email.com', '2021-09-05'),
]

# Assemble the DataFrame; `columns` fixes both the labels and their order
df = pd.DataFrame(
    employee_rows,
    columns=['Name', 'Age', 'Salary', 'Email', 'Join_Date'],
)

# Header line followed by the full frame
print("Original DataFrame:", df, sep="\n")



## String Operations


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# String transformations.
# Work on a copy so the `df` used by the other cells is not modified here.
df_transformed = df.copy()

# Title-case every word of the name (e.g. 'BOB JONES' -> 'Bob Jones')
df_transformed['Name'] = df_transformed['Name'].str.title()
print("=== After title case ===")
print(df_transformed['Name'])

# Extract domain from email: split on '@' and keep the second piece
df_transformed['Domain'] = df_transformed['Email'].str.split('@').str[1]
print("\n=== Email domains ===")
print(df_transformed['Domain'])

# Check if string contains certain text (case=False -> case-insensitive match)
contains_smith = df_transformed['Name'].str.contains('Smith', case=False)
print("\n=== Contains 'Smith' ===")
print(contains_smith)

# String replace.
# regex=False is passed explicitly so the space is always treated as a literal
# string, regardless of which default the installed pandas version uses for
# `regex` (and without the FutureWarning older versions emit for
# single-character patterns).
df_transformed['Name'] = df_transformed['Name'].str.replace(' ', '_', regex=False)
print("\n=== After replacing space with underscore ===")
print(df_transformed['Name'])



## Applying Functions


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Apply function to a column
def categorize_salary(salary: float,
                      low_threshold: float = 55000,
                      medium_threshold: float = 65000) -> str:
    """Bucket a single salary value into 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    salary : float
        The salary to classify.
    low_threshold : float, default 55000
        Salaries strictly below this value are 'Low'.
    medium_threshold : float, default 65000
        Salaries at or above ``low_threshold`` and strictly below this value
        are 'Medium'; everything else is 'High'.

    Returns
    -------
    str
        One of 'Low', 'Medium' or 'High'.

    Notes
    -----
    Both thresholds default to the original hard-coded cut-offs, so
    ``Series.apply(categorize_salary)`` behaves exactly as before. Other
    cut-offs can be supplied with
    ``Series.apply(categorize_salary, low_threshold=..., medium_threshold=...)``.

    A NaN salary compares False against both thresholds and therefore falls
    through to 'High'; filter or fill missing values first if that is not
    wanted.
    """
    if salary < low_threshold:
        return 'Low'
    if salary < medium_threshold:
        return 'Medium'
    return 'High'

# Series.apply with a named function: one category label per salary
df['Salary_Category'] = df['Salary'].apply(categorize_salary)
print(
    "=== After applying categorize_salary function ===",
    df[['Name', 'Salary', 'Salary_Category']],
    sep="\n",
)

# Series.apply with an inline lambda: bonus is 10% of each salary
df['Bonus'] = df['Salary'].apply(lambda salary: salary * 0.1)
print(
    "\n=== After applying lambda function (10% bonus) ===",
    df[['Name', 'Salary', 'Bonus']],
    sep="\n",
)

# DataFrame.apply with axis=1: the lambda receives one row at a time and
# adds that row's Salary and Bonus together
df['Total'] = df[['Salary', 'Bonus']].apply(
    lambda rec: rec['Salary'] + rec['Bonus'],
    axis=1,
)
print(
    "\n=== After applying function to multiple columns ===",
    df[['Name', 'Salary', 'Bonus', 'Total']],
    sep="\n",
)



## Date/Time Transformations


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Turn the Join_Date column into pandas datetimes
df['Join_Date'] = pd.to_datetime(df['Join_Date'])

# Read each calendar component through one shared .dt accessor
join_parts = df['Join_Date'].dt
df['Year'] = join_parts.year
df['Month'] = join_parts.month
df['Day'] = join_parts.day
df['DayOfWeek'] = join_parts.day_name()  # weekday name rather than a number

print(
    "=== Date transformations ===",
    df[['Name', 'Join_Date', 'Year', 'Month', 'Day', 'DayOfWeek']],
    sep="\n",
)

# Years of experience, measured against a fixed reference date
# (the current date is assumed to be 2024-01-01)
current_date = pd.Timestamp('2024-01-01')
elapsed = current_date - df['Join_Date']
# Whole elapsed days divided by 365.25 days per year
df['Years_Experience'] = elapsed.dt.days / 365.25
print(
    "\n=== Years of experience ===",
    df[['Name', 'Join_Date', 'Years_Experience']],
    sep="\n",
)



## Data Reshaping


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Pivot example: start from long-format sales data,
# one (Date, Product, Sales) tuple per observation
sales_rows = [
    ('2024-01-01', 'A', 100),
    ('2024-01-01', 'B', 150),
    ('2024-01-02', 'A', 120),
    ('2024-01-02', 'B', 180),
]
df_pivot_data = pd.DataFrame(sales_rows, columns=['Date', 'Product', 'Sales'])

print("=== Original DataFrame ===", df_pivot_data, sep="\n")

# Long -> wide: one row per Date, one column per Product, cells hold Sales
df_pivot = df_pivot_data.pivot(
    index='Date',
    columns='Product',
    values='Sales',
)
print("\n=== Pivoted DataFrame ===", df_pivot, sep="\n")

# Melt example: wide -> long again. reset_index() turns the Date index back
# into an ordinary column so it can be used as the identifier variable.
df_melted = pd.melt(
    df_pivot.reset_index(),
    id_vars='Date',
    var_name='Product',
    value_name='Sales',
)
print("\n=== Melted DataFrame ===", df_melted, sep="\n")

