In [2]:
# Sanity-check cell: writes a fixed greeting to the cell output.
print("Hi")

Hi


In [9]:
import pandas as pd
import seaborn as sns

# Load the Titanic dataset through seaborn's dataset loader; the method chain
# later in this cell starts from `df`.
# NOTE(review): presumably needs network access or a local seaborn cache on
# first use - confirm for offline runs.
df = sns.load_dataset("titanic")

# Pass-through inspection helper meant to be dropped into a `.pipe(...)` chain
def print_details(dff, message1=""):
    """
    Show a label, the shape, and the first row of a DataFrame, then return the input.

    Objects that are not a pandas DataFrame are not inspected; a one-line
    notice is printed for them instead. In both cases the argument is
    returned unchanged so the surrounding chain can continue.
    """
    # Guard clause: anything that is not a DataFrame only gets the notice.
    if not isinstance(dff, pd.DataFrame):
        print(f"{message1} - This is a GroupBy object.")
        return dff

    first_row = dff.head(1)
    display(message1, dff.shape, first_row)
    print()  # blank line separating consecutive inspections
    return dff

# Applying method chaining on Titanic dataset.
# Each commented-out `.pipe(print_details, ...)` line can be re-enabled to
# inspect the intermediate frame at that point of the chain.
result = (
    df
    #.pipe(print_details, "Original DataFrame")
    .dropna(subset=['age', 'embark_town'])  # Remove missing values
    #.pipe(print_details, "Dropped missing values for 'age' and 'embark_town'")
    .query("age > 18")  # Filter only adults
    #.pipe(print_details, "Filtered only adults")
    # Average fare per (embark_town, sex), broadcast back onto every row.
    # transform('mean') replaces groupby(...).apply(lambda g: g.assign(...)),
    # which operated on the grouping columns (deprecated pandas behaviour).
    .assign(
        avg_fare=lambda adults: adults.groupby(['embark_town', 'sex'])['fare'].transform('mean')
    )
    #.pipe(print_details, "Assigned average fare per group")
    .sort_values(['embark_town', 'sex'])  # Keep rows arranged group by group
    .reset_index(drop=True)  # Fresh 0..n-1 index after the reordering
    #.pipe(print_details, "Reset index after sorting by group")
    # Join on a non-unique key: every adult row is paired with every passenger
    # row sharing its embark_town, so the row count grows. The overlapping
    # 'class' and 'fare' columns receive the '_grouped' / '_original' suffixes.
    .merge(df[['embark_town', 'class', 'fare']], on='embark_town', how='inner', suffixes=('_grouped', '_original'))
    #.pipe(print_details, "Merged on 'embark_town' with 'class' and 'fare'")
    .drop(columns=['age'])  # Drop the 'age' column
    #.pipe(print_details, "Dropped 'age' column")
    .rename(columns={'fare_grouped': 'fare'})  # Rename column
    #.pipe(print_details, "Renamed 'fare_grouped' to 'fare'")
    .astype({'avg_fare': 'float'})  # Ensure correct data type
    #.pipe(print_details, "Casted 'avg_fare' column to float")
)

# Apply string operations (uppercase) only to object (string) columns
for col in result.select_dtypes(include=['object']).columns:
    result[col] = result[col].str.upper()

# Save to CSV
result.to_csv("titanic_chain_rule_example.csv", index=False)

# Print the result
print(result.head())


  .apply(lambda group: group.assign(avg_fare=group['fare'].mean()))  # Compute avg fare


   survived  pclass     sex  sibsp  parch     fare embarked class_grouped  \
0         1       1  FEMALE      1      0  71.2833        C         First   
1         1       1  FEMALE      1      0  71.2833        C         First   
2         1       1  FEMALE      1      0  71.2833        C         First   
3         1       1  FEMALE      1      0  71.2833        C         First   
4         1       1  FEMALE      1      0  71.2833        C         First   

     who  adult_male deck embark_town alive  alone   avg_fare class_original  \
0  WOMAN       False    C   CHERBOURG   YES  False  96.342988          First   
1  WOMAN       False    C   CHERBOURG   YES  False  96.342988         Second   
2  WOMAN       False    C   CHERBOURG   YES  False  96.342988          Third   
3  WOMAN       False    C   CHERBOURG   YES  False  96.342988          Third   
4  WOMAN       False    C   CHERBOURG   YES  False  96.342988          First   

   fare_original  
0        71.2833  
1        30.0708  

In [15]:
import pandas as pd
import os

# Creating a small dataset: one tuple per record, transposed into columns
_student_rows = [
    ('Ali', 'Math', 85, 'Tehran'),
    ('Sara', 'Physics', 90, 'Mashhad'),
    ('Ali', 'Math', 85, 'Tehran'),
    ('Reza', 'Chemistry', 88, 'Shiraz'),
    ('Mina', 'Math', 92, 'Tehran'),
    ('Sara', 'Physics', 90, 'Mashhad'),
    ('Reza', 'Math', 88, 'Shiraz'),
    ('Omid', 'Chemistry', 95, 'Mashhad'),
]
data_dict = {
    column: list(values)
    for column, values in zip(('Student', 'Subject', 'Score', 'City'), zip(*_student_rows))
}

df = pd.DataFrame(data_dict)

def print_details(dff, message1=""):
    """
    Description:
        Writes a label, the dataframe's shape, and its first row to standard
        output (one item per line), then hands the same dataframe back so the
        call can sit inside a `.pipe(...)` chain.
    Parameters:
        dff (dataframe): frame to inspect; it is only read here
        message1 (str): label printed before the details
    Returns:
        dff (dataframe): the object that was passed in
    """
    print(message1, dff.shape, dff.head(1), sep="\n")
    return dff

# Chain rule (Chaining): every step returns a new DataFrame, so the pipeline
# reads top to bottom. Commented-out `.pipe(print_details, ...)` lines can be
# re-enabled to inspect the intermediate frame at that step.
df_processed = (
    df
    # Group 1: Basic Operations (head, tail, info, describe)
    .head(4)  # First 4 rows
    .pipe(print_details, "First 4 rows of the DataFrame")

    # Group 2: Removing duplicates and creating new column
    .drop_duplicates()  # Remove duplicates
    #.pipe(print_details, "After dropping duplicates")

    # NOTE: 'Passed' is not part of the aggregation below, so it does not
    # reach the exported result; the step only demonstrates .assign.
    .assign(Passed=lambda x: x['Score'] > 85)  # Add new column 'Passed' based on Score
    #.pipe(print_details, "After adding 'Passed' column based on Score")

    # Group 3: Grouping and Aggregating Data
    .groupby('City').agg({'Score': 'mean'})  # Average Score per City
    #.pipe(print_details, "Grouped by City with average Score")

    # Group 4: Type Conversion
    .reset_index()  # Reset index after grouping
    .astype({'Score': 'float'})  # Type conversion for Score column
    #.pipe(print_details, "After type conversion")
)

# Saving the final DataFrame to CSV.
# Kept outside the chain: to_csv returns None when given a path, so ending the
# chain with it would bind df_processed to None instead of the DataFrame.
df_processed.to_csv("df_processed.csv", index=False)

# Load the processed DataFrame back from disk to check the exported file
df_processed = pd.read_csv("df_processed.csv")
print_details(df_processed, "Final Processed DataFrame:")


First 4 rows of the DataFrame
(4, 4)
  Student Subject  Score    City
0     Ali    Math     85  Tehran
Final Processed DataFrame:
(3, 2)
      City  Score
0  Mashhad   90.0


Unnamed: 0,City,Score
0,Mashhad,90.0
1,Shiraz,88.0
2,Tehran,85.0


In [30]:
import pandas as pd
import os

# Creating a small dataset: records are listed row-wise, then transposed
_columns = ('Student', 'Subject', 'Score', 'City')
_records = [
    ('Ali', 'Math', 85, 'Tehran'),
    ('Sara', 'Physics', 90, 'Mashhad'),
    ('Ali', 'Math', 85, 'Tehran'),
    ('Reza', 'Chemistry', 88, 'Shiraz'),
    ('Mina', 'Math', 92, 'Tehran'),
    ('Sara', 'Physics', 90, 'Mashhad'),
    ('Reza', 'Math', 88, 'Shiraz'),
    ('Omid', 'Chemistry', 95, 'Mashhad'),
]
data_dict = dict(zip(_columns, map(list, zip(*_records))))

df = pd.DataFrame(data_dict)



def print_details(dff, message1=""):
    """
    Description:
        Displays a message, the shape, and the first three rows of a dataframe,
        prints a blank line, and finally returns the dataframe unchanged.
    Parameters:
        dff (dataframe): frame to inspect
        message1 (str): label shown before the details
    Returns:
        dff (dataframe): the same object that was passed in
    """
    preview = dff.head(3)
    display(message1, dff.shape, preview)
    print()  # blank separator line after the displayed items
    return dff


# Chain rule (Chaining): every step returns a new DataFrame. Commented-out
# `.pipe(print_details, ...)` lines can be re-enabled to inspect a step.
df_processed = (
    df
    #.pipe(print_details, "Original DataFrame")
    .drop_duplicates()
    #.pipe(print_details, "Drop duplicate rows")
    # NOTE: 'Passed' is not used by the pivot below (values='Score'), so it
    # does not reach the exported result; the step only demonstrates .assign.
    .assign(Passed=lambda x: x['Score'] > 85)
    #.pipe(print_details, "Assign 'Passed' column based on Score")
    # Mean Score for every City (rows) x Subject (columns) combination;
    # combinations with no records come out as NaN.
    .pivot_table(index='City', columns='Subject', values='Score', aggfunc='mean')
    #.pipe(print_details, "Pivot table to see average scores per city and subject")
    .reset_index()
    .astype({'Math': 'float', 'Physics': 'float', 'Chemistry': 'float'})
    # .pipe(print_details, "Ensure type casting for pivot table columns")
)

# Export is a separate statement: to_csv returns None when given a path, so
# ending the chain with it would bind df_processed to None.
df_processed.to_csv("df_processed.csv", index=False)

# Read the exported file back and show what was written
df_processed = pd.read_csv("df_processed.csv")
display("Final Processed DataFrame:", df_processed.shape, df_processed.head(3))
print()


'Final Processed DataFrame:'

(3, 4)

Unnamed: 0,City,Chemistry,Math,Physics
0,Mashhad,95.0,,90.0
1,Shiraz,88.0,88.0,
2,Tehran,,88.5,





In [6]:
# Creating a new dataset with missing values for demonstration
import numpy as np
import pandas as pd
data = {
    'Age': [25, np.nan, 30, 22, 28, np.nan, 35],
    'City': ['Tehran', 'Shiraz', np.nan, 'Isfahan', 'Tabriz', 'Tehran', np.nan],
    'Income': [5000, 6000, np.nan, 4500, 5500, 7000, np.nan],
    'Gender': ['Male', np.nan, 'Male', 'Female', 'Female', np.nan, 'Female']
}

df_missing = pd.DataFrame(data)

# Untouched copy of the data as it was before any imputation
df_missing_before = df_missing.copy()

# 1. Statistical Imputation (Mean for numeric columns)
df_missing['Age'] = df_missing['Age'].fillna(df_missing['Age'].mean())
df_missing['Income'] = df_missing['Income'].fillna(df_missing['Income'].mean())

# 2. Forward Filling for 'City' column: each gap takes the previous row's value.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df_missing['City'] = df_missing['City'].ffill()

# 3. Custom Imputation for 'Gender' column (impute with 'Unknown')
df_missing['Gender'] = df_missing['Gender'].fillna('Unknown')

# Show the data after handling missing values
# (the pre-imputation copy remains available as df_missing_before)
print("\nModified Data (After Handling Missing Values):")
print(df_missing)


Modified Data (After Handling Missing Values):
    Age     City  Income   Gender
0  25.0   Tehran  5000.0     Male
1  28.0   Shiraz  6000.0  Unknown
2  30.0   Shiraz  5600.0     Male
3  22.0  Isfahan  4500.0   Female
4  28.0   Tabriz  5500.0   Female
5  28.0   Tehran  7000.0  Unknown
6  35.0   Tehran  5600.0   Female


  df_missing['City'] = df_missing['City'].fillna(method='ffill')


In [12]:
pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m5.5 MB/s[0m eta [36m

In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Creating a dataset with missing values
data_simple = {
    'Age': [25, np.nan, 30, 22, np.nan, 35, 40],
    'Income': [5000, 6000, np.nan, 4500, 5500, np.nan, 8000],
    'City': ['Tehran', 'Shiraz', 'Isfahan', np.nan, 'Tabriz', 'Tehran', 'Mashhad']
}
df_simple = pd.DataFrame(data_simple)

# 1. Interpolation: fill the gaps in 'Age' first so that every row has a
#    usable predictor value for the regression below
df_simple['Age'] = df_simple['Age'].interpolate(method='linear')

# 2. Predictive imputation for 'Income'
# Training data: only the rows whose 'Income' is known
df_simple_notna = df_simple.dropna(subset=['Income'])
X, y = df_simple_notna[['Age']], df_simple_notna['Income']  # predictor (Age), target (Income)

# Fit the linear regression model (fit() hands back the fitted estimator)
model = LinearRegression().fit(X, y)

# Rows still lacking 'Income', and the model's estimate for each of them
income_missing = df_simple[df_simple['Income'].isna()]
predicted_income = model.predict(income_missing[['Age']])

# Write the estimates into exactly those rows
df_simple.loc[income_missing.index, 'Income'] = predicted_income

print(df_simple)


    Age       Income     City
0  25.0  5000.000000   Tehran
1  27.5  6000.000000   Shiraz
2  30.0  6071.124134  Isfahan
3  22.0  4500.000000      NaN
4  28.5  5500.000000   Tabriz
5  35.0  7039.424614   Tehran
6  40.0  8000.000000  Mashhad


In [15]:
import numpy as np
import pandas as pd

# Step 1: Create a dataset with missing values
data = {
    'Age': [25, np.nan, 30, 22, np.nan, 35, 40],
    'Income': [5000, 6000, np.nan, 4500, 5500, np.nan, 8000],
    'City': ['Tehran', 'Shiraz', 'Isfahan', np.nan, 'Tabriz', 'Tehran', 'Mashhad']
}

df = (
    pd.DataFrame(data)
    .assign(
        # Step 2: Missing 'City' values become their own category
        City=lambda frame: frame['City'].fillna('Missing'),
        # Step 3: 0/1 flags recording which numeric values were missing;
        # taken before the fill below, while the NaNs are still present
        Age_Missing_Flag=lambda frame: frame['Age'].isna().astype(int),
        Income_Missing_Flag=lambda frame: frame['Income'].isna().astype(int),
    )
    .assign(
        # Step 4: Fill the numeric gaps with each column's median (optional)
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Income=lambda frame: frame['Income'].fillna(frame['Income'].median()),
    )
)

# Print the final dataset
print(df)


    Age  Income     City  Age_Missing_Flag  Income_Missing_Flag
0  25.0  5000.0   Tehran                 0                    0
1  30.0  6000.0   Shiraz                 1                    0
2  30.0  5500.0  Isfahan                 0                    1
3  22.0  4500.0  Missing                 0                    0
4  30.0  5500.0   Tabriz                 1                    0
5  35.0  5500.0   Tehran                 0                    1
6  40.0  8000.0  Mashhad                 0                    0


In [16]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer

# Step 1: Create a dataset with missing values
data = {
    'Age': [25, np.nan, 30, 22, np.nan, 35, 40],
    'Income': [5000, 6000, np.nan, 4500, 5500, np.nan, 8000],
    'Spending_Score': [65, 80, 70, np.nan, 60, 75, np.nan]
}
df = pd.DataFrame(data)

# Steps 2 and 3: configure both imputers
knn_imputer = KNNImputer(n_neighbors=2)  # Use 2 nearest neighbors
iter_imputer = IterativeImputer(max_iter=10, random_state=42)  # Regression-based imputation

# Run each imputer on the same input; fit_transform's result is wrapped in a
# DataFrame again so the original column labels are kept
df_knn_imputed, df_iter_imputed = (
    pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    for imputer in (knn_imputer, iter_imputer)
)

# Print the original and imputed DataFrames
for heading, frame in (
    ("Original DataFrame with Missing Values:\n", df),
    ("\nKNN Imputed DataFrame:\n", df_knn_imputed),
    ("\nMultivariate (Iterative) Imputed DataFrame:\n", df_iter_imputed),
):
    print(heading, frame)


Original DataFrame with Missing Values:
     Age  Income  Spending_Score
0  25.0  5000.0            65.0
1   NaN  6000.0            80.0
2  30.0     NaN            70.0
3  22.0  4500.0             NaN
4   NaN  5500.0            60.0
5  35.0     NaN            75.0
6  40.0  8000.0             NaN

KNN Imputed DataFrame:
     Age  Income  Spending_Score
0  25.0  5000.0            65.0
1  32.5  6000.0            80.0
2  30.0  4750.0            70.0
3  22.0  4500.0            72.5
4  32.5  5500.0            60.0
5  35.0  7000.0            75.0
6  40.0  8000.0            72.5

Multivariate (Iterative) Imputed DataFrame:
          Age       Income  Spending_Score
0  25.000000  5000.000000       65.000000
1  29.854751  6000.000000       80.000000
2  30.000000  6028.496285       70.000000
3  22.000000  4500.000000       63.519291
4  27.310712  5500.000000       60.000000
5  35.000000  7009.262620       75.000000
6  40.000000  8000.000000       79.634126
