In [1]:
# Import required libraries for data manipulation and fetching online data
import pandas as pd
import numpy as np
# For fetching stock data; students need to install yfinance
# !pip install yfinance  # Uncomment to install if needed
import yfinance as yf

In [2]:
# --- Section 1: Loading Data from Online Sources ---
# Load the Titanic dataset from a GitHub URL, a popular dataset for learning
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df_titanic = pd.read_csv(url)
print("Step 1a: Titanic dataset loaded with", df_titanic.shape[0], "passengers and", df_titanic.shape[1], "columns.")

Step 1a: Titanic dataset loaded with 891 passengers and 12 columns.


In [3]:
# Fetch Apple stock data from 2020 to 2023 using yfinance for time series practice
df_stock = yf.download('AAPL', start='2020-01-01', end='2023-01-01', progress=False)
print("Step 1b: Apple stock data loaded with", df_stock.shape[0], "days and", df_stock.shape[1], "columns.")

  df_stock = yf.download('AAPL', start='2020-01-01', end='2023-01-01', progress=False)


Step 1b: Apple stock data loaded with 756 days and 5 columns.


In [5]:
# --- Section 2: Data Inspection ---
# Display the first 5 rows of the Titanic dataset to understand its structure
pd.set_option('display.width', 1000)
print("\nStep 2a: First 5 rows of Titanic dataset:")
print(df_titanic.head())


Step 2a: First 5 rows of Titanic dataset:
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S


In [6]:
# Show DataFrame info (column names, data types, non-null counts)
print("\nStep 2b: Titanic DataFrame Info:")
print(df_titanic.info())


Step 2b: Titanic DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [7]:
# Display summary statistics for numerical columns (count, mean, std, etc.)
print("\nStep 2c: Titanic Summary Statistics:")
print(df_titanic.describe())


Step 2c: Titanic Summary Statistics:
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [8]:
# Display the first 5 rows of stock data to see time series structure
print("\nStep 2d: Apple Stock Data (First 5 rows):")
print(df_stock.head())


Step 2d: Apple Stock Data (First 5 rows):
Price           Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                             
2020-01-02  72.620834  72.681281  71.373211  71.627084  135480400
2020-01-03  71.914818  72.676447  71.689957  71.847118  146322800
2020-01-06  72.487846  72.526533  70.783248  71.034709  118387200
2020-01-07  72.146942  72.753823  71.926915  72.497529  108872000
2020-01-08  73.307510  73.609745  71.849533  71.849533  132079200


In [41]:
# Interactive: Ask students to input a column to explore
column_to_check = input("Step 2e: Enter a Titanic column name to see its unique values (e.g., 'Pclass'): ")
print(f"\nUnique values in {column_to_check}:")
print(df_titanic[column_to_check].unique())


Unique values in Pclass:
[3 1 2]


In [10]:
# --- Section 3: Data Cleaning ---
# Check for missing values in each column of the Titanic dataset
print("\nStep 3a: Missing Values in Titanic Dataset:")
print(df_titanic.isnull().sum())


Step 3a: Missing Values in Titanic Dataset:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [11]:
# Fill missing 'Age' values with the median age to preserve data distribution
median_age = df_titanic['Age'].median()
df_titanic['Age'] = df_titanic['Age'].fillna(median_age)
print("\nStep 3b: Missing 'Age' filled with median:", median_age)


Step 3b: Missing 'Age' filled with median: 28.0


In [12]:
# Fill missing 'Embarked' with the most common port (mode) to maintain consistency
most_common_embarked = df_titanic['Embarked'].mode()[0]
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(most_common_embarked)
print("Step 3c: Missing 'Embarked' filled with mode:", most_common_embarked)

Step 3c: Missing 'Embarked' filled with mode: S


In [13]:
# Drop 'Cabin' column due to excessive missing values, reducing noise
df_titanic = df_titanic.drop('Cabin', axis=1)
print("\nStep 3d: 'Cabin' column dropped due to many missing values.")


Step 3d: 'Cabin' column dropped due to many missing values.


In [14]:
# Verify no missing values remain after cleaning
print("Step 3e: Missing Values After Cleaning:")
print(df_titanic.isnull().sum())

Step 3e: Missing Values After Cleaning:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [15]:
# Check and remove duplicate rows in the Titanic dataset
print("\nStep 3f: Number of Duplicate Rows:", df_titanic.duplicated().sum())
df_titanic = df_titanic.drop_duplicates()
print("Step 3g: Duplicate rows removed. New shape:", df_titanic.shape)


Step 3f: Number of Duplicate Rows: 0
Step 3g: Duplicate rows removed. New shape: (891, 11)


In [16]:
# Clean stock data: Handle any missing values with forward fill
df_stock = df_stock.fillna(method='ffill')
print("\nStep 3h: Missing values in stock data filled with forward fill.")


Step 3h: Missing values in stock data filled with forward fill.


  df_stock = df_stock.fillna(method='ffill')


In [17]:
# --- Section 4: Data Transformation ---
# Filter passengers older than 30 to focus on a specific group
older_than_30 = df_titanic[df_titanic['Age'] > 30]
print("\nStep 4a: Passengers older than 30 (first 5):")
print(older_than_30.head())


Step 4a: Passengers older than 30 (first 5):
    PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch    Ticket     Fare Embarked
1             2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0  PC 17599  71.2833        C
3             4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0    113803  53.1000        S
4             5         0       3                           Allen, Mr. William Henry    male  35.0      0      0    373450   8.0500        S
6             7         0       1                            McCarthy, Mr. Timothy J    male  54.0      0      0     17463  51.8625        S
11           12         1       1                           Bonnell, Miss. Elizabeth  female  58.0      0      0    113783  26.5500        S


In [18]:
# Sort passengers by 'Fare' in descending order to identify high spenders
sorted_by_fare = df_titanic.sort_values(by='Fare', ascending=False)
print("\nStep 4b: Top 5 passengers by fare:")
print(sorted_by_fare[['Name', 'Fare']].head())


Step 4b: Top 5 passengers by fare:
                                   Name      Fare
258                    Ward, Miss. Anna  512.3292
737              Lesurer, Mr. Gustave J  512.3292
679  Cardeza, Mr. Thomas Drake Martinez  512.3292
88           Fortune, Miss. Mabel Helen  263.0000
27       Fortune, Mr. Charles Alexander  263.0000


In [19]:
# Group by 'Pclass' and calculate mean 'Fare' to analyze class-based pricing
mean_fare_by_class = df_titanic.groupby('Pclass')['Fare'].mean()
print("\nStep 4c: Mean Fare by Pclass:")
print(mean_fare_by_class)


Step 4c: Mean Fare by Pclass:
Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64


In [20]:
# Interactive: Group by a column chosen by students
group_by_col = input("Step 4d: Enter a column to group by (e.g., 'Sex'): ")
if group_by_col in df_titanic.columns:
    group_result = df_titanic.groupby(group_by_col)['Fare'].mean()
    print(f"\nMean Fare by {group_by_col}:")
    print(group_result)
else:
    print("Column not found. Try 'Sex', 'Pclass', or 'Embarked'.")


Mean Fare by Sex:
Sex
female    44.479818
male      25.523893
Name: Fare, dtype: float64


In [21]:
# --- Section 5: Feature Engineering ---
# Create 'FamilySize' feature by summing 'SibSp', 'Parch', and 1 (self)
df_titanic['FamilySize'] = df_titanic['SibSp'] + df_titanic['Parch'] + 1
print("\nStep 5a: Added 'FamilySize' feature. First 5 rows:")
print(df_titanic[['SibSp', 'Parch', 'FamilySize']].head())


Step 5a: Added 'FamilySize' feature. First 5 rows:
   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


In [22]:
# One-hot encode 'Sex' and 'Embarked' for ML model compatibility
df_titanic = pd.get_dummies(df_titanic, columns=['Sex', 'Embarked'], drop_first=True)
print("\nStep 5b: One-hot encoded 'Sex' and 'Embarked'. New columns:")
print(list(df_titanic.columns))


Step 5b: One-hot encoded 'Sex' and 'Embarked'. New columns:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S']


In [23]:
# Bin 'Age' into categories for better feature representation
bins = [0, 12, 18, 35, 60, 100]
labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
df_titanic['AgeGroup'] = pd.cut(df_titanic['Age'], bins=bins, labels=labels)
print("\nStep 5c: Added 'AgeGroup' feature. First 5 rows:")
print(df_titanic[['Age', 'AgeGroup']].head())


Step 5c: Added 'AgeGroup' feature. First 5 rows:
    Age     AgeGroup
0  22.0  Young Adult
1  38.0        Adult
2  26.0  Young Adult
3  35.0  Young Adult
4  35.0  Young Adult


In [24]:
# Interactive: Ask students to suggest a feature to scale
feature_to_scale = input("Step 5d: Enter a numerical feature to scale (e.g., 'Fare'): ")
if feature_to_scale in df_titanic.select_dtypes(include=[np.number]).columns:
    df_titanic[feature_to_scale + '_scaled'] = (df_titanic[feature_to_scale] - df_titanic[feature_to_scale].min()) / (df_titanic[feature_to_scale].max() - df_titanic[feature_to_scale].min())
    print(f"\nScaled {feature_to_scale} added. First 5 rows:")
    print(df_titanic[[feature_to_scale, feature_to_scale + '_scaled']].head())
else:
    print("Please enter a valid numerical column like 'Fare' or 'Age'.")


Scaled Fare added. First 5 rows:
      Fare  Fare_scaled
0   7.2500     0.014151
1  71.2833     0.139136
2   7.9250     0.015469
3  53.1000     0.103644
4   8.0500     0.015713


In [25]:
# --- Section 6: Time Series Manipulation ---
# Ensure stock data index is datetime for time series operations
df_stock.index = pd.to_datetime(df_stock.index)
print("\nStep 6a: Stock data index converted to datetime.")


Step 6a: Stock data index converted to datetime.


In [26]:
# Resample stock data to monthly frequency, calculating mean closing price
monthly_close = df_stock['Close'].resample('M').mean()
print("\nStep 6b: Monthly Average Closing Price (first 5):")
print(monthly_close.head())


Step 6b: Monthly Average Closing Price (first 5):
Ticker           AAPL
Date                 
2020-01-31  75.417396
2020-02-29  75.401414
2020-03-31  63.606265
2020-04-30  66.015845
2020-05-31  75.283144


  monthly_close = df_stock['Close'].resample('M').mean()


In [27]:
# Calculate 50-day moving average to smooth stock price trends
df_stock['MA50'] = df_stock['Close'].rolling(window=50).mean()
print("\nStep 6c: 50-Day Moving Average (first 5):")
print(df_stock[['Close', 'MA50']].head())


Step 6c: 50-Day Moving Average (first 5):
Price           Close MA50
Ticker           AAPL     
Date                      
2020-01-02  72.620834  NaN
2020-01-03  71.914818  NaN
2020-01-06  72.487846  NaN
2020-01-07  72.146942  NaN
2020-01-08  73.307510  NaN


In [28]:
# Interactive: Ask for a window size for moving average
window_size = int(input("Step 6d: Enter a window size for moving average (e.g., 30): "))
df_stock['MA' + str(window_size)] = df_stock['Close'].rolling(window=window_size).mean()
print(f"\n{window_size}-Day Moving Average (first 5):")
print(df_stock[['Close', 'MA' + str(window_size)]].head())


30-Day Moving Average (first 5):
Price           Close MA30
Ticker           AAPL     
Date                      
2020-01-02  72.620834  NaN
2020-01-03  71.914818  NaN
2020-01-06  72.487846  NaN
2020-01-07  72.146942  NaN
2020-01-08  73.307510  NaN


In [29]:
# --- Section 7: Applying Custom Functions ---
# Define a function to categorize fares into 'Cheap', 'Moderate', 'Expensive'
def fare_category(fare):
    if fare < 10:
        return 'Cheap'
    elif fare < 50:
        return 'Moderate'
    else:
        return 'Expensive'
    
    # Apply the fare_category function to create a new feature
df_titanic['FareCategory'] = df_titanic['Fare'].apply(fare_category)
print("\nStep 7a: Added 'FareCategory' feature. First 5 rows:")
print(df_titanic[['Fare', 'FareCategory']].head())


Step 7a: Added 'FareCategory' feature. First 5 rows:
      Fare FareCategory
0   7.2500        Cheap
1  71.2833    Expensive
2   7.9250        Cheap
3  53.1000    Expensive
4   8.0500        Cheap


In [30]:
# Interactive: Ask students to define a custom category function
def custom_category(value, threshold1, threshold2, labels):
    if value < threshold1:
        return labels[0]
    elif value < threshold2:
        return labels[1]
    else:
        return labels[2]

col_to_categorize = input("Step 7b: Enter a numerical column to categorize (e.g., 'Age'): ")
if col_to_categorize in df_titanic.select_dtypes(include=[np.number]).columns:
    thresh1 = float(input("Enter first threshold (e.g., 20): "))
    thresh2 = float(input("Enter second threshold (e.g., 40): "))
    custom_labels = input("Enter three labels separated by commas (e.g., 'Young,Middle,Old'): ").split(',')
    df_titanic[col_to_categorize + '_Custom'] = df_titanic[col_to_categorize].apply(custom_category, args=(thresh1, thresh2, custom_labels))
    print(f"\nCustom categories for {col_to_categorize}:")
    print(df_titanic[[col_to_categorize, col_to_categorize + '_Custom']].head())
else:
    print("Please enter a valid numerical column like 'Age' or 'Fare'.")



Custom categories for Age:
    Age Age_Custom
0  22.0     Middle
1  38.0     Middle
2  26.0     Middle
3  35.0     Middle
4  35.0     Middle


In [31]:
# --- Section 8: Advanced Pandas Operations ---
# Create a MultiIndex DataFrame by setting 'Pclass' and 'Sex_male' as indices
df_titanic_multi = df_titanic.set_index(['Pclass', 'Sex_male'])
print("\nStep 8a: MultiIndex DataFrame (first 5):")
print(df_titanic_multi.head())



Step 8a: MultiIndex DataFrame (first 5):
                 PassengerId  Survived                                               Name   Age  SibSp  Parch            Ticket     Fare  FamilySize  Embarked_Q  Embarked_S     AgeGroup  Fare_scaled FareCategory Age_Custom
Pclass Sex_male                                                                                                                                                                                                               
3      True                1         0                            Braund, Mr. Owen Harris  22.0      1      0         A/5 21171   7.2500           2       False        True  Young Adult     0.014151        Cheap     Middle
1      False               2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0          PC 17599  71.2833           2       False       False        Adult     0.139136    Expensive     Middle
3      False               3         1                            

In [32]:
# Use vectorized operation to calculate log of 'Fare' (adding 1 to avoid log(0))
df_titanic['LogFare'] = np.log1p(df_titanic['Fare'])
print("\nStep 8b: Log of Fare (first 5):")
print(df_titanic[['Fare', 'LogFare']].head())


Step 8b: Log of Fare (first 5):
      Fare   LogFare
0   7.2500  2.110213
1  71.2833  4.280593
2   7.9250  2.188856
3  53.1000  3.990834
4   8.0500  2.202765


In [33]:
# Optimize memory by converting 'AgeGroup' and 'FareCategory' to category type
df_titanic['AgeGroup'] = df_titanic['AgeGroup'].astype('category')
df_titanic['FareCategory'] = df_titanic['FareCategory'].astype('category')
print("\nStep 8c: Optimized 'AgeGroup' and 'FareCategory' to category type.")


Step 8c: Optimized 'AgeGroup' and 'FareCategory' to category type.


In [34]:
# --- Section 9: Merging and Joining ---
# Create a small DataFrame with ticket adjustments for merging demonstration
ticket_adjustment = pd.DataFrame({
    'Ticket': df_titanic['Ticket'].unique(),
    'Adjustment': np.random.uniform(-10, 10, size=len(df_titanic['Ticket'].unique()))
})
print("\nStep 9a: Ticket Adjustment DataFrame (first 5):")
print(ticket_adjustment.head())


Step 9a: Ticket Adjustment DataFrame (first 5):
             Ticket  Adjustment
0         A/5 21171    7.015100
1          PC 17599   -7.115407
2  STON/O2. 3101282   -5.268915
3            113803    8.259056
4            373450    7.132332


In [35]:
# Merge ticket adjustments with the main DataFrame
df_titanic = pd.merge(df_titanic, ticket_adjustment, on='Ticket', how='left')
print("\nStep 9b: Merged DataFrame (first 5):")
print(df_titanic[['Ticket', 'Adjustment']].head())


Step 9b: Merged DataFrame (first 5):
             Ticket  Adjustment
0         A/5 21171    7.015100
1          PC 17599   -7.115407
2  STON/O2. 3101282   -5.268915
3            113803    8.259056
4            373450    7.132332


In [36]:
# Calculate adjusted fare by adding the adjustment
df_titanic['AdjustedFare'] = df_titanic['Fare'] + df_titanic['Adjustment']
print("\nStep 9c: Adjusted Fare (first 5):")
print(df_titanic[['Fare', 'Adjustment', 'AdjustedFare']].head())


Step 9c: Adjusted Fare (first 5):
      Fare  Adjustment  AdjustedFare
0   7.2500    7.015100     14.265100
1  71.2833   -7.115407     64.167893
2   7.9250   -5.268915      2.656085
3  53.1000    8.259056     61.359056
4   8.0500    7.132332     15.182332


In [37]:
# Interactive: Ask students to merge with a custom DataFrame
custom_data = input("Step 9d: Enter a column to merge with (e.g., 'Name'), or skip: ")
if custom_data in df_titanic.columns:
    custom_df = pd.DataFrame({custom_data: df_titanic[custom_data].unique(), 'CustomValue': np.random.rand(len(df_titanic[custom_data].unique()))})
    df_titanic = pd.merge(df_titanic, custom_df, on=custom_data, how='left')
    print(f"\nMerged with custom DataFrame on {custom_data}:")
    print(df_titanic[[custom_data, 'CustomValue']].head())
else:
    print("Skipping custom merge or invalid column.")


Merged with custom DataFrame on Name:
                                                Name  CustomValue
0                            Braund, Mr. Owen Harris     0.320761
1  Cumings, Mrs. John Bradley (Florence Briggs Th...     0.333112
2                             Heikkinen, Miss. Laina     0.093483
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)     0.949955
4                           Allen, Mr. William Henry     0.991678


In [39]:
# --- Section 10: Preparing Data for Machine Learning ---
# Select numerical and encoded features for ML model training
df_titanic['Age_standardized'] = (df_titanic['Age'] - df_titanic['Age'].mean()) / df_titanic['Age'].std()
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'AgeGroup', 'Fare_scaled', 'Age_standardized']
X = df_titanic[features]
y = df_titanic['Survived']
print("\nStep 10a: Features selected for ML:", features)
print("Step 10b: Target variable:", y.name)


Step 10a: Features selected for ML: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'AgeGroup', 'Fare_scaled', 'Age_standardized']
Step 10b: Target variable: Survived


In [40]:
# Interactive: Ask students to add or remove a feature
feature_action = input("Step 10c: Add or remove a feature? (add/remove, then feature name, e.g., 'add AgeGroup'): ").split()
if feature_action[0].lower() == 'add' and feature_action[1] in df_titanic.columns and feature_action[1] not in features:
    features.append(feature_action[1])
    X = df_titanic[features]
    print(f"\nAdded {feature_action[1]} to features. New features:", features)
elif feature_action[0].lower() == 'remove' and feature_action[1] in features:
    features.remove(feature_action[1])
    X = df_titanic[features]
    print(f"\nRemoved {feature_action[1]} from features. New features:", features)
else:
    print("Invalid action or feature. Features unchanged:", features)

Invalid action or feature. Features unchanged: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'AgeGroup', 'Fare_scaled', 'Age_standardized']
