In [8]:
import pandas as pd
import numpy as np
import yfinance as yf

In [9]:
# Load the Titanic dataset from a GitHub URL
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df_titanic = pd.read_csv(url)
print("Step 1a: Titanic dataset loaded with", df_titanic.shape[0], "passengers and", df_titanic.shape[1], "columns.")

Step 1a: Titanic dataset loaded with 891 passengers and 12 columns.


In [10]:
# Fetch Apple stock data from 2020 to 2023 using yfinance for time series practice
df_stock = yf.download('AAPL', start='2020-01-01', end='2023-01-01', progress=False, auto_adjust=True)
print("Step 1b: Apple stock data loaded with", df_stock.shape[0], "days and", df_stock.shape[1], "columns.")

Step 1b: Apple stock data loaded with 756 days and 5 columns.


In [49]:
# --- Section 2: Data Inspection ---
# Display the first 5 rows of the Titanic dataset to understand its structure
pd.set_option('display.width', 1000)
print("\nStep 2a: First 5 rows of Titanic dataset:")
print(df_titanic.tail())


Step 2a: First 5 rows of Titanic dataset:
     PassengerId  Survived  Pclass                                      Name   Age  SibSp  Parch      Ticket   Fare     AgeGroup  Sex_male  Embarked_Q  Embarked_S
886          887         0       2                     Montvila, Rev. Juozas  27.0      0      0      211536  13.00  Young Adult      True       False        True
887          888         1       1              Graham, Miss. Margaret Edith  19.0      0      0      112053  30.00  Young Adult     False       False        True
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"  28.0      1      2  W./C. 6607  23.45  Young Adult     False       False        True
889          890         1       1                     Behr, Mr. Karl Howell  26.0      0      0      111369  30.00  Young Adult      True       False       False
890          891         0       3                       Dooley, Mr. Patrick  32.0      0      0      370376   7.75  Young Adult      True    

In [48]:
pd.set_option('display.width', 1000)
print("\nStep 2b: First 5 rows of Apple stock data:")
print(df_stock.tail())


Step 2b: First 5 rows of Apple stock data:
Price            Close        High         Low        Open    Volume
Ticker            AAPL        AAPL        AAPL        AAPL      AAPL
Date                                                                
2022-12-23  130.173767  130.726603  127.982155  129.245785  63814900
2022-12-27  128.367188  129.729545  127.073942  129.699930  69007800
2022-12-28  124.428215  129.354401  124.260391  128.011792  85438400
2022-12-29  127.952568  128.811438  126.096612  126.353282  75703700
2022-12-30  128.268463  128.288212  125.800440  126.767912  77034200


In [16]:
print(df_titanic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [17]:
print(df_stock.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 756 entries, 2020-01-02 to 2022-12-30
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, AAPL)   756 non-null    float64
 1   (High, AAPL)    756 non-null    float64
 2   (Low, AAPL)     756 non-null    float64
 3   (Open, AAPL)    756 non-null    float64
 4   (Volume, AAPL)  756 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 35.4 KB
None


In [29]:
# Exploratory Data Analysis (EDA)
print("\nStep 2c: Titanic Summary Statistics:")
print(df_titanic.describe())
print(df_titanic.shape)


Step 2c: Titanic Summary Statistics:
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.361582    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   13.019697    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   22.000000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   35.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
(891, 12)


In [28]:
print("\nUnique values in each column of Titanic dataset:")
print(df_titanic.nunique())


Unique values in each column of Titanic dataset:
PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64


In [25]:
print("\nStep 2d: Apple Stock Summary Statistics:")
print(df_stock.describe())

print("\nShape:")
print(df_stock.shape)

print("\nUnique values in each column of AAPL Stock dataset:")
print(df_stock.nunique())


Step 2d: Apple Stock Summary Statistics:
Price        Close        High         Low        Open        Volume
Ticker        AAPL        AAPL        AAPL        AAPL          AAPL
count   756.000000  756.000000  756.000000  756.000000  7.560000e+02
mean    127.633103  129.181520  125.971434  127.555152  1.120920e+08
std      30.421463   30.698899   30.139550   30.449563  5.602586e+07
min      54.378582   55.379535   51.528416   55.277744  3.519590e+07
25%     112.709518  114.204950  111.251833  112.889681  7.636470e+07
50%     133.010811  134.143863  131.050657  132.832314  9.493580e+07
75%     148.221516  149.388626  146.276881  147.588766  1.296327e+08
max     178.645645  179.558473  175.809076  179.254206  4.265100e+08

Shape:
(756, 5)

Unique values in each column of AAPL Stock dataset:
Price   Ticker
Close   AAPL      747
High    AAPL      756
Low     AAPL      756
Open    AAPL      756
Volume  AAPL      754
dtype: int64


In [31]:
print(df_titanic.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


In [27]:
# Filling values is a whole science in itself
median_age = df_titanic['Age'].median()
df_titanic['Age'] =  df_titanic['Age'].fillna(median_age)
print("Step 3b: Filled missing Age values with median age:", median_age)

Step 3b: Filled missing Age values with median age: 28.0


In [30]:
most_common_embarked = df_titanic['Embarked'].mode()[0]
print("Most common Embarked value:", most_common_embarked)
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(most_common_embarked)

Most common Embarked value: S


In [38]:
df_titanic = df_titanic.drop(columns=['Cabin'], axis=1, errors='ignore')
print("\nStep 3c: Dropped 'Cabin' column from Titanic dataset.")


Step 3c: Dropped 'Cabin' column from Titanic dataset.


In [37]:
df_titanic = df_titanic.drop_duplicates()
print("\nStep 3d: Dropped duplicate rows from Titanic dataset.")
print(df_titanic.shape)


Step 3d: Dropped duplicate rows from Titanic dataset.
(891, 11)


In [39]:
older_than_30 = df_titanic[df_titanic['Age'] > 30]
print("\nStep 3e: Passengers older than 30 years:")
print(older_than_30.head())


Step 3e: Passengers older than 30 years:
    PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch    Ticket     Fare Embarked
1             2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0  PC 17599  71.2833        C
3             4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0    113803  53.1000        S
4             5         0       3                           Allen, Mr. William Henry    male  35.0      0      0    373450   8.0500        S
6             7         0       1                            McCarthy, Mr. Timothy J    male  54.0      0      0     17463  51.8625        S
11           12         1       1                           Bonnell, Miss. Elizabeth  female  58.0      0      0    113783  26.5500        S


In [41]:
sorted_by_fare = df_titanic.sort_values(by='Fare', ascending=False)
print("\nStep 3f: Passengers sorted by Fare in descending order:")
print(sorted_by_fare.head())


Step 3f: Passengers sorted by Fare in descending order:
     PassengerId  Survived  Pclass                                Name     Sex   Age  SibSp  Parch    Ticket      Fare Embarked
679          680         1       1  Cardeza, Mr. Thomas Drake Martinez    male  36.0      0      1  PC 17755  512.3292        C
258          259         1       1                    Ward, Miss. Anna  female  35.0      0      0  PC 17755  512.3292        C
737          738         1       1              Lesurer, Mr. Gustave J    male  35.0      0      0  PC 17755  512.3292        C
88            89         1       1          Fortune, Miss. Mabel Helen  female  23.0      3      2     19950  263.0000        S
438          439         0       1                   Fortune, Mr. Mark    male  64.0      1      4     19950  263.0000        S


In [42]:
mean_fare_by_price = df_titanic.groupby('Pclass')['Fare'].mean()
print(mean_fare_by_price)

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64


In [44]:
# One-hot encode 'Sex' and 'Embarked' for ML model compatibility
df_titanic = pd.get_dummies(df_titanic, columns=['Sex', 'Embarked'], drop_first=True)
print("\nStep 5b: One-hot encoded 'Sex' and 'Embarked'. New columns:")
print(list(df_titanic.columns))


Step 5b: One-hot encoded 'Sex' and 'Embarked'. New columns:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'AgeGroup', 'Sex_male', 'Embarked_Q', 'Embarked_S']


In [46]:
bins = [0, 12, 18, 35, 60, 100]
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
df_titanic['AgeGroup'] = pd.cut(df_titanic['Age'], bins=bins, labels=labels, right=False)
print("\nStep 5c: Age groups added to Titanic dataset:")
print(df_titanic[['Age', 'AgeGroup']].head())


Step 5c: Age groups added to Titanic dataset:
    Age     AgeGroup
0  22.0  Young Adult
1  38.0        Adult
2  26.0  Young Adult
3  35.0        Adult
4  35.0        Adult


In [47]:
ticket_adjustment = pd.DataFrame({
    'Ticket': df_titanic['Ticket'].unique(),
    'Adjustment': np.random.uniform(-10, 10, size=len(df_titanic['Ticket'].unique()))
})
print("\nStep 9a: Ticket Adjustment DataFrame (first 5):")
print(ticket_adjustment.head())


Step 9a: Ticket Adjustment DataFrame (first 5):
             Ticket  Adjustment
0         A/5 21171    6.413895
1          PC 17599    8.840818
2  STON/O2. 3101282   -6.784584
3            113803    9.154605
4            373450    5.217589


In [None]:
# Read data into dataframes -> clean data -> wrangle data -> feature engineering
