In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Display configuration: print every DataFrame as one unwrapped grid.
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.width = None  # let pandas detect the available width
pd.options.display.max_colwidth = None  # never truncate the contents of a cell
pd.options.display.expand_frame_repr = False  # do not wrap wide frames across lines

# Location of the input CSV, relative to the current working directory.
csv_path = Path("./train.csv")


In [2]:
# Load the full dataset, then keep only its first 20 rows for the demos below.
df = pd.read_csv(csv_path)
data_portion = df.head(20)
# sep="\n" reproduces the line break that separate print calls would emit.
print("Original data (first 20 rows):\n\n", data_portion, sep="\n")


Original data (first 20 rows):


    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0             1         0       3                                  Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1             2         1       1      Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2             3         1       3                                   Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3             4         1       1             Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4             5         0       3                                 Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
5

In [3]:
# sort_index orders by the index labels: axis=0 (the default used here) sorts the
# row index, axis=1 would sort the column names. A new DataFrame is returned.
sorted_by_index = data_portion.sort_index(ascending=False)
print("\nSorted by index (descending):", sorted_by_index, sep="\n")



Sorted by index (descending):
    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
19           20         1       3                                  Masselmani, Mrs. Fatima  female   NaN      0      0              2649   7.2250   NaN        C
18           19         0       3  Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)  female  31.0      1      0            345763  18.0000   NaN        S
17           18         1       2                             Williams, Mr. Charles Eugene    male   NaN      0      0            244373  13.0000   NaN        S
16           17         0       3                                     Rice, Master. Eugene    male   2.0      4      1            382652  29.1250   NaN        Q
15           16         1       2                         Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0      0            248706  16.0000   NaN        S
14 

In [4]:
# Order the rows by their 'Survived' value, highest first. Pass a list of column
# names as the first argument for hierarchical (multi-key) sorting.
sorted_by_survived = data_portion.sort_values("Survived", ascending=False)
print("\nSorted by 'Survived' column (descending):", sorted_by_survived, sep="\n")



Sorted by 'Survived' column (descending):
    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
10           11         1       3                          Sandstrom, Miss. Marguerite Rut  female   4.0      1      1           PP 9549  16.7000    G6        S
8             9         1       3        Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0      2            347742  11.1333   NaN        S
17           18         1       2                             Williams, Mr. Charles Eugene    male   NaN      0      0            244373  13.0000   NaN        S
15           16         1       2                         Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0      0            248706  16.0000   NaN        S
11           12         1       1                                 Bonnell, Miss. Elizabeth  female  58.0      0      0            113783  26.5500  C103 

In [5]:
# Positional row slicing. The end of the slice is exclusive, so 2:10 yields the
# rows at positions 2 through 9 (unlike .loc, whose label slices include the end).
# .iloc makes the positional intent explicit instead of relying on plain [] slicing.
sliced_data = data_portion.iloc[2:10]
print("\nSliced data (rows 2 to 9):")
print(sliced_data)



Sliced data (rows 2 to 10):
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
5            6         0       3                                   Moran, Mr. James    male   NaN      0      0            330877   8.4583   NaN        Q
6            7         0       1                            McCarthy, Mr. Timothy J    male  54.0      0      0             17463  51.8625   E46        S
7            8         0       3               

In [6]:
# Column selection by label: a list of names returns a DataFrame holding just
# those columns, with every row kept.
label_sliced_data = data_portion.loc[:, ["Survived", "Pclass"]]
print("\nLabel sliced data (Survived and Pclass columns):", label_sliced_data, sep="\n")



Label sliced data (Survived and Pclass columns):
    Survived  Pclass
0          0       3
1          1       1
2          1       3
3          1       1
4          0       3
5          0       3
6          0       1
7          0       3
8          1       3
9          1       2
10         1       3
11         1       1
12         0       3
13         0       3
14         0       3
15         1       2
16         0       3
17         1       2
18         0       3
19         1       3


In [7]:
# .loc selects rows and columns by label in a single step. Label slices include
# both ends, so 1:10 returns the rows labelled 1 through 10.
row_and_col_slice = data_portion.loc[1:10, ["Survived", "Pclass"]]
print("\nRow and column slice (rows 1-10, Survived and Pclass):", row_and_col_slice, sep="\n")



Row and column slice (rows 1-10, Survived and Pclass):
    Survived  Pclass
1          1       1
2          1       3
3          1       1
4          0       3
5          0       3
6          0       1
7          0       3
8          1       3
9          1       2
10         1       3


In [8]:
# .at looks up exactly one cell by row label and column label and hands back a
# plain scalar rather than a Series or DataFrame.
selection = data_portion.at[2, "Pclass"]
print("\nSingle value selection (row 2, Pclass column):", selection, sep="\n")



Single value selection (row 2, Pclass column):
3


In [9]:
# .iloc selects by integer position: rows 2 and 3 (written as the slice 2:4,
# end exclusive) and the columns at positions 1 and 4. Lists of positions work too.
positional_selection = data_portion.iloc[2:4, [1, 4]]
print("\nPositional selection (rows 2 and 3, columns 1 and 4):", positional_selection, sep="\n")



Positional selection (rows 2 and 3, columns 1 and 4):
   Survived     Sex
2         1  female
3         1  female


In [10]:
# NOTE: .loc addresses rows/columns by label, .iloc by integer position
# (think numpy indices, independent of the actual index labels).

# Keep only the ROWS whose 'Survived' value equals 1.
filtered_rows = data_portion.loc[data_portion["Survived"].eq(1)]
print("\n\nFiltered rows (where Survived == 1):", filtered_rows, sep="\n")




Filtered rows (where Survived == 1):
    PassengerId  Survived  Pclass                                                 Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
1             2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2             3         1       3                               Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3             4         1       1         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
8             9         1       3    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0      2            347742  11.1333   NaN        S
9            10         1       2                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1      0            237736  30.0708   NaN        C
10           11    

In [11]:
# Keep only the numeric columns, then select the VALUES that are >= 1.
# Note that the condition is built from no_strings but applied to data_portion:
# cells where the mask is False, and columns absent from the mask, become NaN.
# select_dtypes is used instead of comparing dtypes against `object` so that
# string columns stored with a dedicated string dtype are excluded as well.
no_strings = data_portion.select_dtypes(include="number")
print("\n\nNumeric columns only:")
print(no_strings)

selected_values = data_portion[no_strings >= 1]
print("\n\nSelected values (where numeric values >= 1):")
print(selected_values)




Numeric columns only:
    PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare
0             1         0       3  22.0      1      0   7.2500
1             2         1       1  38.0      1      0  71.2833
2             3         1       3  26.0      0      0   7.9250
3             4         1       1  35.0      1      0  53.1000
4             5         0       3  35.0      0      0   8.0500
5             6         0       3   NaN      0      0   8.4583
6             7         0       1  54.0      0      0  51.8625
7             8         0       3   2.0      3      1  21.0750
8             9         1       3  27.0      0      2  11.1333
9            10         1       2  14.0      1      0  30.0708
10           11         1       3   4.0      1      1  16.7000
11           12         1       1  58.0      0      0  26.5500
12           13         0       3  20.0      0      0   8.0500
13           14         0       3  39.0      1      5  31.2750
14           15         0      

In [14]:
# Build a separate frame with missing values for the demos below; astype returns
# a new DataFrame, so data_portion itself is left untouched.
# The columns at positions 1 and 2 (Survived, Pclass) hold integers, which cannot
# store NaN, so they are converted to float explicitly instead of relying on the
# assignment to upcast them silently (recent pandas versions deprecate that).
missing_data_portion = data_portion.astype({"Survived": "float64", "Pclass": "float64"})
missing_data_portion.iloc[0:2, 1:3] = np.nan
print("\n\nData with missing values added:")
print(missing_data_portion)



Data with missing values added:
    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0             1       NaN     NaN                                  Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1             2       NaN     NaN      Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2             3       1.0     3.0                                   Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3             4       1.0     1.0             Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4             5       0.0     3.0                                 Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S


In [15]:
# Rows: discard every row that contains at least one missing value
# (axis=0 and how="any" are the dropna defaults).
dropped_missing = missing_data_portion.dropna()

# Columns: discard every column that contains at least one missing value.
dropped_missing_cols = missing_data_portion.dropna(axis="columns")

# how="all" would drop only rows/columns that are entirely missing; thresh=N
# keeps those that have at least N non-missing values.
print("\n\nDropped rows with missing data:", dropped_missing, sep="\n")
print("\n\nDropped columns with missing data:", dropped_missing_cols, sep="\n")



Dropped rows with missing data:
    PassengerId  Survived  Pclass                                          Name     Sex   Age  SibSp  Parch   Ticket     Fare Cabin Embarked
3             4       1.0     1.0  Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0   113803  53.1000  C123        S
6             7       0.0     1.0                       McCarthy, Mr. Timothy J    male  54.0      0      0    17463  51.8625   E46        S
10           11       1.0     3.0               Sandstrom, Miss. Marguerite Rut  female   4.0      1      1  PP 9549  16.7000    G6        S
11           12       1.0     1.0                      Bonnell, Miss. Elizabeth  female  58.0      0      0   113783  26.5500  C103        S


Dropped columns with missing data:
    PassengerId                                                     Name     Sex  SibSp  Parch            Ticket     Fare Embarked
0             1                                  Braund, Mr. Owen Harris    male      1      

In [16]:
# Replace every missing value with 0; this applies to all columns, so text
# columns such as Cabin receive 0 as well.
filled_data = missing_data_portion.fillna(0)
print("\n\nFilled missing data with 0:", filled_data, sep="\n")



Filled missing data with 0:
    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0             1       0.0     0.0                                  Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500     0        S
1             2       0.0     0.0      Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2             3       1.0     3.0                                   Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250     0        S
3             4       1.0     1.0             Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4             5       0.0     3.0                                 Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500     0        S
5   

In [17]:
# Boolean frame of the same shape as the input: True wherever a value is missing.
masked_data = missing_data_portion.isnull()
print("\n\nMasked data (True for missing values):", masked_data, sep="\n")



Masked data (True for missing values):
    PassengerId  Survived  Pclass   Name    Sex    Age  SibSp  Parch  Ticket   Fare  Cabin  Embarked
0         False      True    True  False  False  False  False  False   False  False   True     False
1         False      True    True  False  False  False  False  False   False  False  False     False
2         False     False   False  False  False  False  False  False   False  False   True     False
3         False     False   False  False  False  False  False  False   False  False  False     False
4         False     False   False  False  False  False  False  False   False  False   True     False
5         False     False   False  False  False   True  False  False   False  False   True     False
6         False     False   False  False  False  False  False  False   False  False  False     False
7         False     False   False  False  False  False  False  False   False  False   True     False
8         False     False   False  False  False  F

In [22]:
# Add 10 to every numeric column and pass all other columns through unchanged.
# transform calls the lambda once per column. The pandas dtype helpers are used
# because np.issubdtype cannot interpret pandas extension dtypes; booleans are
# excluded explicitly to match np.number, which does not cover them.
transformed_data = data_portion.transform(
    lambda col: col + 10
    if pd.api.types.is_numeric_dtype(col.dtype) and not pd.api.types.is_bool_dtype(col.dtype)
    else col
)
print("\n\nTransformed data (numeric values increased by 10):")
print(transformed_data)



Transformed data (numeric values multiplied by 2):
    PassengerId  Survived  Pclass                                                     Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            11        10      13                                  Braund, Mr. Owen Harris    male  32.0     11     10         A/5 21171  17.2500   NaN        S
1            12        11      11      Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  48.0     11     10          PC 17599  81.2833   C85        C
2            13        11      13                                   Heikkinen, Miss. Laina  female  36.0     10     10  STON/O2. 3101282  17.9250   NaN        S
3            14        11      11             Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  45.0     11     10            113803  63.1000  C123        S
4            15        10      13                                 Allen, Mr. William Henry    male  45.0     10     10            373450  18.0

In [25]:
# Split the rows by their 'Survived' value and average 'Pclass' within each group.
# To pick several columns after groupby, index with a list ([["A", "B"]]);
# a bare tuple (["A", "B"]) raises a ValueError asking for a list instead.
grouped_data = data_portion.groupby("Survived")["Pclass"].agg("mean")
print("\n\nGrouped data (mean Pclass by Survived):", grouped_data, sep="\n")

ValueError: Cannot subset columns with a tuple with more than one element. Use a list instead.

In [27]:
# Group by several columns at once: one group per (Sex, Pclass) combination,
# then add up 'Survived' (0/1) within each group. The result is a Series with a
# two-level index.
multi_grouped_data = data_portion.groupby(["Sex", "Pclass"])["Survived"].sum()
print("\n\nMulti-grouped data (sum of Survived by Sex and Pclass):")
print(multi_grouped_data)



Multi-grouped data (sum of Survived by Age and Pclass):
Sex     Pclass
female  1         3
        2         2
        3         4
male    1         0
        2         1
        3         0
Name: Survived, dtype: int64
