In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Pandas display settings: render every frame as one complete, unwrapped text grid.
_display_options = {
    'display.max_columns': None,         # never elide columns
    'display.width': None,               # let pandas detect the available width
    'display.max_colwidth': None,        # never truncate cell contents
    'display.expand_frame_repr': False,  # keep wide frames on one line instead of wrapping
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Location of the training CSV, relative to the current working directory.
csv_path = Path("./train.csv")


In [None]:
# Load the whole CSV into `df`, then keep its first 20 rows as `data_portion`,
# the small preview frame that the following cells work on.
df = pd.read_csv(csv_path)
data_portion = df.head(20)
print("Original data (first 20 rows):\n\n", data_portion, sep="\n")


In [None]:
# sort_index orders by the index labels (axis=0, the default) or by the column names (axis=1)
# and hands back a new, sorted DataFrame. Here the row index is put in descending order.
sorted_by_index = data_portion.sort_index(ascending=False)
print("\nSorted by index (descending):", sorted_by_index, sep="\n")


In [None]:
# Order the rows by the 'Survived' column, highest value first.
# `by` also accepts a list of several columns, which gives hierarchical sorting.
sorted_by_survived = data_portion.sort_values("Survived", ascending=False)
print("\nSorted by 'Survived' column (descending):", sorted_by_survived, sep="\n")


In [None]:
# Slicing as normal (by row position). As with Python lists, the stop position is
# excluded, so 2:10 yields positions 2 through 9 (8 rows) -- unlike a .loc label slice,
# which includes its end label. .iloc is used to make the positional intent explicit.
sliced_data = data_portion.iloc[2:10]
print("\nSliced data (rows 2 to 9):")
print(sliced_data)


In [None]:
# Selecting by label: indexing with a list of column names keeps just those columns
# (every row is retained).
_wanted_columns = ["Survived", "Pclass"]
label_sliced_data = data_portion[_wanted_columns]
print("\nLabel sliced data (Survived and Pclass columns):", label_sliced_data, sep="\n")


In [None]:
# Rows and columns at once via .loc: both selectors are labels, and a label slice
# includes its end point, so rows labelled 1 through 10 are returned.
_row_labels = slice(1, 10)
_column_labels = ["Survived", "Pclass"]
row_and_col_slice = data_portion.loc[_row_labels, _column_labels]
print("\nRow and column slice (rows 1-10, Survived and Pclass):", row_and_col_slice, sep="\n")


In [None]:
# .at looks up exactly one cell by (row label, column label) and returns a plain scalar.
_row_label, _column_label = 2, "Pclass"
selection = data_portion.at[_row_label, _column_label]
print("\nSingle value selection (row 2, Pclass column):", selection, sep="\n")


In [None]:
# .iloc selects purely by integer position: here the rows at positions 2 and 3 and the
# columns at positions 1 and 4. Slice notation (e.g. 2:4) works as well.
_row_positions = [2, 3]
_column_positions = [1, 4]
positional_selection = data_portion.iloc[_row_positions, _column_positions]
print("\nPositional selection (rows 2 and 3, columns 1 and 4):", positional_selection, sep="\n")


In [None]:
# NOTE: .loc addresses data by index/column labels, .iloc by integer position
# (think numpy-style offsets rather than the values stored in the index).

# Boolean row filter: build a True/False mask per row, then keep the rows where it is True.
_survived_mask = data_portion["Survived"].eq(1)
filtered_rows = data_portion[_survived_mask]
print("\n\nFiltered rows (where Survived == 1):", filtered_rows, sep="\n")


In [None]:
# Drop the non-numeric columns, then select the VALUES that are >= 1.
# select_dtypes(include="number") is used instead of comparing dtypes against `object`:
# text columns are not always stored as `object` (pandas also has a dedicated string
# dtype), and a text column slipping through would make the `>= 1` comparison below fail.
# Notice the difference in using no_strings for the condition, but data_portion for selection.
no_strings = data_portion.select_dtypes(include="number")
print("\n\nNumeric columns only:")
print(no_strings)

# Indexing with a boolean DataFrame keeps the values where the mask is True and puts NaN
# everywhere else; columns missing from the mask (the non-numeric ones) come back all-NaN.
selected_values = data_portion[no_strings >= 1]
print("\n\nSelected values (where numeric values >= 1):")
print(selected_values)
