In [1]:
import pandas as pd
import numpy as np

# Create a sample DataFrame (df)

In [3]:
# Toy patient records: one list per column, aligned by position.
data = dict(
    patient_id=[1, 2, 3, 4],
    prior_visits_last_year=[5, 1, 12, 3],
    age=[45, 62, 34, 78],
    chronic_conditions=[1, 0, 2, 1],
    readmitted_30_days=[0, 1, 0, 1],
    other_data=['A', 'B', 'C', 'D'],
)

In [4]:
# Build the frame from the column dict, then show it followed by a 30-dash rule.
df = pd.DataFrame(data)
print("Original DataFrame:", df, "-" * 30, sep="\n")

Original DataFrame:
   patient_id  prior_visits_last_year  age  chronic_conditions  \
0           1                       5   45                   1   
1           2                       1   62                   0   
2           3                      12   34                   2   
3           4                       3   78                   1   

   readmitted_30_days other_data  
0                   0          A  
1                   1          B  
2                   0          C  
3                   1          D  
------------------------------


# Define the columns you need

In [5]:
# Predictor columns, in the order they are selected from df.
features = [
    "prior_visits_last_year",
    "age",
    "chronic_conditions",
]

# Extract the features (X) and the target (y)

In [6]:
# Feature matrix: every row, only the columns named in `features`.
X = df.loc[:, features]
# Target vector: every row of the 30-day readmission column.
y = df.loc[:, "readmitted_30_days"]

In [7]:
# Show the feature frame, closed off by a 30-dash rule.
print("X (Features DataFrame):", X, "-" * 30, sep="\n")

X (Features DataFrame):
   prior_visits_last_year  age  chronic_conditions
0                       5   45                   1
1                       1   62                   0
2                      12   34                   2
3                       3   78                   1
------------------------------


In [8]:
# Heading and target Series, one per line.
print("y (Target Series):", y, sep="\n")

y (Target Series):
0    0
1    1
2    0
3    1
Name: readmitted_30_days, dtype: int64


# Iterating through the 'age' column

## 1. Using a for loop

The most straightforward way to iterate through each value in a specific column is to use a standard **for** loop:

In [9]:
# Iterating a Series yields its values one at a time.
for patient_age in df['age']:
    print(f"Age: {patient_age}")

Age: 45
Age: 62
Age: 34
Age: 78


In [10]:
# Same walk over the 'age' column, this time through the feature subset X.
for patient_age in X['age']:
    print(f"Age: {patient_age}")

Age: 45
Age: 62
Age: 34
Age: 78


## 2. Using df.iterrows() (Iterating over rows to access the column)

If you need to access other columns while iterating (e.g., pairing the age with the patient_id), you can iterate through the entire DataFrame row by row and access the specific column using its key. 

In [13]:
print("\n--- Iterating using iterrows() ---")
# iterrows() yields (index label, row Series) pairs; only the row is used here.
for row_label, record in df.iterrows():
    patient, years = record['patient_id'], record['age']
    print(f"Patient ID: {patient}, Age: {years}")


--- Iterating using iterrows() ---
Patient ID: 1, Age: 45
Patient ID: 2, Age: 62
Patient ID: 3, Age: 34
Patient ID: 4, Age: 78


## 3. Using .apply() (For performing a function on each value)

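If your goal is to do something to every age value and potentially create a new column, the .apply() method is more concise and idiomatic in pandas than a hand-written loop. Note that .apply() still calls the Python function once per value, so for simple element-wise logic a vectorized operation (e.g., np.where or pd.cut) is usually faster.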

In [14]:
def categorize_age(age):
    """Return the age-band label for ``age``: 'Under 50' below 50, otherwise '50+'."""
    return 'Under 50' if age < 50 else '50+'

# Label every age with categorize_age and store the labels as a new 'age_group' column.
age_labels = df['age'].apply(categorize_age)
df['age_group'] = age_labels

print("\n--- DataFrame after applying a function ---")
# Show the source column next to the derived one.
print(df.loc[:, ['age', 'age_group']])



--- DataFrame after applying a function ---
   age age_group
0   45  Under 50
1   62       50+
2   34  Under 50
3   78       50+


# Get a single element from the features

In [11]:
# One label-based lookup (row label 1, column 'age') instead of chained [] indexing,
# which is ambiguous between label and position for integer keys.
X.loc[1, 'age']

62

# Viewing the first/last few rows:
* **df.head():** Displays the first 5 rows of the DataFrame (you can specify a different number, e.g., df.head(10)).
* **df.tail():** Displays the last 5 rows of the DataFrame.

### View the first 5 rows

In [15]:
# head() defaults to 5 rows; the count is spelled out to match the heading.
print("First 5 rows:", df.head(5), sep="\n")

First 5 rows:
   patient_id  prior_visits_last_year  age  chronic_conditions  \
0           1                       5   45                   1   
1           2                       1   62                   0   
2           3                      12   34                   2   
3           4                       3   78                   1   

   readmitted_30_days other_data age_group  
0                   0          A  Under 50  
1                   1          B       50+  
2                   0          C  Under 50  
3                   1          D       50+  


# Checking data types and missing values:
* **df.info():** Provides a concise summary of the DataFrame, including the column names, their non-null counts, and data types (e.g., int64, float64, object).
* **df.isna().sum():** This combination checks for missing values (.isna()) across the entire DataFrame and then sums them per column to show the total count of missing entries in each column.

### Get data types and missing value counts

In [16]:
print("\nDataFrame Info:")
# info() prints its summary itself, so it is called bare rather than wrapped in print().
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   patient_id              4 non-null      int64 
 1   prior_visits_last_year  4 non-null      int64 
 2   age                     4 non-null      int64 
 3   chronic_conditions      4 non-null      int64 
 4   readmitted_30_days      4 non-null      int64 
 5   other_data              4 non-null      object
 6   age_group               4 non-null      object
dtypes: int64(5), object(2)
memory usage: 356.0+ bytes


### Check total missing values per column

In [17]:
# isna() flags missing cells; sum() then totals the flags per column.
print("\nMissing values per column:", df.isna().sum(), sep="\n")


Missing values per column:
patient_id                0
prior_visits_last_year    0
age                       0
chronic_conditions        0
readmitted_30_days        0
other_data                0
age_group                 0
dtype: int64


# Calculating summary statistics:
* **df.describe():** Generates descriptive statistics for numerical columns (e.g., count, mean, standard deviation, minimum, maximum, quartiles). You can use df.describe(include='all') to include statistics for all columns, including categorical ones.
* **df['column_name'].mean():** Calculates the mean (average) of a specific column.
* **df['column_name'].sum():** Calculates the sum of a specific column.
* **df['column_name'].value_counts():** Shows the frequency of unique values within a single column, which is very useful for categorical data. 

### Get summary statistics for numerical columns

In [18]:
# describe() with no arguments summarises the numeric columns only.
print("\nSummary statistics for numerical columns:", df.describe(), sep="\n")


Summary statistics for numerical columns:
       patient_id  prior_visits_last_year       age  chronic_conditions  \
count    4.000000                4.000000   4.00000            4.000000   
mean     2.500000                5.250000  54.75000            1.000000   
std      1.290994                4.787136  19.31105            0.816497   
min      1.000000                1.000000  34.00000            0.000000   
25%      1.750000                2.500000  42.25000            0.750000   
50%      2.500000                4.000000  53.50000            1.000000   
75%      3.250000                6.750000  66.00000            1.250000   
max      4.000000               12.000000  78.00000            2.000000   

       readmitted_30_days  
count             4.00000  
mean              0.50000  
std               0.57735  
min               0.00000  
25%               0.00000  
50%               0.50000  
75%               1.00000  
max               1.00000  


### Calculate the mean of a specific column (replace 'age' with your column name)

In [20]:
# Average of the 'age' column, computed once and then reported.
mean_age = df['age'].mean()
print(f"\nMean age: {mean_age}")


Mean age: 54.75


### Count the frequency of unique values in a specific column

In [21]:
# Frequency of each distinct value in the target column.
readmission_counts = df['readmitted_30_days'].value_counts()
print(readmission_counts)

readmitted_30_days
0    2
1    2
Name: count, dtype: int64
