In [5]:
import pandas as pd

# --- Pandas Series: a one-dimensional array whose values carry labels ---
# Build the Series from a plain list of values plus a matching list of labels.
s_data = [10, 20, 30, 40, 50]
s_index = list('abcde')
my_series = pd.Series(s_data, index=s_index)

print("Pandas Series:")
print(my_series)

# Look up a single element by its label.
value_at_c = my_series['c']
print(f"Value at index 'c': {value_at_c}")

# A boolean mask keeps only the entries for which the comparison is True.
above_25 = my_series[my_series > 25]
print(f"Values greater than 25:\n{above_25}")


Pandas Series:
a    10
b    20
c    30
d    40
e    50
dtype: int64
Value at index 'c': 30
Values greater than 25:
c    30
d    40
e    50
dtype: int64


In [11]:

# A dictionary maps directly onto a Series: its keys become the index labels
# and its values become the data.
dict_data = dict(x=100, y=200, z=300)
series_from_dict = pd.Series(dict_data)

print("\nSeries from dictionary:")
print(series_from_dict)


Series from dictionary:
x    100
y    200
z    300
dtype: int64


In [2]:
# Echo the Series built from the list above; a bare last expression is
# rendered as the cell's output without needing print().
my_series

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [24]:
import numpy as np

# --- Pandas DataFrame: a two-dimensional labeled table whose columns may each hold a different type ---
# Built from a dictionary of equal-length sequences: every key becomes a column
# name and every value supplies that column's entries.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, 30, 35, 28, 22],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    Salary=[70000, 80000, 90000, 75000, 65000],
    # Supplied as a NumPy array with an explicit 8-bit integer dtype
    # rather than as a plain Python list.
    Optimized=np.array([1, 2, 3, 4, 5], dtype=np.int8),
)
df = pd.DataFrame(data)

print("\nPandas DataFrame:")
# Leave the frame as the final expression so the notebook renders it as a table.
df



Pandas DataFrame:


Unnamed: 0,Name,Age,City,Salary,Optimized
0,Alice,25,New York,70000,1
1,Bob,30,Los Angeles,80000,2
2,Charlie,35,Chicago,90000,3
3,David,28,Houston,75000,4
4,Eve,22,Phoenix,65000,5


# Handling Data From a File

In [5]:
# Write a small demonstration CSV into the current working directory.
# Mode 'w' truncates, so an existing 'sample_data.csv' is replaced on every run.
csv_rows = [
    "id,feature1,feature2,target\n",
    "1,0.5,1.2,0\n",
    "2,0.3,0.9,1\n",
    "3,0.7,1.5,0\n",
    "4,0.1,0.3,1\n",
    "5,,1.1,0\n",  # feature1 is deliberately left empty: a missing value (NA)
    "6,0.6,1.3,1\n",
]
try:
    with open('sample_data.csv', 'w') as csv_file:
        csv_file.writelines(csv_rows)
    print("\n'sample_data.csv' created for demonstration.")
except OSError:  # IOError is an alias of OSError in Python 3
    print("\nCould not create 'sample_data.csv'. Please ensure you have write permissions or create it manually.")



'sample_data.csv' created for demonstration.


In [27]:
# Load the demonstration CSV into a DataFrame.
# The file is written as 'sample_data.csv' in the current working directory by
# the cell above, so it has to be read from that same relative path (not from
# the parent directory) for the notebook to run top-to-bottom on a fresh kernel.
df_from_csv = pd.read_csv('sample_data.csv')

# Leave the frame as the final expression so the notebook renders it as a table.
df_from_csv



Unnamed: 0,id,feature1,feature2,target
0,1,0.5,1.2,0
1,2,0.3,0.9,1
2,3,0.7,1.5,0
3,4,0.1,0.3,1
4,5,,1.1,0
5,6,0.6,1.3,1


In [28]:

# --- Exploring Datasets ---
print("\n--- Exploring the CSV DataFrame ---")

# head() with no argument returns the first 5 rows
first_rows = df_from_csv.head()
print("\nHead (first 5 rows):")
print(first_rows)

# tail(n) returns the last n rows
last_rows = df_from_csv.tail(3)
print("\nTail (last 3 rows):")
print(last_rows)

# shape is a (row_count, column_count) tuple
frame_shape = df_from_csv.shape
print(f"\nShape of the DataFrame: {frame_shape}")

# Column labels as a plain Python list
column_names = list(df_from_csv.columns)
print(f"\nColumn names: {column_names}")

# One dtype per column
column_dtypes = df_from_csv.dtypes
print("\nData types of columns (dtypes):")
print(column_dtypes)

# info() prints its own report (index range, non-null counts, dtypes),
# so it is called directly rather than wrapped in print()
print("\nInfo (summary of DataFrame):")
df_from_csv.info()

# describe() summarizes the numerical columns (count, mean, std, quartiles, ...)
summary_stats = df_from_csv.describe()
print("\nDescriptive statistics (describe()):")
print(summary_stats)



--- Exploring the CSV DataFrame ---

Head (first 5 rows):
   id  feature1  feature2  target
0   1       0.5       1.2       0
1   2       0.3       0.9       1
2   3       0.7       1.5       0
3   4       0.1       0.3       1
4   5       NaN       1.1       0

Tail (last 3 rows):
   id  feature1  feature2  target
3   4       0.1       0.3       1
4   5       NaN       1.1       0
5   6       0.6       1.3       1

Shape of the DataFrame: (6, 4)

Column names: ['id', 'feature1', 'feature2', 'target']

Data types of columns (dtypes):
id            int64
feature1    float64
feature2    float64
target        int64
dtype: object

Info (summary of DataFrame):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        6 non-null      int64  
 1   feature1  5 non-null      float64
 2   feature2  6 non-null      float64
 3   target    6 non-null      int64  
d

In [31]:
# include='all' asks describe() to summarize every column rather than only the
# numeric ones. All four columns of this frame are numeric (int64 / float64),
# so only the numeric statistics show up in the result.
df_from_csv.describe(include='all')  # Include all columns for descriptive stats

Unnamed: 0,id,feature1,feature2,target
count,6.0,5.0,6.0,6.0
mean,3.5,0.44,1.05,0.5
std,1.870829,0.240832,0.41833,0.547723
min,1.0,0.1,0.3,0.0
25%,2.25,0.3,0.95,0.0
50%,3.5,0.5,1.15,0.5
75%,4.75,0.6,1.275,1.0
max,6.0,0.7,1.5,1.0


In [14]:

# --- Selecting Data ---
# Indexing with a single column label yields a Series
feature1_column = df_from_csv['feature1']
print("\nSelecting 'feature1' column:")
print(feature1_column)

# Indexing with a list of column labels yields a DataFrame
id_and_target = df_from_csv[['id', 'target']]
print("\nSelecting 'id' and 'target' columns:")
print(id_and_target)

# .loc selects rows by index label. Assuming 'id' has not been made the index,
# the frame still has the default integer index, so label 2 is the third row.
# The row comes back as a single Series, which is why the integer columns
# are shown as floats alongside the float features.
third_row = df_from_csv.loc[2]
print("\nSelecting row with index 2 (using .loc):")
print(third_row)



Selecting 'feature1' column:
0    0.5
1    0.3
2    0.7
3    0.1
4    NaN
5    0.6
Name: feature1, dtype: float64

Selecting 'id' and 'target' columns:
   id  target
0   1       0
1   2       1
2   3       0
3   4       1
4   5       0
5   6       1

Selecting row with index 2 (using .loc):
id          3.0
feature1    0.7
feature2    1.5
target      0.0
Name: 2, dtype: float64


In [None]:

# .loc slices by label and includes BOTH endpoints: 0:2 yields rows 0, 1 and 2
loc_block = df_from_csv.loc[0:2, ['feature1', 'feature2']]
print("\nSelecting rows 0 to 2 and columns 'feature1', 'feature2' (using .loc):")
print(loc_block)

# .iloc selects by integer position; position 2 is the third row
row_at_position_2 = df_from_csv.iloc[2]
print("\nSelecting row at integer position 2 (using .iloc):")
print(row_at_position_2)

# .iloc slices are half-open like ordinary Python slices:
# rows :3 -> positions 0..2, columns 1:3 -> positions 1..2
iloc_block = df_from_csv.iloc[:3, 1:3]
print("\nSelecting rows 0 to 2 (exclusive for end) and columns 1 to 2 (exclusive for end) (using .iloc):")
print(iloc_block)





Selecting rows 0 to 2 and columns 'feature1', 'feature2' (using .loc):
   feature1  feature2
0       0.5       1.2
1       0.3       0.9
2       0.7       1.5

Selecting row at integer position 2 (using .iloc):
id          3.0
feature1    0.7
feature2    1.5
target      0.0
Name: 2, dtype: float64

Selecting rows 0 to 2 (exclusive for end) and columns 1 to 2 (exclusive for end) (using .iloc):
   feature1  feature2
0       0.5       1.2
1       0.3       0.9
2       0.7       1.5

Rows where 'target' is 1:
   id  feature1  feature2  target
1   2       0.3       0.9       1
3   4       0.1       0.3       1
5   6       0.6       1.3       1


In [51]:
# Conditional selection: build a boolean mask, then index the frame with it
# to keep only the rows where the mask is True.
target_is_one = df_from_csv['target'] == 1
print("\nRows where 'target' is 1:")
df_from_csv[target_is_one]


Rows where 'target' is 1:


Unnamed: 0,id,feature1,feature2,target
1,2,0.3,0.9,1
3,4,0.1,0.3,1
5,6,0.6,1.3,1


In [52]:

# Two element-wise conditions combined with & (logical AND applied row by row)
feature2_above_one = df_from_csv['feature2'] > 1.0
target_is_zero = df_from_csv['target'] == 0
print("\nRows where 'feature2' > 1.0 and 'target' is 0:")
print(df_from_csv[feature2_above_one & target_is_zero])



Rows where 'feature2' > 1.0 and 'target' is 0:
   id  feature1  feature2  target
0   1       0.5       1.2       0
2   3       0.7       1.5       0
4   5       NaN       1.1       0


In [53]:
# Show the whole frame again for reference before the next positional slice
# (it has only six rows, so displaying it in full is fine).
df_from_csv

Unnamed: 0,id,feature1,feature2,target
0,1,0.5,1.2,0
1,2,0.3,0.9,1
2,3,0.7,1.5,0
3,4,0.1,0.3,1
4,5,,1.1,0
5,6,0.6,1.3,1


In [56]:
# Positional slice: the first two rows and the first three columns
# (the end of each .iloc slice is exclusive).
df_from_csv.iloc[:2, :3]

Unnamed: 0,id,feature1,feature2
0,1,0.5,1.2
1,2,0.3,0.9
