##### M2W2
#### *Indexing, Selecting, and Assigning Data in Pandas* 

In [1]:
# Load library
import pandas as pd
import os

In [2]:
# Create a DataFrame from Python Dictionary
data = {
    'Name': ['Olivia', 'Emma', 'Liam', 'Sophia', 'Mason', 'Ava', 'Jacob', 'Isabella', 'Noah', 'Ethan'],
    'Age': [39, 29, 35, 18, 28, 40, 23, 26, 22, 45],
    'City': ['Sydney', 'Toronto', 'Madrid', 'Berlin', 'Rome', 'Los Angeles', 'New York', 'London', 'Paris', 'Tokyo'],
    'Year': [1985, 1995, 1989, 2006, 1996, 1984, 2001, 1998, 2002, 1979],
    'Social_Media': ['TikTok', 'Instagram', 'X', 'Facebook', 'Snapchat', 'TikTok', 'Instagram', 'X', 'Facebook', 'LinkedIn' ],
    'Sport': ['Soccer', 'Basketball', 'Tennis', 'Baseball', 'Swimming', 'Soccer', 'Basketball','Tennis', 'Baseball','Soccer']
}
df = pd.DataFrame(data)

# Convert the 'Year' column from int64 to object (string)
df['Year'] = df['Year'].astype(str)

# Create a new directory called 'data' if it doesn't exist 
data_directory = 'course_work/data' 
os.makedirs(data_directory, exist_ok=True) 

# Save the DataFrame to a CSV file in the 'data' directory 
csv_file_path = os.path.join(data_directory, 'survey.csv') 
df.to_csv(csv_file_path, index=False) 

print(f"DataFrame has been saved to {csv_file_path}") 

# Print the DataFrame
print("DataFrame:")
print(df)

# Check the data types to confirm the change
print("\nData types:")
print(df.dtypes)

# Select the 'Name' and 'City' columns from the DataFrame 
name_city = df[['Name','City']]

print('\nSelected Columns:')     
print(name_city)

DataFrame has been saved to course_work/data\survey.csv
DataFrame:
       Name  Age         City  Year Social_Media       Sport
0    Olivia   39       Sydney  1985       TikTok      Soccer
1      Emma   29      Toronto  1995    Instagram  Basketball
2      Liam   35       Madrid  1989            X      Tennis
3    Sophia   18       Berlin  2006     Facebook    Baseball
4     Mason   28         Rome  1996     Snapchat    Swimming
5       Ava   40  Los Angeles  1984       TikTok      Soccer
6     Jacob   23     New York  2001    Instagram  Basketball
7  Isabella   26       London  1998            X      Tennis
8      Noah   22        Paris  2002     Facebook    Baseball
9     Ethan   45        Tokyo  1979     LinkedIn      Soccer

Data types:
Name            object
Age              int64
City            object
Year            object
Social_Media    object
Sport           object
dtype: object

Selected Columns:
       Name         City
0    Olivia       Sydney
1      Emma      Toronto
2  

### _*Pandas .loc*_

##### Syntax: df.loc[row_indexer, column_indexer] 

In [3]:
# Select the row at index 2: (returns a DataFrame)
single_row_df = df.loc[[2]]
single_row_df

# (returns a Series)
# single_row = df.loc[2]
# single_row

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
2,Liam,35,Madrid,1989,X,Tennis


In [4]:
# # Select specific rows by index
row_list = df.loc[[0, 3, 4]]
row_list

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Sydney,1985,TikTok,Soccer
3,Sophia,18,Berlin,2006,Facebook,Baseball
4,Mason,28,Rome,1996,Snapchat,Swimming


In [5]:
# Select specific columns from all rows
column_list = df.loc[:,['Social_Media', 'Sport']] # # Use ':' to select all rows
column_list

Unnamed: 0,Social_Media,Sport
0,TikTok,Soccer
1,Instagram,Basketball
2,X,Tennis
3,Facebook,Baseball
4,Snapchat,Swimming
5,TikTok,Soccer
6,Instagram,Basketball
7,X,Tennis
8,Facebook,Baseball
9,LinkedIn,Soccer


In [6]:
# Access a specific value from the 'Name' column in the second row
specific_value = df.loc[1, 'Name']
specific_value

'Emma'

In [7]:
# Slice rows from index 1 to 3 (inclusive)
slice_rows = df.loc[1:3]

print("Sliced Rows:")
print(slice_rows)
print()

# Slice columns from 'Name' to 'City' for rows 1 to 3 (inclusive)
slice_columns = df.loc[1:3, 'Name':'City']

print("Sliced Columns:")
print(slice_columns)

Sliced Rows:
     Name  Age     City  Year Social_Media       Sport
1    Emma   29  Toronto  1995    Instagram  Basketball
2    Liam   35   Madrid  1989            X      Tennis
3  Sophia   18   Berlin  2006     Facebook    Baseball

Sliced Columns:
     Name  Age     City
1    Emma   29  Toronto
2    Liam   35   Madrid
3  Sophia   18   Berlin


### _*Boolean Indexing*_
##### Filtering DataFrame based on condition

In [8]:
# Filter the DataFrame to include rows where 'Age' is greater than 29
age_above_29 = df.loc[df['Age'] > 29]

print("Filtered DataFrame: ")
print(age_above_29)

# Logical Operators &(AND), |(OR) and ~(NOT)  with .loc[]

# & (AND) operator
# Extract rows where Age is greater than 30 and Sport is 'Soccer'
result_and = df.loc[(df['Age'] > 30) & (df['Sport'] == 'Soccer')]
print("AND Operator Result:\n", result_and)

# |(OR) operator
# Select rows where Age is less than 25 or Social_Media is 'Instagram'
result_or = df.loc[(df['Age'] < 25) | (df['Social_Media'] == 'Instagram')]
print("OR Operator Result:\n", result_or)

# ~ (NOT) operator 
# Select rows where Sport is not 'Soccer'
result_not = df.loc[~(df['Sport'] == 'Soccer')]
print("NOT Operator Result:\n", result_not)

# Combining & and | operators 
# Select rows where Age is greater than 30 and (Sport is 'Soccer' or Social_Media is 'TikTok')
result_combined = df.loc[(df['Age'] > 30) & ((df['Sport'] == 'Soccer') | (df['Social_Media'] == 'TikTok'))]
print("Combined Operators Result:\n", result_combined)

# # Create a boolean mask for the condition 
# mask = df['City'].isin (['Rome', 'Los Angeles', 'New York'])
# df_filtered = df.loc[mask] # filter the DataFrame based on the mask 
# print("Filtering with mask:\n",df_filtered) 

# Filtering using loc with query 
filtered_data = df.query("Name in ['Sophia', 'Mason', 'Ava']") 
# Then, use .loc[] to select all columns (or specific columns if needed) 
df = df.loc[:, :] # This selects all columns 
print("Filtering with loc and query :\n",filtered_data) 

# Keep rows that include 'X, Facebook' using query method 
result_query = df.query("Social_Media in ['X', 'Facebook']")
print("Query Result:\n", result_query)

# Dropping multiple values
sport_to_keep = ['Soccer', 'Basketball']
# Filter Dataframe to keep only the specified columns
sport_data_clean = df[df['Sport'].isin(sport_to_keep)] 
print("Multiple Values Result:\n", sport_data_clean )

Filtered DataFrame: 
     Name  Age         City  Year Social_Media   Sport
0  Olivia   39       Sydney  1985       TikTok  Soccer
2    Liam   35       Madrid  1989            X  Tennis
5     Ava   40  Los Angeles  1984       TikTok  Soccer
9   Ethan   45        Tokyo  1979     LinkedIn  Soccer
AND Operator Result:
      Name  Age         City  Year Social_Media   Sport
0  Olivia   39       Sydney  1985       TikTok  Soccer
5     Ava   40  Los Angeles  1984       TikTok  Soccer
9   Ethan   45        Tokyo  1979     LinkedIn  Soccer
OR Operator Result:
      Name  Age      City  Year Social_Media       Sport
1    Emma   29   Toronto  1995    Instagram  Basketball
3  Sophia   18    Berlin  2006     Facebook    Baseball
6   Jacob   23  New York  2001    Instagram  Basketball
8    Noah   22     Paris  2002     Facebook    Baseball
NOT Operator Result:
        Name  Age      City  Year Social_Media       Sport
1      Emma   29   Toronto  1995    Instagram  Basketball
2      Liam   35    Mad

### _*Pandas .iloc*_

##### Syntax: df.iloc[row_indexer, column_indexer] 

In [9]:
# Access specific rows by index (0, 3, and 4)
row_list = df.iloc[[0, 3, 4]]
print("List of Rows:")
print(row_list)
print()

# # Select rows 1 to 2 (exclusive of row 3) and columns 1 to 2 (exclusive of column 3)
selected_data = df.iloc[1:3, 1:3]
print("Multiple Rows:")
print(selected_data)
print()

List of Rows:
     Name  Age    City  Year Social_Media     Sport
0  Olivia   39  Sydney  1985       TikTok    Soccer
3  Sophia   18  Berlin  2006     Facebook  Baseball
4   Mason   28    Rome  1996     Snapchat  Swimming

Multiple Rows:
   Age     City
1   29  Toronto
2   35   Madrid



In [10]:
# Retrieve columns 1 and 2
column_list = df.iloc[:,[1, 2]]

print("List of Columns:")
print(column_list)
print()

# Retrieve a specific value
specific_value = df.iloc[0, 2] # Value located at the intersection of the first row and the third column 

print("Specific Value:")
print(specific_value)

List of Columns:
   Age         City
0   39       Sydney
1   29      Toronto
2   35       Madrid
3   18       Berlin
4   28         Rome
5   40  Los Angeles
6   23     New York
7   26       London
8   22        Paris
9   45        Tokyo

Specific Value:
Sydney


In [11]:
# Extract rows from position 1 to 3 (exclusive of position 4) 
slice_rows = df.iloc[1:4]  # Position 4 is not included; use df.iloc[1:5] to include 

print("Sliced Rows:")
print(slice_rows)
print() 


# Extract columns from position 0 to 1
slice_columns = df.iloc[:, 0:2]   # Single brackets for a range of columns (slicing)

print("Sliced Columns:")
print(slice_columns)

Sliced Rows:
     Name  Age     City  Year Social_Media       Sport
1    Emma   29  Toronto  1995    Instagram  Basketball
2    Liam   35   Madrid  1989            X      Tennis
3  Sophia   18   Berlin  2006     Facebook    Baseball

Sliced Columns:
       Name  Age
0    Olivia   39
1      Emma   29
2      Liam   35
3    Sophia   18
4     Mason   28
5       Ava   40
6     Jacob   23
7  Isabella   26
8      Noah   22
9     Ethan   45


In [12]:
# Retrieve the last element in the DataFrame
single_row = df.iloc[-1:]  
single_row


# # When you use a negative index with loc, it does not interpret it as a position from the end of the DataFrame. 
# Instead, it looks for a label that matches -1, which is does not exist, and as a result  it defaults to returning 
# the entire DataFrame.
# last_row = df.loc[-1:]  
# last_row

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
9,Ethan,45,Tokyo,1979,LinkedIn,Soccer


##### Integer-location based indexing 

In [13]:
# Get the indices of the rows that meet the condition 
indices = df.index[df['Social_Media'] == 'TikTok, Instagram']
# Use .iloc[] to select those rows
df= df.iloc[indices]
print("Result:\n", result_query)

Result:
        Name  Age    City  Year Social_Media     Sport
2      Liam   35  Madrid  1989            X    Tennis
3    Sophia   18  Berlin  2006     Facebook  Baseball
7  Isabella   26  London  1998            X    Tennis
8      Noah   22   Paris  2002     Facebook  Baseball


##### _*The main differences between .loc and .iloc are as follows:*_


| Basis           | .loc ( specific labels)*      |  .iloc (specific position)        |
|-----------------|-------------------------------|-----------------------------------|
|Indexing         | Label-based indexing          |	Integer-based indexing            |
|Endpoint         |	Endpoint is included          |	Endpoint is <span style="color: blue;">not</span> included|
|Boolean indexing |	Boolean indexing is supported |	Boolean indexing is <span style="color: blue;">not</span> supported|

*Integers may be used, but they are interpreted as labels.

In [14]:
# Example one: DataFrame with integer labels
data = {
    0: [1, 2, 3, 4, 5, 6],
    1: [10, 20, 30, 40, 50, 60],
    2: [100, 200, 300, 400, 500, 600],
    3: [1000, 2000, 3000, 4000, 5000, 6000],
    4: [10000, 20000, 30000, 40000, 50000, 60000],
    5: ['A', 'B', 'C', 'D', 'E', 'F']
}

# Create the DataFrame
df_1 = pd.DataFrame(data)

print("DataFrame:")
print(df_1)
print()


# Using .loc to select rows 1 to 4 (inclusive) and columns 1 to 5 (inclusive)
selected_data_loc = df_1.loc[1:4, 1:5]
print("Selected Data using .loc:")
print(selected_data_loc)
print()

# Using .iloc to select rows 1 to 4 (exclusive of row 4) and columns 1 to 5 (exclusive of column 5)
selected_data_iloc = df_1.iloc[1:4, 1:5]
print("Selected Data using .iloc:")
print(selected_data_iloc)
print()


DataFrame:
   0   1    2     3      4  5
0  1  10  100  1000  10000  A
1  2  20  200  2000  20000  B
2  3  30  300  3000  30000  C
3  4  40  400  4000  40000  D
4  5  50  500  5000  50000  E
5  6  60  600  6000  60000  F

Selected Data using .loc:
    1    2     3      4  5
1  20  200  2000  20000  B
2  30  300  3000  30000  C
3  40  400  4000  40000  D
4  50  500  5000  50000  E

Selected Data using .iloc:
    1    2     3      4
1  20  200  2000  20000
2  30  300  3000  30000
3  40  400  4000  40000



In [15]:
# Example two: DataFrame with object labels
data = {
    'Zero': [1, 2, 3, 4, 5], 
    'One': [10, 20, 30, 40, 50],
    'Two': [100, 200, 300, 400, 500], 
    'Three': [1000, 2000, 3000, 4000, 5000],
    'Four': [10000, 20000, 30000, 40000, 50000],
    'Five': ['A', 'B', 'C', 'D', 'E']
}

# Create the DataFrame
df_2 = pd.DataFrame(data)

print("DataFrame  with Object Labels:")
print(df_2)
print()


# Using .loc to select rows 1 to 3 (inclusive) and specific columns
# Correctly selecting columns by name
select_data_loc = df_2.loc[1:3, ['One', 'Two', 'Three']]
print("Selected Data using .loc:")
print(select_data_loc)
print()

# Error
# select_data = df_2.loc[1:3, 1:3] 

# Using .iloc to select rows 1 to 3 (exclusive of row 3) and columns 1 to 3 (exclusive of column 3)
# Correctly using integer positions
select_data_iloc = df_2.iloc[1:3, 1:3]
print("Selected Data using .iloc:")
print(select_data_iloc)

# Error
# select_data = df_2.iloc[1:3, ['One', 'Two', 'Three']] 

DataFrame  with Object Labels:
   Zero  One  Two  Three   Four Five
0     1   10  100   1000  10000    A
1     2   20  200   2000  20000    B
2     3   30  300   3000  30000    C
3     4   40  400   4000  40000    D
4     5   50  500   5000  50000    E

Selected Data using .loc:
   One  Two  Three
1   20  200   2000
2   30  300   3000
3   40  400   4000

Selected Data using .iloc:
   One  Two
1   20  200
2   30  300


### _*Assigning data in Pandas*_

In [16]:
# Read the CSV file into a DataFrame 
df = pd.read_csv('course_work/data/survey.csv')
df

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Sydney,1985,TikTok,Soccer
1,Emma,29,Toronto,1995,Instagram,Basketball
2,Liam,35,Madrid,1989,X,Tennis
3,Sophia,18,Berlin,2006,Facebook,Baseball
4,Mason,28,Rome,1996,Snapchat,Swimming
5,Ava,40,Los Angeles,1984,TikTok,Soccer
6,Jacob,23,New York,2001,Instagram,Basketball
7,Isabella,26,London,1998,X,Tennis
8,Noah,22,Paris,2002,Facebook,Baseball
9,Ethan,45,Tokyo,1979,LinkedIn,Soccer


In [17]:
# Change the value in the entire 'City' column to 'Dallas'
df['City'] = 'Dallas'
print("DataFrame after changing the entire 'City' column:")
df

DataFrame after changing the entire 'City' column:


Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Dallas,1985,TikTok,Soccer
1,Emma,29,Dallas,1995,Instagram,Basketball
2,Liam,35,Dallas,1989,X,Tennis
3,Sophia,18,Dallas,2006,Facebook,Baseball
4,Mason,28,Dallas,1996,Snapchat,Swimming
5,Ava,40,Dallas,1984,TikTok,Soccer
6,Jacob,23,Dallas,2001,Instagram,Basketball
7,Isabella,26,Dallas,1998,X,Tennis
8,Noah,22,Dallas,2002,Facebook,Baseball
9,Ethan,45,Dallas,1979,LinkedIn,Soccer


In [18]:
# Assign a new value 'Rome' to the first three rows of the 'City' column
df.iloc[:3, 2] = 'Rome'

# Assign a new value 'Caracas' to the last row of the 'City' column
df.iloc[-1, 2] = 'Caracas'
print("DataFrame after modifying specific cells in the 'City' column:")
df

DataFrame after modifying specific cells in the 'City' column:


Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Rome,1985,TikTok,Soccer
1,Emma,29,Rome,1995,Instagram,Basketball
2,Liam,35,Rome,1989,X,Tennis
3,Sophia,18,Dallas,2006,Facebook,Baseball
4,Mason,28,Dallas,1996,Snapchat,Swimming
5,Ava,40,Dallas,1984,TikTok,Soccer
6,Jacob,23,Dallas,2001,Instagram,Basketball
7,Isabella,26,Dallas,1998,X,Tennis
8,Noah,22,Dallas,2002,Facebook,Baseball
9,Ethan,45,Caracas,1979,LinkedIn,Soccer


In [19]:
# Append a new row to the DataFrame (using build-in function len())
# len(df.index) gives the current number of rows, which is used to find the next available index for adding a new row
df.loc[len(df.index)] = ['Amy', 37, 'Chicago', 1987, 'X', 'Cycling']
df

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Rome,1985,TikTok,Soccer
1,Emma,29,Rome,1995,Instagram,Basketball
2,Liam,35,Rome,1989,X,Tennis
3,Sophia,18,Dallas,2006,Facebook,Baseball
4,Mason,28,Dallas,1996,Snapchat,Swimming
5,Ava,40,Dallas,1984,TikTok,Soccer
6,Jacob,23,Dallas,2001,Instagram,Basketball
7,Isabella,26,Dallas,1998,X,Tennis
8,Noah,22,Dallas,2002,Facebook,Baseball
9,Ethan,45,Caracas,1979,LinkedIn,Soccer


In [20]:
# Insert row at specific index NOTE: using df.loc[3] will only overwrite row that positioned at index 3 which is (Sophia,18,...)
df.loc[3.5] = ['June', 40, 'Cairo', '1984', 'X', 'Soccer']
df = df.sort_index().reset_index(drop=True)
df

Unnamed: 0,Name,Age,City,Year,Social_Media,Sport
0,Olivia,39,Rome,1985,TikTok,Soccer
1,Emma,29,Rome,1995,Instagram,Basketball
2,Liam,35,Rome,1989,X,Tennis
3,Sophia,18,Dallas,2006,Facebook,Baseball
4,June,40,Cairo,1984,X,Soccer
5,Mason,28,Dallas,1996,Snapchat,Swimming
6,Ava,40,Dallas,1984,TikTok,Soccer
7,Jacob,23,Dallas,2001,Instagram,Basketball
8,Isabella,26,Dallas,1998,X,Tennis
9,Noah,22,Dallas,2002,Facebook,Baseball


In [21]:
# Define the new column order 
df = df.iloc[:, [0,1,3,2,5,4]] # selecting columns by their index position

# # Reindex method
# new_order = [0,1,3,2,5,4]
# df = df.reindex(columns=df.columns[new_order])
print("Modified DataFrame: New Columns Order")
df

Modified DataFrame: New Columns Order


Unnamed: 0,Name,Age,Year,City,Sport,Social_Media
0,Olivia,39,1985,Rome,Soccer,TikTok
1,Emma,29,1995,Rome,Basketball,Instagram
2,Liam,35,1989,Rome,Tennis,X
3,Sophia,18,2006,Dallas,Baseball,Facebook
4,June,40,1984,Cairo,Soccer,X
5,Mason,28,1996,Dallas,Swimming,Snapchat
6,Ava,40,1984,Dallas,Soccer,TikTok
7,Jacob,23,2001,Dallas,Basketball,Instagram
8,Isabella,26,1998,Dallas,Tennis,X
9,Noah,22,2002,Dallas,Baseball,Facebook


### Python Review: Tuple  

- Immutability: You cannot add, remove, or modify elements in a tuple after it has been created 

- Fixed Size: The size of a tuple is fixed at the time of creation. If you need a different size, you would have to create a new tuple. 

- Hashable: Because tuples are immutable, they can be used as keys in dictionaries or as elements of sets, while lists (which are mutable) cannot. 

In [22]:
# Tuple with multiple rows
my_tuple = ( (1, 'Alice', 30), 
            (2, 'Bob', 25), 
            (3, 'Charlie', 35) ) 

# Display the tuple 
print("Tuple with multiple rows:") 
for row in my_tuple: 
    print(row) 

# Attempt to change an element (this will raise an error)
try:
    my_tuple[0] = (4, 'David', 28) # This will raise an error 
except TypeError as e: 
    print(f"Error: {e}") 
    
# Attempt to add an element (this will also raise an error) 
try: 
    my_tuple.append((4, 'David', 28)) # This will also raise an error 
except AttributeError as e: 
    print(f"Error: {e}") 
    
# Display the original tuple 
print("Original tuple:", my_tuple) 

# Correct way to "add" an element to a tuple  
new_element = (4, 'David', 28)  

# Create a new tuple that includes the old elements and the new element  
my_tuple = my_tuple + (new_element,) 

print("Updated tuple:", my_tuple) 

Tuple with multiple rows:
(1, 'Alice', 30)
(2, 'Bob', 25)
(3, 'Charlie', 35)
Error: 'tuple' object does not support item assignment
Error: 'tuple' object has no attribute 'append'
Original tuple: ((1, 'Alice', 30), (2, 'Bob', 25), (3, 'Charlie', 35))
Updated tuple: ((1, 'Alice', 30), (2, 'Bob', 25), (3, 'Charlie', 35), (4, 'David', 28))


In [23]:
# Access elements directly from the tuple using standard indexing

my_tuple = ((1, 'Alice', 30),  
            (2, 'Bob', 25),  
            (3, 'Charlie', 35))  

# Accessing values using tuple indexing 
# Get the name of the second person (Bob)  
name_bob = my_tuple[1][1]  
print("Name of the second person:", name_bob) 

# Get the age of the third person (Charlie)  
age_charlie = my_tuple[2][2]  

print("Age of the third person:", age_charlie) 

Name of the second person: Bob
Age of the third person: 35


In [24]:
# Convert the tuple to a DataFrame and use at[], iat[], and loc[] for accessing values. 
data_t = pd.DataFrame(my_tuple, columns=['ID', 'Name', 'Age']) 

# Accessing values using pandas accessors  
# Get the name of the second person (Bob) using at[]  
name_bob = data_t.at[1, 'Name']  
print("Name of the second person using at[]:", name_bob)  

# Get the age of the third person (Charlie) using iat[]  
age_charlie = data_t.iat[2, 2]  
print("Age of the third person using iat[]:", age_charlie)  

# Get the name of the first person (Alice) using loc[]  
name_alice = data_t.loc[0, 'Name']  
print("Name of the first person using loc[]:", name_alice) 

# Using a callable with loc[] 
# Get the entire row for the second person (Bob) 
row_bob = data_t.loc[1] 
print("Row for the second person using loc[]:\n", row_bob) 

Name of the second person using at[]: Bob
Age of the third person using iat[]: 35
Name of the first person using loc[]: Alice
Row for the second person using loc[]:
 ID        2
Name    Bob
Age      25
Name: 1, dtype: object
