In [1]:
import pandas as pd
import numpy as np

In [2]:

# https://github.com/datasciencedojo/datasets
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# https://github.com/mwaskom/seaborn-data/tree/master

iris_dataset_github_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv"

# kaggle
kaggle_link = "https://www.kaggle.com/datasets/yasserh/titanic-dataset/data"

In [15]:
df = pd.read_csv(titanic_dataset_github_url)

In [16]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Find Position of Substring

In [4]:
df['Name'].str.find("Mr")

0       8
1       9
2      -1
3      10
4       7
       ..
886    -1
887    -1
888    -1
889     6
890     8
Name: Name, Length: 891, dtype: int64

In [5]:
# Count passengers with “Miss” in name

count = df['Name'].str.contains("Miss").sum()
print("Number of 'Miss':", count)

Number of 'Miss': 182


In [10]:
data = {
    'Name' : [
        'Monal K', 'Bhuvika', 'Niranjan', 'riyan'
    ],
    'Email' : [
        'm@gmail.com', 'b@gmail.com', 'n@gmail.com', 'r@gmail.com'
    ]
}

df = pd.DataFrame(data)

In [11]:
df.head()

Unnamed: 0,Name,Email
0,Monal K,m@gmail.com
1,Bhuvika,b@gmail.com
2,Niranjan,n@gmail.com
3,riyan,r@gmail.com


In [None]:
df['Name'].str.replace("K", "Kumar", regex=False)

0    Monal Kumar
1        Bhuvika
2       Niranjan
3          riyan
Name: Name, dtype: object

In [13]:
df['Name'].str.pad(width=20, side='left', fillchar='-')

0    -------------Monal K
1    -------------Bhuvika
2    ------------Niranjan
3    ---------------riyan
Name: Name, dtype: object

In [14]:
df['Name'].str.pad(width=20, side='right', fillchar='-')

0    Monal K-------------
1    Bhuvika-------------
2    Niranjan------------
3    riyan---------------
Name: Name, dtype: object

# Options and customizations

In [21]:
pd.set_option('display.max_rows', 3)

In [22]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [28]:
# Show all columns
pd.set_option('display.max_columns', None)

In [33]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [37]:
pd.set_option('display.max_rows', 30)

In [38]:
df[['Fare', 'Age']].head()

Unnamed: 0,Fare,Age
0,7.25,22.0
1,71.2833,38.0
2,7.925,26.0
3,53.1,35.0
4,8.05,35.0


In [39]:
# Set Float Precision to 2 Decimal Places
pd.set_option('display.float_format', '{:.2f}'.format)

In [40]:
df[['Fare', 'Age']].head()

Unnamed: 0,Fare,Age
0,7.25,22.0
1,71.28,38.0
2,7.92,26.0
3,53.1,35.0
4,8.05,35.0


In [41]:
# Check Current Value of an Option
pd.get_option('display.max_rows')

30

In [None]:
# Temporarily Set Option (Context Manager)
with pd.option_context('display.max_rows', 3):
    print(df)

     PassengerId  Survived  Pclass                     Name   Sex   Age  \
0              1         0       3  Braund, Mr. Owen Harris  male 22.00   
..           ...       ...     ...                      ...   ...   ...   
890          891         0       3      Dooley, Mr. Patrick  male 32.00   

     SibSp  Parch     Ticket  Fare Cabin Embarked  
0        1      0  A/5 21171  7.25   NaN        S  
..     ...    ...        ...   ...   ...      ...  
890      0      0     370376  7.75   NaN        Q  

[891 rows x 12 columns]


In [44]:
# Resetting to Default
pd.reset_option('display.max_rows')

In [45]:
pd.get_option('display.max_rows')

60

In [46]:
# You can also reset all options:
pd.reset_option('all')

  pd.reset_option('all')
  pd.reset_option('all')


In [49]:
import warnings


with warnings.catch_warnings():
    warnings.simplefilter('ignore', FutureWarning)
    pd.reset_option('all')

# Categorical Data

In [50]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
print(df['Sex'].dtype)

object


In [52]:
# Convert a column to categorical
df['Sex_C'] = df['Sex'].astype('category')

In [53]:
df[['Sex', 'Sex_C']].head()

Unnamed: 0,Sex,Sex_C
0,male,male
1,female,female
2,female,female
3,female,female
4,male,male


In [54]:
print(df['Sex'].dtype)
print(df['Sex_C'].dtype)

object
category


In [56]:
df['Sex_C_categorical_codes'] = df['Sex_C'].cat.codes

In [57]:
df[['Sex', 'Sex_C', 'Sex_C_categorical_codes']].head()

Unnamed: 0,Sex,Sex_C,Sex_C_categorical_codes
0,male,male,1
1,female,female,0
2,female,female,0
3,female,female,0
4,male,male,1


In [58]:
df['Sex_apply_category'] = df['Sex'].apply(lambda x : 0 if x == 'male' else 1)

In [59]:
df[['Sex', 'Sex_C', 'Sex_C_categorical_codes', 'Sex_apply_category']].head()

Unnamed: 0,Sex,Sex_C,Sex_C_categorical_codes,Sex_apply_category
0,male,male,1,0
1,female,female,0,1
2,female,female,0,1
3,female,female,0,1
4,male,male,1,0


In [60]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_C,Sex_C_categorical_codes,Sex_apply_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,male,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,female,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,female,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,female,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,male,1,0


In [64]:
df['Pclass'].unique()

array([3, 1, 2])

In [65]:
df['Pclass_new_weird'] = df['Pclass'].apply(lambda x : 'adf' if x == 3 else ('wer' if x == 2 else 'tyu'))

In [67]:
df[['Pclass', 'Pclass_new_weird']].head(10)

Unnamed: 0,Pclass,Pclass_new_weird
0,3,adf
1,1,tyu
2,3,adf
3,1,tyu
4,3,adf
5,3,adf
6,1,tyu
7,3,adf
8,3,adf
9,2,wer


In [None]:
# analysis : 
# tyu > adf > wer

In [68]:
print(df['Pclass'].dtype)
print(df['Pclass_new_weird'].dtype)

int64
object


In [70]:
df['Pclass_new_weird_ordered'] = pd.Categorical(
    df['Pclass_new_weird'],
    categories=['wer','adf', 'tyu'], # Custom order (ascending)
    ordered=True
)

In [71]:
print(df['Pclass_new_weird_ordered'].dtype)

category


In [72]:
df['Pclass_new_weird_ordered'].cat.categories

Index(['wer', 'adf', 'tyu'], dtype='object')

# Date Functionality in pandas

In [101]:
df = pd.DataFrame({
    'event': ['Concert', 'Conference', 'Wedding', 'Reception', 'last_day'],
    'date': ['2025-01-01', '2025-03-15', '2025-07-20', '2025-07-21', '2025-07-31']
})


In [102]:
df['date_pandas'] = pd.to_datetime(df['date'])

In [103]:
df.head()
# YYYY-MM-DD

Unnamed: 0,event,date,date_pandas
0,Concert,2025-01-01,2025-01-01
1,Conference,2025-03-15,2025-03-15
2,Wedding,2025-07-20,2025-07-20
3,Reception,2025-07-21,2025-07-21
4,last_day,2025-07-31,2025-07-31


In [104]:
print(df.dtypes[['event', 'date', 'date_pandas']])

event                  object
date                   object
date_pandas    datetime64[ns]
dtype: object


In [105]:
df['date_pandas'].dt.year

0    2025
1    2025
2    2025
3    2025
4    2025
Name: date_pandas, dtype: int32

In [106]:
df['date_pandas'].dt.month

0    1
1    3
2    7
3    7
4    7
Name: date_pandas, dtype: int32

In [107]:
df['date_pandas'].dt.day

0     1
1    15
2    20
3    21
4    31
Name: date_pandas, dtype: int32

In [108]:
df['date_pandas'].dt.weekday

0    2
1    5
2    6
3    0
4    3
Name: date_pandas, dtype: int32

In [109]:
df['date_pandas'].dt.day_name()

0    Wednesday
1     Saturday
2       Sunday
3       Monday
4     Thursday
Name: date_pandas, dtype: object

In [110]:
df['date_pandas'].dt.is_month_end

0    False
1    False
2    False
3    False
4     True
Name: date_pandas, dtype: bool

In [111]:
df[df['date_pandas'] > '2025-04-01']

Unnamed: 0,event,date,date_pandas
2,Wedding,2025-07-20,2025-07-20
3,Reception,2025-07-21,2025-07-21
4,last_day,2025-07-31,2025-07-31


In [112]:
df[df['date_pandas'].between('2025-01-01', '2025-06-01')]

Unnamed: 0,event,date,date_pandas
0,Concert,2025-01-01,2025-01-01
1,Conference,2025-03-15,2025-03-15


In [113]:
# Date range

In [114]:
pd.date_range(start='2024-01-01', end='2024-01-10', freq='D')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10'],
              dtype='datetime64[ns]', freq='D')

In [115]:
pd.date_range(start='2024-01-01', end='2024-03-10', freq='ME')

DatetimeIndex(['2024-01-31', '2024-02-29'], dtype='datetime64[ns]', freq='ME')

In [116]:
pd.date_range(start='2024-01-01', end='2024-03-10', freq='MS')

DatetimeIndex(['2024-01-01', '2024-02-01', '2024-03-01'], dtype='datetime64[ns]', freq='MS')

# Date Arithmetic

In [117]:
df['next_week'] = df['date_pandas'] + pd.Timedelta(days=7)

In [118]:
df[['date_pandas', 'next_week']]

Unnamed: 0,date_pandas,next_week
0,2025-01-01,2025-01-08
1,2025-03-15,2025-03-22
2,2025-07-20,2025-07-27
3,2025-07-21,2025-07-28
4,2025-07-31,2025-08-07


In [119]:
df['days_to_event'] = df['date_pandas'] - pd.Timestamp.today()

In [120]:
df.head()

Unnamed: 0,event,date,date_pandas,next_week,days_to_event
0,Concert,2025-01-01,2025-01-01,2025-01-08,-165 days +01:59:08.488429
1,Conference,2025-03-15,2025-03-15,2025-03-22,-92 days +01:59:08.488429
2,Wedding,2025-07-20,2025-07-20,2025-07-27,35 days 01:59:08.488429
3,Reception,2025-07-21,2025-07-21,2025-07-28,36 days 01:59:08.488429
4,last_day,2025-07-31,2025-07-31,2025-08-07,46 days 01:59:08.488429


In [None]:
# Assignment - 1

# TODO - While creating a task, ask user about the deadline

# Start date - Pick automatically from today's date
# Deadline - 7, 10, 10

# # to-DO

# 1. Task-1 | Days left  | Green-color
# 2. Task-2 | Days left  | Red-color
# 3. Task-3 | Days left  | Green-color
# 4. Task-4 | Days left  | Red-color

5.5