## Pandas : Series and DataFrames

In [1]:
import pandas as pd
data = [10,20,30,40]
series = pd.Series(data)
series

0    10
1    20
2    30
3    40
dtype: int64

In [2]:
series =pd.Series(data,index=['a','b','c','d'])
series

a    10
b    20
c    30
d    40
dtype: int64

In [4]:
#Creating a DataFram from a dictionary
data = {
    'Name': ['Alic','Bob','Charlie'],
    'Age': [25,30,25],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alic,25,New York
1,Bob,30,Los Angeles
2,Charlie,25,Chicago


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [6]:
df.describe()

Unnamed: 0,Age
count,3.0
mean,26.666667
std,2.886751
min,25.0
25%,25.0
50%,25.0
75%,27.5
max,30.0


In [7]:
df.dtypes

Name    object
Age      int64
City    object
dtype: object

In [9]:
df['Age'] = df['Age'].astype('float32')
df.dtypes

Name     object
Age     float32
City     object
dtype: object

# Importing Data: Working with CSV Files

In [11]:
import pandas as pd
df = pd.read_csv('model_logs.csv')
df.head()

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Generate a creative story about space travel,62,287
1,2023-01-02,Write a sci-fi short story set in 2050,45,361
2,2023-01-03,Write a poem about the future of technology,33,221
3,2023-01-04,Explain quantum computing in simple terms,53,290
4,2023-01-05,Explain quantum computing in simple terms,75,392


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Date                100 non-null    object
 1   Prompt              100 non-null    object
 2   Response Time (ms)  100 non-null    int64 
 3   Tokens Generated    100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ KB


In [13]:
df.describe()

Unnamed: 0,Response Time (ms),Tokens Generated
count,100.0,100.0
mean,66.65,342.93
std,21.114108,90.095264
min,30.0,203.0
25%,47.5,274.75
50%,70.0,324.5
75%,86.0,427.0
max,99.0,500.0


In [15]:
df['Date'] = pd.to_datetime(df['Date'])
df.dtypes

Date                  datetime64[ns]
Prompt                        object
Response Time (ms)             int64
Tokens Generated               int64
dtype: object

In [16]:
df=pd.read_csv('model_logs.csv', parse_dates=['Date'])
df.dtypes

Date                  datetime64[ns]
Prompt                        object
Response Time (ms)             int64
Tokens Generated               int64
dtype: object

In [17]:
mask = df['Response Time (ms)']>50
slow_responses = df[mask]
slow_responses

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Generate a creative story about space travel,62,287
3,2023-01-04,Explain quantum computing in simple terms,53,290
4,2023-01-05,Explain quantum computing in simple terms,75,392
5,2023-01-06,Generate marketing copy for a new tech product,61,250
6,2023-01-07,Write a sci-fi short story set in 2050,87,370
...,...,...,...,...
93,2023-04-04,Explain quantum computing in simple terms,97,325
95,2023-04-06,Explain quantum computing in simple terms,61,304
96,2023-04-07,Write a sci-fi short story set in 2050,57,238
98,2023-04-09,Generate marketing copy for a new tech product,77,296


In [19]:
slow_responses.to_csv('slow_responses.csv', index=False)
print("Filtered data saved to 'slow_responses.csv'!")

Filtered data saved to 'slow_responses.csv'!


# Exporting Data to Different Formats: Excel, JSON, SQL, YAML

In [20]:
import pandas as pd
df = pd.read_csv('model_logs.csv')
df.head()

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Generate a creative story about space travel,62,287
1,2023-01-02,Write a sci-fi short story set in 2050,45,361
2,2023-01-03,Write a poem about the future of technology,33,221
3,2023-01-04,Explain quantum computing in simple terms,53,290
4,2023-01-05,Explain quantum computing in simple terms,75,392


### Exporting to Excel

In [21]:
pip install openpyxl -q

Note: you may need to restart the kernel to use updated packages.


In [22]:
df.to_excel('data.xlsx', index=False)
print("'model_logs.csv' Data exported to 'data.xslx' in Excel Format")

'model_logs.csv' Data exported to 'data.xslx' in Excel Format


### Exporting to JSON

In [23]:
df.to_json('data.json',orient = 'columns')
print("'model_logs.csv' Data exported to 'data.json' in JSON Format")

'model_logs.csv' Data exported to 'data.json' in JSON Format


### Most LLMs require Line-delimited JSON

In [24]:
df.to_json('llm_data.jsonl',orient='records',lines=True)
print("'model_logs.csv' Data exported to 'llm_data.jsonl' in Line-delimited JSON Format required for LLMs")

'model_logs.csv' Data exported to 'llm_data.jsonl' in Line-delimited JSON Format required for LLMs


### Exporting to SQL

In [27]:
import sqlite3
conn = sqlite3.connect('chat_data.db')
df.to_sql('chat_data.db', conn, if_exists='replace',index=False)
print("New 'chat_db.db' database created")
conn.close

New 'chat_db.db' database created


<function Connection.close()>

### Exporting to YAML

In [28]:
import yaml
data_dict = df.to_dict(orient='records') # 'model_logs.csv' file converted to_dict format for YAML
data_dict

[{'Date': '2023-01-01',
  'Prompt': 'Generate a creative story about space travel',
  'Response Time (ms)': 62,
  'Tokens Generated': 287},
 {'Date': '2023-01-02',
  'Prompt': 'Write a sci-fi short story set in 2050',
  'Response Time (ms)': 45,
  'Tokens Generated': 361},
 {'Date': '2023-01-03',
  'Prompt': 'Write a poem about the future of technology',
  'Response Time (ms)': 33,
  'Tokens Generated': 221},
 {'Date': '2023-01-04',
  'Prompt': 'Explain quantum computing in simple terms',
  'Response Time (ms)': 53,
  'Tokens Generated': 290},
 {'Date': '2023-01-05',
  'Prompt': 'Explain quantum computing in simple terms',
  'Response Time (ms)': 75,
  'Tokens Generated': 392},
 {'Date': '2023-01-06',
  'Prompt': 'Generate marketing copy for a new tech product',
  'Response Time (ms)': 61,
  'Tokens Generated': 250},
 {'Date': '2023-01-07',
  'Prompt': 'Write a sci-fi short story set in 2050',
  'Response Time (ms)': 87,
  'Tokens Generated': 370},
 {'Date': '2023-01-08',
  'Prompt': '

In [29]:
with open('chat_data.yaml', 'w') as file: # exporting data_dict to yaml in write mode as file
    yaml.dump(data_dict, file)
    print("'model_logs.csv' from dataframe df to data_dict to YAML file converted")
    

'model_logs.csv' from dataframe df to data_dict to YAML file converted


# Modifying Data: Adding and Dropping Columns and Rows

In [30]:
import pandas as pd

#Sample DataFrame
data ={ 
    'Name':['Alice','Bob','Charlie'],
    'Age': [25,30,35]
}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [31]:
df['City']=['New York','Los Angeles','Chicago'] # Added a new Column City
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [33]:
df['Age Group'] = ['Young' if age <30 else 'Mature' for age in df['Age']] 
# Pandas in action to add meaningful information to the given data into the dataframes
df

Unnamed: 0,Name,Age,City,Age Group
0,Alice,25,New York,Young
1,Bob,30,Los Angeles,Mature
2,Charlie,35,Chicago,Mature


In [34]:
df = df.drop('Age Group', axis=1) # Important to mention the 'axis' to drop 1 column
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [35]:
df = df.drop(1) #Drop record in index 1 that is Bob
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [36]:
df_copy = df.copy() #To create a copy of dataframe, first thing to do before you start modifying data(recommended)
df_copy

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [37]:
df = df.assign(Discount=[5,10],Total_Spend=[100,200]) # Another way of adding columns or value assignments
df

Unnamed: 0,Name,Age,City,Discount,Total_Spend
0,Alice,25,New York,5,100
2,Charlie,35,Chicago,10,200


# Accessing Data: Using df.iloc[] and df.loc[]

In [38]:
import pandas as pd

#Sample DataFrame
data ={
    'Name':['Alice','Bob','Charlie','David','Eva'],
    'Age': [25,30,35,40,45],
    'City':['New York','Los Angeles','Chicago','Houston','Seattle']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


## iloc[] = integer based data, data access by index numbers (columns)
## loc[] = label based access, data access by names (rows)

In [39]:
row = df.loc[2]
row

Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

In [40]:
cell = df.loc[3,'City']
cell

'Houston'

In [41]:
subset_rows = df.loc[1:3] #Slice dice data
subset_rows

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [43]:
subset = df.loc[1:4,['Name','City']] #Slice dice data
subset

Unnamed: 0,Name,City
1,Bob,Los Angeles
2,Charlie,Chicago
3,David,Houston
4,Eva,Seattle


In [44]:
#Using both iloc and loc
mask = df['Age'] > 30
filtered = df.loc[mask].iloc[:,0] #iloc works with zero base indexes
filtered

#What this does? Finds everyone older than 30 and returns the values from the first column for those people.
### Step 1 — df.loc[mask]
### Uses the Boolean mask to filter rows
### Keeps only rows where Age > 30
### Drops all rows where Age <= 30

### Step 2 — .iloc[:, 0]
### : → means “all remaining rows”
### 0 → means “the first column (by position)”
### So this selects:
### The first column of the filtered rows
### It does NOT look at column names — only their position.

2    Charlie
3      David
4        Eva
Name: Name, dtype: object

In [45]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [46]:
df.iloc[0]

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [47]:
df.iloc[1:4] # index = 1 included, index = 4 excluded

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


# Sampling and Previewing Data: Using df.sample() and df.head()

In [48]:
import pandas as pd
df = pd.read_csv('model_logs.csv')
df.head(15)

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Generate a creative story about space travel,62,287
1,2023-01-02,Write a sci-fi short story set in 2050,45,361
2,2023-01-03,Write a poem about the future of technology,33,221
3,2023-01-04,Explain quantum computing in simple terms,53,290
4,2023-01-05,Explain quantum computing in simple terms,75,392
5,2023-01-06,Generate marketing copy for a new tech product,61,250
6,2023-01-07,Write a sci-fi short story set in 2050,87,370
7,2023-01-08,Describe the impact of AI on healthcare,36,373
8,2023-01-09,Generate marketing copy for a new tech product,96,427
9,2023-01-10,Outline the ethical considerations of AI,37,457


In [52]:
random_rows = df.sample(n=3)
random_rows

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
56,2023-02-26,Summarize the latest research on climate change,86,500
90,2023-04-01,Generate a creative story about space travel,71,482
82,2023-03-24,Create a recipe using plant-based ingredients,88,206


In [53]:
# What does this do? => Give me 5 random rows from this dataset, but make sure I get the same 5 every time I run this.
# df.sample(...) → tells Pandas to take a random sample of rows
# n=5 → return exactly 5 rows
# random_state=42 → fixes the randomness so the same 5 rows are chosen every time
# Without random_state, Pandas would choose different rows on each run.

random_sample = df.sample(n=5, random_state=42) 
random_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
83,2023-03-25,Summarize the latest research on climate change,70,361
53,2023-02-23,Generate marketing copy for a new tech product,74,318
70,2023-03-12,Explain quantum computing in simple terms,80,450
45,2023-02-15,Outline the ethical considerations of AI,72,385
44,2023-02-14,Explain quantum computing in simple terms,52,300


In [54]:
fraction_sample = df.sample(frac=0.3) #Represent 30% of the dataset
fraction_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
46,2023-02-16,Describe the impact of AI on healthcare,73,302
88,2023-03-30,Explain blockchain for beginners,51,461
83,2023-03-25,Summarize the latest research on climate change,70,361
34,2023-02-04,Outline the ethical considerations of AI,42,231
84,2023-03-26,Write a sci-fi short story set in 2050,89,255
8,2023-01-09,Generate marketing copy for a new tech product,96,427
91,2023-04-02,Explain quantum computing in simple terms,69,462
0,2023-01-01,Generate a creative story about space travel,62,287
68,2023-03-10,Explain blockchain for beginners,59,316
18,2023-01-19,Outline the ethical considerations of AI,78,242


# Filtering Data: Masks and pandas.Series.betweeen()

In [55]:
import pandas as pd

#Sample DataFrame
data ={
    'Name':['Alice','Bob','Charlie','David','Eva'],
    'Age': [25,30,35,40,45],
    'City':['New York','Los Angeles','Chicago','Houston','Seattle']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [56]:
mask = df['Age']>30 #boolean series
mask

0    False
1    False
2     True
3     True
4     True
Name: Age, dtype: bool

In [57]:
filtered_df = df[mask] #gives only the rows where the previos mentioned condition Age >30 is true
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [58]:
mask = (df['Age']>30) & (df['City']!='Houston') 
#In Python the & and OR operators have higher precendences than != so the parenthesis becomes important to be correct
filtered_df = df[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
4,Eva,45,Seattle


In [59]:
mask

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [60]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


### pandas.Series.between() make data filtering easy by combining condition or using with 'isin() and mask()

In [61]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [62]:
filtered_df = df[df['Age'].between(30,40)]
filtered_df

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [63]:
mask = df['City'].isin(['Chicago', 'Houston'])
filtered_df = df[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston


# Sorting Data

In [64]:
import pandas as pd

#Sample DataFrame
data ={
    'Name':['Alice','Bob','Charlie','David','Eva'],
    'Age': [25,30,35,40,45],
    'City':['New York','Los Angeles','Chicago','Houston','Seattle']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [66]:
sorted_df = df.sort_values(by='Age')
sorted_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [67]:
sorted_df = df.sort_values(by='Age', ascending = False)
sorted_df

Unnamed: 0,Name,Age,City
4,Eva,45,Seattle
3,David,40,Houston
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [68]:
sorted_df = df.sort_values(by=['City','Age'])
sorted_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
1,Bob,30,Los Angeles
0,Alice,25,New York
4,Eva,45,Seattle


In [69]:
sorted_df = df.sort_index()
sorted_df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [70]:
df.sort_values(by='Age', ascending=False, inplace=True) #Inplace modifies the original dataframe without creating a new one
df

Unnamed: 0,Name,Age,City
4,Eva,45,Seattle
3,David,40,Houston
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [71]:
data = {
    'Player':['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Score':[150,200,125,300,175]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Player,Score
0,Alice,150
1,Bob,200
2,Charlie,125
3,Diana,300
4,Eve,175


In [74]:
sorted_df = df.sort_values(by='Score', ascending=False).reset_index(drop=True) #drop=True allows removing the old indexes
sorted_df['Rank'] = sorted_df.index + 1 #Rank columns is added with Ranks given based on sorted scores
sorted_df
#Note that the indexes changed and the data was sorted in descending order

Unnamed: 0,Player,Score,Rank
0,Diana,300,1
1,Bob,200,2
2,Eve,175,3
3,Alice,150,4
4,Charlie,125,5


# Handling Missing Data

In [76]:
import pandas as pd
data = {
    'Name': ['Alice','Bob','Claire','Diana'],
    'Age': [25,None,35,40],
    'City': ['New York', 'Los Angeles', None, 'Houston']    
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Claire,35.0,
3,Diana,40.0,Houston


In [77]:
df.isna()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [78]:
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
3,Diana,40.0,Houston


In [79]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Claire,35.0,
3,Diana,40.0,Houston


In [80]:
df_cleaned = df.dropna(axis = 1) #To drop columns with missing data use axis=1
df_cleaned

Unnamed: 0,Name
0,Alice
1,Bob
2,Claire
3,Diana


In [81]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Claire,35.0,
3,Diana,40.0,Houston


In [82]:
df['Age'] = df['Age'].fillna(30)
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Claire,35.0,
3,Diana,40.0,Houston


In [83]:
df['Age'] = df['Age'].fillna(df['Age'].mean()) #to fill the Age column with the mean data of the column
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Claire,35.0,
3,Diana,40.0,Houston


In [84]:
df['City'] = df['City'].fillna('Unknown')
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Claire,35.0,Unknown
3,Diana,40.0,Houston


In [85]:
df.info() #Always use intially to find the nulls and non-null values in each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     4 non-null      float64
 2   City    4 non-null      object 
dtypes: float64(1), object(2)
memory usage: 224.0+ bytes


# Aggregrations and Grouping Data

In [86]:
import pandas as pd
data ={
    'Category':['Electronics','Electronics', 'Furniture','Furniture', 'Apparel'],
    'Product': ['Laptop','Mouse','Chair','Table','Shoes'],
    'Sales':[1200,100,400,300,50]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Category,Product,Sales
0,Electronics,Laptop,1200
1,Electronics,Mouse,100
2,Furniture,Chair,400
3,Furniture,Table,300
4,Apparel,Shoes,50


In [91]:
grouped = df.groupby('Category')['Sales'].sum().reset_index() #Group by Category and add sales
grouped

Unnamed: 0,Category,Sales
0,Apparel,50
1,Electronics,1300
2,Furniture,700


In [89]:
grouped = df.groupby(['Category','Product'])['Sales'].sum() #Grouped by Category and Product and calculated Sales
grouped

Category     Product
Apparel      Shoes        50
Electronics  Laptop     1200
             Mouse       100
Furniture    Chair       400
             Table       300
Name: Sales, dtype: int64

In [90]:
summary = df.groupby('Category')['Sales'].agg(['sum','mean', 'count'])
summary

Unnamed: 0_level_0,sum,mean,count
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apparel,50,50.0,1
Electronics,1300,650.0,2
Furniture,700,350.0,2


# Project: Analyzing Website Traffic Data

In [92]:
import pandas as pd
df = pd.read_csv('website_traffic_data.csv')
df

Unnamed: 0,Source,SessionDuration
0,Organic,8.15
1,Organic,17.28
2,Direct,8.62
3,Paid,17.95
4,Paid,27.04
...,...,...
95,Referral,24.42
96,Referral,6.52
97,Paid,3.81
98,Direct,13.50


In [93]:
grouped_data= df.groupby('Source')
average_durations = grouped_data['SessionDuration'].mean()
average_durations

Source
Direct      15.151364
Organic     16.472609
Paid        12.542083
Referral    14.485333
Social      17.070625
Name: SessionDuration, dtype: float64

# Time Series Data Manipulation in Pandas

In [94]:
import pandas as pd
data={
    'OrderID':[1001,1002,1003,1004],
    'OrderDate':['2023-11-01','2023-11-02','2023-11-03','2023-11-04'],
    'Sales':[250,400,150,300]
}
df = pd.DataFrame(data)
df

Unnamed: 0,OrderID,OrderDate,Sales
0,1001,2023-11-01,250
1,1002,2023-11-02,400
2,1003,2023-11-03,150
3,1004,2023-11-04,300


In [95]:
df.dtypes

OrderID       int64
OrderDate    object
Sales         int64
dtype: object

In [96]:
df['OrderDate'] = pd.to_datetime(df['OrderDate'])
df.dtypes

OrderID               int64
OrderDate    datetime64[ns]
Sales                 int64
dtype: object

In [97]:
df.set_index('OrderDate',inplace=True) #set index based on datetime
df

Unnamed: 0_level_0,OrderID,Sales
OrderDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-11-01,1001,250
2023-11-02,1002,400
2023-11-03,1003,150
2023-11-04,1004,300


In [98]:
df['Year']=df.index.year
df['Month']=df.index.month
df['Day']=df.index.day
df

Unnamed: 0_level_0,OrderID,Sales,Year,Month,Day
OrderDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-01,1001,250,2023,11,1
2023-11-02,1002,400,2023,11,2
2023-11-03,1003,150,2023,11,3
2023-11-04,1004,300,2023,11,4


In [99]:
weekly_sales = df['Sales'].resample('W').sum() #resample W stands for weekly calculation of sales
weekly_sales

OrderDate
2023-11-05    1100
Freq: W-SUN, Name: Sales, dtype: int64

In [100]:
df = pd.read_csv('website_traffic_data_datetime.csv')
df

Unnamed: 0,Source,SessionDuration,Visits,Date
0,Direct,27.53,267,2023-01-01
1,Referral,25.65,476,2023-01-02
2,Social,14.03,441,2023-01-03
3,Referral,3.77,473,2023-01-04
4,Referral,11.75,434,2023-01-05
...,...,...,...,...
95,Social,3.79,226,2023-04-06
96,Referral,28.28,277,2023-04-07
97,Paid,12.53,286,2023-04-08
98,Paid,16.01,97,2023-04-09


In [101]:
df['Date']=pd.to_datetime(df['Date'])
df

Unnamed: 0,Source,SessionDuration,Visits,Date
0,Direct,27.53,267,2023-01-01
1,Referral,25.65,476,2023-01-02
2,Social,14.03,441,2023-01-03
3,Referral,3.77,473,2023-01-04
4,Referral,11.75,434,2023-01-05
...,...,...,...,...
95,Social,3.79,226,2023-04-06
96,Referral,28.28,277,2023-04-07
97,Paid,12.53,286,2023-04-08
98,Paid,16.01,97,2023-04-09


In [102]:
df['Date']=pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True) #Set the index to date columns
df

Unnamed: 0_level_0,Source,SessionDuration,Visits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01,Direct,27.53,267
2023-01-02,Referral,25.65,476
2023-01-03,Social,14.03,441
2023-01-04,Referral,3.77,473
2023-01-05,Referral,11.75,434
...,...,...,...
2023-04-06,Social,3.79,226
2023-04-07,Referral,28.28,277
2023-04-08,Paid,12.53,286
2023-04-09,Paid,16.01,97


In [104]:
monthly_visits = df['Visits'].resample('M').sum()
# resample M sets the sum calculation on Month basis
# M = Month End
# MS = Month Start 
# Resampling only works if your DataFrame has a DatetimeIndex.
monthly_visits

Date
2023-01-31    7813
2023-02-28    7137
2023-03-31    8384
2023-04-30    2233
Freq: M, Name: Visits, dtype: int64