In [2]:
 import pandas as pd
 import numpy as np

In [3]:
import pandas as pd
import random
from datetime import datetime, timedelta

# ---------- 1. Customers ----------
customer_ids = [f'CUST{100+i}' for i in range(15)]
customers = pd.DataFrame({
    'customer_id': customer_ids,
    'customer_name': [f'Customer_{i}' for i in range(len(customer_ids))],
    'country': random.choices(['USA', 'UK', 'Germany', 'India'], k=len(customer_ids))
})
customers.to_csv("customers.csv", index=False)

# ---------- 2. Products ----------
product_ids = [f'PROD{200+i}' for i in range(10)]
products = pd.DataFrame({
    'product_id': product_ids,
    'product_name': [f'Product_{i}' for i in range(len(product_ids))],
    'price': [random.randint(20, 200) for _ in range(len(product_ids))]
})
products.to_csv("products.csv", index=False)

# ---------- 3. Orders (simulate realistic purchases) ----------
orders = []

def random_date():
    start = datetime(2025, 7, 1)
    end = datetime(2025, 7, 20)
    return (start + timedelta(days=random.randint(0, (end - start).days))).strftime("%Y-%m-%d")

for i in range(60):  # 60 orders to simulate multiple purchases
    customer = random.choice(customer_ids)
    product = random.choice(product_ids)
    order = {
        'order_id': f'ORD{i+1:04}',
        'customer_id': customer,
        'product_id': product,
        'quantity': random.randint(1, 5),
        'order_date': random_date()
    }
    orders.append(order)

orders_df = pd.DataFrame(orders)
orders_df.to_csv("orders.csv", index=False)

In [4]:
orders = pd.DataFrame(orders)
orders

Unnamed: 0,order_id,customer_id,product_id,quantity,order_date
0,ORD0001,CUST106,PROD207,3,2025-07-14
1,ORD0002,CUST114,PROD206,3,2025-07-20
2,ORD0003,CUST112,PROD208,3,2025-07-10
3,ORD0004,CUST112,PROD202,1,2025-07-15
4,ORD0005,CUST101,PROD204,2,2025-07-08
5,ORD0006,CUST109,PROD206,5,2025-07-03
6,ORD0007,CUST101,PROD203,2,2025-07-01
7,ORD0008,CUST105,PROD203,1,2025-07-08
8,ORD0009,CUST103,PROD206,3,2025-07-19
9,ORD0010,CUST106,PROD201,4,2025-07-04


In [5]:
customers =  pd.DataFrame(customers)
customers

Unnamed: 0,customer_id,customer_name,country
0,CUST100,Customer_0,UK
1,CUST101,Customer_1,USA
2,CUST102,Customer_2,UK
3,CUST103,Customer_3,USA
4,CUST104,Customer_4,UK
5,CUST105,Customer_5,UK
6,CUST106,Customer_6,UK
7,CUST107,Customer_7,UK
8,CUST108,Customer_8,USA
9,CUST109,Customer_9,USA


In [6]:
products = pd.DataFrame(products)
products

Unnamed: 0,product_id,product_name,price
0,PROD200,Product_0,60
1,PROD201,Product_1,43
2,PROD202,Product_2,50
3,PROD203,Product_3,187
4,PROD204,Product_4,82
5,PROD205,Product_5,169
6,PROD206,Product_6,178
7,PROD207,Product_7,190
8,PROD208,Product_8,147
9,PROD209,Product_9,116


In [7]:
orders['order_date'] = pd.to_datetime(orders['order_date'])
orders

Unnamed: 0,order_id,customer_id,product_id,quantity,order_date
0,ORD0001,CUST106,PROD207,3,2025-07-14
1,ORD0002,CUST114,PROD206,3,2025-07-20
2,ORD0003,CUST112,PROD208,3,2025-07-10
3,ORD0004,CUST112,PROD202,1,2025-07-15
4,ORD0005,CUST101,PROD204,2,2025-07-08
5,ORD0006,CUST109,PROD206,5,2025-07-03
6,ORD0007,CUST101,PROD203,2,2025-07-01
7,ORD0008,CUST105,PROD203,1,2025-07-08
8,ORD0009,CUST103,PROD206,3,2025-07-19
9,ORD0010,CUST106,PROD201,4,2025-07-04


In [8]:
table  = orders.set_index('product_id').join(products.set_index('product_id'), on = 'product_id', how = 'left')
table

Unnamed: 0_level_0,order_id,customer_id,quantity,order_date,product_name,price
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PROD207,ORD0001,CUST106,3,2025-07-14,Product_7,190
PROD206,ORD0002,CUST114,3,2025-07-20,Product_6,178
PROD208,ORD0003,CUST112,3,2025-07-10,Product_8,147
PROD202,ORD0004,CUST112,1,2025-07-15,Product_2,50
PROD204,ORD0005,CUST101,2,2025-07-08,Product_4,82
PROD206,ORD0006,CUST109,5,2025-07-03,Product_6,178
PROD203,ORD0007,CUST101,2,2025-07-01,Product_3,187
PROD203,ORD0008,CUST105,1,2025-07-08,Product_3,187
PROD206,ORD0009,CUST103,3,2025-07-19,Product_6,178
PROD201,ORD0010,CUST106,4,2025-07-04,Product_1,43


In [9]:
table['revenue'] = table['quantity']*table['price']
table

Unnamed: 0_level_0,order_id,customer_id,quantity,order_date,product_name,price,revenue
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PROD207,ORD0001,CUST106,3,2025-07-14,Product_7,190,570
PROD206,ORD0002,CUST114,3,2025-07-20,Product_6,178,534
PROD208,ORD0003,CUST112,3,2025-07-10,Product_8,147,441
PROD202,ORD0004,CUST112,1,2025-07-15,Product_2,50,50
PROD204,ORD0005,CUST101,2,2025-07-08,Product_4,82,164
PROD206,ORD0006,CUST109,5,2025-07-03,Product_6,178,890
PROD203,ORD0007,CUST101,2,2025-07-01,Product_3,187,374
PROD203,ORD0008,CUST105,1,2025-07-08,Product_3,187,187
PROD206,ORD0009,CUST103,3,2025-07-19,Product_6,178,534
PROD201,ORD0010,CUST106,4,2025-07-04,Product_1,43,172


In [10]:
monthly_revenue = table.groupby(table['order_date'].dt.to_period('M'))['revenue'].sum()
monthly_revenue

Unnamed: 0_level_0,revenue
order_date,Unnamed: 1_level_1
2025-07,18156


In [11]:
# calculate the which countrywise customers purchase more

customer_table = orders.set_index('customer_id').join(customers.set_index('customer_id'),on = 'customer_id', how = 'left')
customer_table


Unnamed: 0_level_0,order_id,product_id,quantity,order_date,customer_name,country
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUST106,ORD0001,PROD207,3,2025-07-14,Customer_6,UK
CUST114,ORD0002,PROD206,3,2025-07-20,Customer_14,UK
CUST112,ORD0003,PROD208,3,2025-07-10,Customer_12,UK
CUST112,ORD0004,PROD202,1,2025-07-15,Customer_12,UK
CUST101,ORD0005,PROD204,2,2025-07-08,Customer_1,USA
CUST109,ORD0006,PROD206,5,2025-07-03,Customer_9,USA
CUST101,ORD0007,PROD203,2,2025-07-01,Customer_1,USA
CUST105,ORD0008,PROD203,1,2025-07-08,Customer_5,UK
CUST103,ORD0009,PROD206,3,2025-07-19,Customer_3,USA
CUST106,ORD0010,PROD201,4,2025-07-04,Customer_6,UK


In [12]:
df = customer_table.groupby(['country'])['customer_name'].max()
df

Unnamed: 0_level_0,customer_name
country,Unnamed: 1_level_1
Germany,Customer_13
UK,Customer_7
USA,Customer_9


In [13]:
df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],
                   'day': [4, 5]})
pd.to_datetime(df)

Unnamed: 0,0
0,2015-02-04
1,2016-03-05


In [14]:
pd.to_datetime(1490195805, unit='s')

Timestamp('2017-03-22 15:16:45')

In [15]:
pd.to_datetime([1, 2, 3], unit='D',
               origin=pd.Timestamp('1960-01-01'))

DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

In [16]:
pd.to_datetime('2018-10-26 12:00:00.0000000011',
               format='%Y-%m-%d %H:%M:%S.%f')

Timestamp('2018-10-26 12:00:00.000000001')

In [17]:
date = pd.to_datetime('12000003',format='%Y%m%d', errors = 'coerce')
date

NaT

In [18]:
result = pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
result

DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)

In [19]:
from datetime import datetime
pd.to_datetime(["2020-01-01 01:00:00-01:00",
                datetime(2020, 1, 1, 3, 0)])

  pd.to_datetime(["2020-01-01 01:00:00-01:00",


Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')

In [21]:
time = pd.to_datetime(['2020-10-25 02:00 +0200','2020-10-25 04:00 +0100'])
print(time)


Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object')


  time = pd.to_datetime(['2020-10-25 02:00 +0200','2020-10-25 04:00 +0100'])


In [22]:
pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
               utc=True)

DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None)

In [23]:
df = pd.DataFrame({'date_col':['2023-01-15','2024-07-20','2025-03-05']})


df['date_col'] = pd.to_datetime(df['date_col'])


df['year'] = df['date_col'].dt.year
df['month'] = df['date_col'].dt.month
df['year'] = df['date_col'].dt.day

In [25]:
date = "2010-02-10 14:30:00"
obj = pd.to_datetime(date)
print(obj)
print(type(obj))

2010-02-10 14:30:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [26]:
data = ['2010-01-01', '2023-01-02','2024-05-01']
dt = pd.to_datetime(data)
dt

DatetimeIndex(['2010-01-01', '2023-01-02', '2024-05-01'], dtype='datetime64[ns]', freq=None)

In [37]:
df = pd.DataFrame({'datetime': pd.to_datetime(['2024-07-03 10:00', '2025-08-05 14:00'])})
print(df)

df['year'] = df['datetime'].dt.year
df

             datetime
0 2024-07-03 10:00:00
1 2025-08-05 14:00:00


Unnamed: 0,datetime,year
0,2024-07-03 10:00:00,2024
1,2025-08-05 14:00:00,2025


In [41]:
import pandas as pd

# Create a sample DataFrame with a DatetimeIndex
data = {'value': [10, 12, 8, 15, 20, 18, 25, 22]}
dates = pd.to_datetime(['2025-01-01 09:00', '2025-01-01 10:00', '2025-01-01 11:00',
                        '2025-01-02 09:00', '2025-01-02 10:00', '2025-01-02 11:00',
                        '2025-01-03 09:00', '2025-01-03 10:00'])
df = pd.DataFrame(data, index=dates)

df_daily_mean = df.resample('D').mean()
print("Daily Mean:\n", df_daily_mean)



# Upsample from hourly to 30-minute intervals, forward filling missing values
df_30min_ffill = df.resample('30min').ffill()
print("\n30-minute intervals (forward fill):\n", df_30min_ffill)

Daily Mean:
                 value
2025-01-01  10.000000
2025-01-02  17.666667
2025-01-03  23.500000

30-minute intervals (forward fill):
                      value
2025-01-01 09:00:00     10
2025-01-01 09:30:00     10
2025-01-01 10:00:00     12
2025-01-01 10:30:00     12
2025-01-01 11:00:00      8
...                    ...
2025-01-03 08:00:00     18
2025-01-03 08:30:00     18
2025-01-03 09:00:00     25
2025-01-03 09:30:00     25
2025-01-03 10:00:00     22

[99 rows x 1 columns]


In [42]:
index = pd.date_range('1/1/2000', periods=9, freq='min')
series = pd.Series(range(9), index=index)
series

Unnamed: 0,0
2000-01-01 00:00:00,0
2000-01-01 00:01:00,1
2000-01-01 00:02:00,2
2000-01-01 00:03:00,3
2000-01-01 00:04:00,4
2000-01-01 00:05:00,5
2000-01-01 00:06:00,6
2000-01-01 00:07:00,7
2000-01-01 00:08:00,8


In [45]:
# Sample time-series data
data = {
    'date': pd.date_range('2023-01-01', periods=10, freq='D'),
    'sales': [200, 220, 250, 230, 210, 300, 280, 270, 260, 240]
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])

# Resampling sales data to get weekly sums
weekly_sales = df.resample('W', on='date').sum()

print(weekly_sales)

            sales
date             
2023-01-01    200
2023-01-08   1760
2023-01-15    500


In [61]:
index = pd.date_range('1/07/2025', '30/07/2025')
series = pd.Series(index=index)
series

Unnamed: 0,0
2025-01-07,
2025-01-08,
2025-01-09,
2025-01-10,
2025-01-11,
...,...
2025-07-26,
2025-07-27,
2025-07-28,
2025-07-29,


In [62]:
series.resample('3min').sum()

Unnamed: 0,0
2025-01-07 00:00:00,0.0
2025-01-07 00:03:00,0.0
2025-01-07 00:06:00,0.0
2025-01-07 00:09:00,0.0
2025-01-07 00:12:00,0.0
...,...
2025-07-29 23:48:00,0.0
2025-07-29 23:51:00,0.0
2025-07-29 23:54:00,0.0
2025-07-29 23:57:00,0.0


In [63]:
series.resample('3min', label='right', closed='right').sum()

Unnamed: 0,0
2025-01-07 00:00:00,0.0
2025-01-07 00:03:00,0.0
2025-01-07 00:06:00,0.0
2025-01-07 00:09:00,0.0
2025-01-07 00:12:00,0.0
...,...
2025-07-29 23:48:00,0.0
2025-07-29 23:51:00,0.0
2025-07-29 23:54:00,0.0
2025-07-29 23:57:00,0.0


In [64]:
series.resample('30s').asfreq()[0:5]

Unnamed: 0,0
2025-01-07 00:00:00,
2025-01-07 00:00:30,
2025-01-07 00:01:00,
2025-01-07 00:01:30,
2025-01-07 00:02:00,


In [68]:
series.resample('3min').sum()

Unnamed: 0,0
2025-01-07 00:00:00,0.0
2025-01-07 00:03:00,0.0
2025-01-07 00:06:00,0.0
2025-01-07 00:09:00,0.0
2025-01-07 00:12:00,0.0
...,...
2025-07-29 23:48:00,0.0
2025-07-29 23:51:00,0.0
2025-07-29 23:54:00,0.0
2025-07-29 23:57:00,0.0


In [74]:
series.resample('3min', label ='left').sum()

Unnamed: 0,0
2025-01-07 00:00:00,0.0
2025-01-07 00:03:00,0.0
2025-01-07 00:06:00,0.0
2025-01-07 00:09:00,0.0
2025-01-07 00:12:00,0.0
...,...
2025-07-29 23:48:00,0.0
2025-07-29 23:51:00,0.0
2025-07-29 23:54:00,0.0
2025-07-29 23:57:00,0.0


In [75]:
def custom_resampler(arraylike):
    return np.sum(arraylike) + 5

series.resample('3min').apply(custom_resampler)

Unnamed: 0,0
2025-01-07 00:00:00,5.0
2025-01-07 00:03:00,5.0
2025-01-07 00:06:00,5.0
2025-01-07 00:09:00,5.0
2025-01-07 00:12:00,5.0
...,...
2025-07-29 23:48:00,5.0
2025-07-29 23:51:00,5.0
2025-07-29 23:54:00,5.0
2025-07-29 23:57:00,5.0


In [76]:
days = pd.date_range('2025-03-20',periods=30,)

Unnamed: 0,Unnamed: 1,price,volume
2000-01-01,morning,10,50
2000-01-01,afternoon,11,60
2000-01-02,morning,9,40
2000-01-02,afternoon,13,100
2000-01-03,morning,14,50
2000-01-03,afternoon,18,100
2000-01-04,morning,17,40
2000-01-04,afternoon,19,50


In [77]:
days = pd.date_range('1/1/2000', periods=4, freq='D')
d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
      'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
df2 = pd.DataFrame(
    d2,
    index=pd.MultiIndex.from_product(
        [days, ['morning', 'afternoon']]
    )
)
df2

Unnamed: 0,Unnamed: 1,price,volume
2000-01-01,morning,10,50
2000-01-01,afternoon,11,60
2000-01-02,morning,9,40
2000-01-02,afternoon,13,100
2000-01-03,morning,14,50
2000-01-03,afternoon,18,100
2000-01-04,morning,17,40
2000-01-04,afternoon,19,50


In [79]:
start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
rng = pd.date_range(start, end, freq='7min')
ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
ts

Unnamed: 0,0
2000-10-01 23:30:00,0
2000-10-01 23:37:00,3
2000-10-01 23:44:00,6
2000-10-01 23:51:00,9
2000-10-01 23:58:00,12
2000-10-02 00:05:00,15
2000-10-02 00:12:00,18
2000-10-02 00:19:00,21
2000-10-02 00:26:00,24


In [81]:
ts.resample('17min', origin='2000-01-01').sum()

Unnamed: 0,0
2000-10-01 23:24:00,3
2000-10-01 23:41:00,15
2000-10-01 23:58:00,45
2000-10-02 00:15:00,45


In [None]:
ts.resample('17min', origin='start').sum()

In [82]:
ts.resample('17min', offset='23h30min').sum()

Unnamed: 0,0
2000-10-01 23:30:00,9
2000-10-01 23:47:00,21
2000-10-02 00:04:00,54
2000-10-02 00:21:00,24


In [83]:
ts.resample('17min', origin='end').sum()

Unnamed: 0,0
2000-10-01 23:35:00,0
2000-10-01 23:52:00,18
2000-10-02 00:09:00,27
2000-10-02 00:26:00,63


In [104]:
import pandas as pd
import datetime as dt

# Create a sample DataFrame with a DatetimeIndex
dates = pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
data = {'Value': [10, 20, 15, 25, 30]}
df1 = pd.DataFrame(data, index=dates)



# Slice by a specific date range (inclusive of start and end)
df_range = df1['2023-01-02':'2023-01-04']

# Slice using datetime objects
start_date = dt.datetime(2023, 1, 3)
end_date = dt.datetime(2023, 1, 5)
df_datetime_range = df1[start_date:end_date]

print(df_datetime_range)


            Value
2023-01-03     15
2023-01-04     25
2023-01-05     30


In [118]:
import pandas as pd

# Create a sample DataFrame with a DatetimeIndex
dates = pd.date_range(start='2020-01-01', end='2022-12-31', freq='D')
df = pd.DataFrame({'value': range(len(dates))}, index=dates)

# Slice for a specific year (e.g., 2021)
df_2021 = df['2021']
print(df_2021.head())

KeyError: '2021'

In [119]:
original_df = pd.DataFrame(
    {"foo": range(5), "bar": range(5, 10)}
   )
original_df

Unnamed: 0,foo,bar
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [122]:
df_parquet_bytes = original_df.to_parquet()
from io import BytesIO
restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
restored_df

Unnamed: 0,foo,bar
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [166]:
# Exercise 18:
# ● Parse date column as datetime.
# ● Resample data to weekly aggregates.
# ● Calculate % change week over week.

date = { 'values':[20,10,15,14,18,23,22,40]}
dates = pd.to_datetime(['2025-01-01 09:00', '2025-01-01 10:00', '2025-01-01 11:00',
                        '2025-01-02 09:00', '2025-01-02 10:00', '2025-01-02 11:00',
                        '2025-01-03 09:00', '2025-01-03 10:00'])


df = pd.DataFrame(data)

In [167]:
df['date'] = pd.to_datetime(dates)
df['date']

Unnamed: 0,date
0,2025-01-01 09:00:00
1,2025-01-01 10:00:00
2,2025-01-01 11:00:00
3,2025-01-02 09:00:00
4,2025-01-02 10:00:00
5,2025-01-02 11:00:00
6,2025-01-03 09:00:00
7,2025-01-03 10:00:00


In [168]:
df.set_index('date',inplace=True)
weekly = df.resample("W").sum()
weekly

Unnamed: 0_level_0,values
date,Unnamed: 1_level_1
2025-01-05,162


In [170]:
  weekly_percent_change = weekly.pct_change()
  weekly_percent_change

Unnamed: 0_level_0,values
date,Unnamed: 1_level_1
2025-01-05,


In [172]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Generate 150 random dates
start_date = datetime(2023, 1, 1)
date_list = [start_date + timedelta(days=i) for i in range(150)]
random_dates = random.choices(date_list, k=150)

# --- WEATHER DATASET ---
weather_data = {
    'date': [d.strftime('%Y-%m-%d') if i % 10 != 0 else d.strftime('%d/%m/%Y') for i, d in enumerate(random_dates)],
    'Temp': [round(np.random.uniform(15, 40), 1) if i % 8 != 0 else 'hot' for i in range(150)],
    'humidity(%)': [np.random.randint(20, 100) if i % 7 != 0 else np.nan for i in range(150)],
    'wind_speed': [round(np.random.uniform(0, 20), 2) if i % 5 != 0 else "fast" for i in range(150)],
    'city': [random.choice(['New York', 'Los Angeles', 'Chicago', np.nan, '']) for _ in range(150)],
}

weather_df = pd.DataFrame(weather_data)

# Add some duplicate rows
weather_df = pd.concat([weather_df, weather_df.iloc[0:5]], ignore_index=True)

# --- SALES DATASET ---
sales_data = {
    'date': [d.strftime('%Y-%m-%d') if i % 6 != 0 else d.strftime('%B %d, %Y') for i, d in enumerate(random_dates)],
    'sales_amount': [round(np.random.uniform(1000, 10000), 2) if i % 10 != 0 else 'high' for i in range(150)],
    'product_id': [random.choice(['P001', 'P002', 'P003', np.nan, 'p001']) for _ in range(150)],
    'units_sold': [np.random.randint(1, 50) if i % 9 != 0 else None for i in range(150)],
    'store_location': [random.choice(['New York', 'Chicago', 'Houston', '', np.nan]) for _ in range(150)],
}

sales_df = pd.DataFrame(sales_data)

# Add some duplicate rows
sales_df = pd.concat([sales_df, sales_df.iloc[2:4]], ignore_index=True)

# --- SAVE TO CSV FILES ---
weather_df.to_csv("weather.csv", index=False)
sales_df.to_csv("sales.csv", index=False)

print("Generated messy 'weather.csv' and 'sales.csv' with 150+ rows each.")

Generated messy 'weather.csv' and 'sales.csv' with 150+ rows each.


In [173]:
weather_data = pd.read_csv('/content/weather.csv')
sales_data  = pd.read_csv('/content/sales.csv')



Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,,
1,2023-01-04,6727.63,P003,28.0,
2,2023-02-11,8208.54,P002,36.0,New York
3,2023-02-03,7094.52,,26.0,Houston
4,2023-04-21,6160.3,p001,8.0,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.0,Houston
148,2023-01-31,1960.8,p001,6.0,Houston
149,2023-02-16,4020.94,P003,2.0,
150,2023-02-11,8208.54,P002,36.0,New York


In [174]:
weather_data.describe()

Unnamed: 0,humidity(%)
count,132.0
mean,57.25
std,22.514584
min,20.0
25%,39.0
50%,56.0
75%,75.25
max,99.0


In [176]:
weather_data.shape

(155, 5)

In [177]:
sales_data.shape

(152, 5)

In [178]:

weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         155 non-null    object 
 1   Temp         155 non-null    object 
 2   humidity(%)  132 non-null    float64
 3   wind_speed   155 non-null    object 
 4   city         92 non-null     object 
dtypes: float64(1), object(4)
memory usage: 6.2+ KB


In [179]:
weather_data.isnull().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),23
wind_speed,0
city,63


In [197]:
weather_data.fillna(method='ffill',inplace=True)
weather_data

  weather_data.fillna(method='ffill',inplace=True)


Unnamed: 0,date,Temp,humidity(%),wind_speed,city
0,06/04/2023,hot,,fast,Chicago
1,2023-01-04,24.4,56.0,15.37,Chicago
2,2023-02-11,38.8,70.0,0.87,Chicago
3,2023-02-03,33.3,63.0,19.89,Chicago
4,2023-04-21,30.0,43.0,9.4,New York
...,...,...,...,...,...
150,06/04/2023,hot,54.0,fast,Chicago
151,2023-01-04,24.4,56.0,15.37,Chicago
152,2023-02-11,38.8,70.0,0.87,Chicago
153,2023-02-03,33.3,63.0,19.89,Chicago


In [200]:
weather_data.isna().sum()

Unnamed: 0,0
date,0
Temp,0
humidity(%),1
wind_speed,0
city,0


In [207]:
df_cleaned = weather_data.dropna(subset=['humidity(%)'])
print(df_cleaned)

           date  Temp  humidity(%) wind_speed         city
1    2023-01-04  24.4         56.0      15.37      Chicago
2    2023-02-11  38.8         70.0       0.87      Chicago
3    2023-02-03  33.3         63.0      19.89      Chicago
4    2023-04-21  30.0         43.0        9.4     New York
5    2023-04-12  18.9         98.0       fast  Los Angeles
..          ...   ...          ...        ...          ...
150  06/04/2023   hot         54.0       fast      Chicago
151  2023-01-04  24.4         56.0      15.37      Chicago
152  2023-02-11  38.8         70.0       0.87      Chicago
153  2023-02-03  33.3         63.0      19.89      Chicago
154  2023-04-21  30.0         43.0        9.4     New York

[154 rows x 5 columns]


In [None]:
value_to_drop = 'Na'
column_to_check = 'City'

# Get the indices of rows where the 'City' column contains 'New York'
indices_to_drop = df[df[column_to_check] == value_to_drop].index

# Drop the rows using their indices
df_cleaned = df.drop(indices_to_drop)

print(df_cleaned)

In [208]:
df_cleaned

(154, 5)

In [209]:
sales_data.isna().sum()

Unnamed: 0,0
date,0
sales_amount,0
product_id,24
units_sold,17
store_location,74


In [210]:
sales_data.fillna(method='ffill',inplace=True)
sales_data

  sales_data.fillna(method='ffill',inplace=True)


Unnamed: 0,date,sales_amount,product_id,units_sold,store_location
0,"April 06, 2023",high,P003,,
1,2023-01-04,6727.63,P003,28.0,
2,2023-02-11,8208.54,P002,36.0,New York
3,2023-02-03,7094.52,P002,26.0,Houston
4,2023-04-21,6160.3,p001,8.0,Houston
...,...,...,...,...,...
147,2023-04-20,4550.7,p001,34.0,Houston
148,2023-01-31,1960.8,p001,6.0,Houston
149,2023-02-16,4020.94,P003,2.0,Houston
150,2023-02-11,8208.54,P002,36.0,New York
