# Tabular data pre-processing

# import all required libraries

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set seed for reproducibility: seed both NumPy's global RNG and the
# standard-library `random` module with the same fixed value
np.random.seed(42)
random.seed(42)


# load data set

In [4]:
# Read the raw CSV (path is relative to the notebook's working directory)
df = pd.read_csv('sample_data.csv')

In [5]:
# Preview the first 100 rows of the loaded frame.
# NOTE(review): the frame has 200 rows, so this shows half of it; a plain
# df.head() is usually enough for a sanity check.
df.head(100)

Unnamed: 0,Temperature,Humidity,Condition,Date,WindSpeed,CityCode
0,19.966733,76.0,Cloudy,01/14/2025,23.91,1782
1,13.404275,54.0,Rainy,13-01-2025,31.72,1009
2,24.022196,38.0,Rainy,01/12/2025,26.20,1983
3,19.213931,33.0,Cloudy,11-01-2025,39.69,1377
4,25.402299,43.0,Cloudy,01/10/2025,11.35,1632
...,...,...,...,...,...,...
95,16.007805,61.0,Sunny,11-10-2024,9.77,1998
96,26.706560,72.0,Sunny,10/10/2024,33.24,1707
97,15.581111,82.0,Sunny,09-10-2024,34.58,1780
98,22.869835,59.0,Sunny,10/08/2024,30.74,1560


In [6]:
# Check the DataFrame's structure: column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  200 non-null    float64
 1   Humidity     194 non-null    float64
 2   Condition    191 non-null    object 
 3   Date         200 non-null    object 
 4   WindSpeed    195 non-null    float64
 5   CityCode     200 non-null    int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 9.5+ KB


In [7]:
# Count the missing (NaN) values in each column

df.isna().sum()

Temperature    0
Humidity       6
Condition      9
Date           0
WindSpeed      5
CityCode       0
dtype: int64

In [8]:
# Handle missing values
#
# Each filled Series is assigned back to its column instead of calling
# `fillna(..., inplace=True)` on `df[col]`. That in-place call operates on an
# intermediate object: recent pandas 2.x emits a FutureWarning for it, and it
# no longer updates `df` once Copy-on-Write is the default (pandas 3.0).
# Numeric gaps get the column mean; missing Condition labels become 'Unknown'.
df['Humidity'] = df['Humidity'].fillna(df['Humidity'].mean())
df['WindSpeed'] = df['WindSpeed'].fillna(df['WindSpeed'].mean())
df['Condition'] = df['Condition'].fillna('Unknown')


In [8]:
# Re-count missing values after the fill step; every column should now report 0
df.isna().sum()

Temperature    0
Humidity       0
Condition      0
Date           0
WindSpeed      0
CityCode       0
dtype: int64

# Changing Data Types

# checking irregularity in date

In [9]:
# List the distinct raw date strings to see which formats occur in the column
df['Date'].unique()

array(['01/14/2025', '13-01-2025', '01/12/2025', '11-01-2025',
       '01/10/2025', '09-01-2025', '01/08/2025', '07-01-2025',
       '01/06/2025', '05-01-2025', '01/04/2025', '03-01-2025',
       '01/02/2025', '01-01-2025', '12/31/2024', '30-12-2024',
       '12/29/2024', '28-12-2024', '12/27/2024', '26-12-2024',
       '12/25/2024', '24-12-2024', '12/23/2024', '22-12-2024',
       '12/21/2024', '20-12-2024', '12/19/2024', '18-12-2024',
       '12/17/2024', '16-12-2024', '12/15/2024', '14-12-2024',
       '12/13/2024', '12-12-2024', '12/11/2024', '10-12-2024',
       '12/09/2024', '08-12-2024', '12/07/2024', '06-12-2024',
       '12/05/2024', '04-12-2024', '12/03/2024', '02-12-2024',
       '12/01/2024', '30-11-2024', '11/29/2024', '28-11-2024',
       '11/27/2024', '26-11-2024', '11/25/2024', '24-11-2024',
       '11/23/2024', '22-11-2024', '11/21/2024', '20-11-2024',
       '11/19/2024', '18-11-2024', '11/17/2024', '16-11-2024',
       '11/15/2024', '14-11-2024', '11/13/2024', '12-11

In [10]:
# Function to parse date strings in different formats
def parse_date(date_str, formats=('%m/%d/%Y', '%d-%m-%Y')):
    """Parse a date string written in one of several known formats.

    Parameters
    ----------
    date_str : str or datetime
        The raw value. Surrounding whitespace on strings is ignored. A value
        that is already a ``datetime`` (including ``pandas.Timestamp``, which
        subclasses it) is returned unchanged, so re-running the conversion
        cell on an already-parsed column does not fail.
    formats : iterable of str, optional
        ``strptime`` patterns tried in order; the first that matches wins.
        Defaults to US-style ``MM/DD/YYYY`` and dash-separated ``DD-MM-YYYY``.

    Returns
    -------
    datetime
        The parsed (or passed-through) value.

    Raises
    ------
    ValueError
        If the string matches none of ``formats``.
    TypeError
        If ``date_str`` is neither a string nor a ``datetime`` (raised by
        ``datetime.strptime``).
    """
    # Already parsed: nothing to do (keeps the calling cell idempotent).
    if isinstance(date_str, datetime):
        return date_str

    # Tolerate stray padding from the CSV; leave non-strings untouched so
    # strptime reports the type problem itself.
    text = date_str.strip() if isinstance(date_str, str) else date_str

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # This pattern did not match; fall through to the next one.
            continue
    raise ValueError(f"No valid date format found for {date_str}")

# Applying the function to convert date strings to datetime
df['Date'] = df['Date'].apply(parse_date)

# CityCode is already an integer column (int64 as loaded), not a string.
# Downcast it to an explicit fixed-width int32: `astype(int)` resolves to a
# platform-dependent width (the recorded run produced int32), so naming the
# dtype keeps the result the same on every machine.
df['CityCode'] = df['CityCode'].astype('int32')

# Display updated data types and information
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Temperature  200 non-null    float64       
 1   Humidity     200 non-null    float64       
 2   Condition    200 non-null    object        
 3   Date         200 non-null    datetime64[ns]
 4   WindSpeed    200 non-null    float64       
 5   CityCode     200 non-null    int32         
dtypes: datetime64[ns](1), float64(3), int32(1), object(1)
memory usage: 8.7+ KB


In [11]:
# Preview the first 10 rows to confirm the Date column now holds parsed dates
df.head(10)

Unnamed: 0,Temperature,Humidity,Condition,Date,WindSpeed,CityCode
0,19.966733,76.0,Cloudy,2025-01-14,23.91,1782
1,13.404275,54.0,Rainy,2025-01-13,31.72,1009
2,24.022196,38.0,Rainy,2025-01-12,26.2,1983
3,19.213931,33.0,Cloudy,2025-01-11,39.69,1377
4,25.402299,43.0,Cloudy,2025-01-10,11.35,1632
5,10.775474,66.0,Cloudy,2025-01-09,14.37,1636
6,14.914039,63.0,Sunny,2025-01-08,15.35,1681
7,22.583347,40.0,Rainy,2025-01-07,18.63,1583
8,16.004447,70.0,Sunny,2025-01-06,33.44,1151
9,19.177322,85.0,Sunny,2025-01-05,9.43,1719


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) using describe()'s defaults
# NOTE(review): CityCode looks like an identifier, so its mean/std are
# presumably not meaningful — confirm before interpreting them.
df.describe()

Unnamed: 0,Temperature,Humidity,WindSpeed,CityCode
count,200.0,200.0,200.0,200.0
mean,19.373721,58.675258,20.191128,1526.425
std,5.133649,16.227426,10.838986,272.212661
min,6.703243,30.0,0.1,1009.0
25%,15.906874,44.0,11.745,1298.75
50%,19.387027,58.675258,20.885,1553.0
75%,22.955097,70.0,28.32,1740.5
max,32.849634,89.0,39.73,1998.0
