In [4]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'GlobalLandTemperaturesByCity.csv'
df = pd.read_csv('GlobalLandTemperaturesByCity.csv')

# Display the first few rows of the dataset to understand its structure
df.head()


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [69]:
# Parse the 'dt' column as dates
df['dt'] = pd.to_datetime(df['dt'])

In [70]:
# Check for missing values and handle them
# This is a simple approach, consider other methods depending on your needs
df = df.dropna(subset=['AverageTemperature'])

In [71]:
# Ensure correct data types
df['AverageTemperature'] = pd.to_numeric(df['AverageTemperature'], errors='coerce')
df['AverageTemperatureUncertainty'] = pd.to_numeric(df['AverageTemperatureUncertainty'], errors='coerce')
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

In [72]:
# Feature engineering: Adding year and month columns
df['Year'] = df['dt'].dt.year
df['Month'] = df['dt'].dt.month

In [73]:
# Filter out data before the year 1760
df = df[df['dt'].dt.year >= 1760]

In [74]:
#Removing the column AverageTemperatureUncertainty
df = df.drop('AverageTemperatureUncertainty', axis=1)

In [75]:
# Calculating Q1 and Q3
Q1 = df['AverageTemperature'].quantile(0.25)
Q3 = df['AverageTemperature'].quantile(0.75)
IQR = Q3 - Q1

# Filtering out the outliers
filtered_df = df[~((df['AverageTemperature'] < (Q1 - 1.5 * IQR)) |(df['AverageTemperature'] > (Q3 + 1.5 * IQR)))]
# The resulting DataFrame 'filtered_df' will not contain outliers in the 'AverageTemperature' column

In [76]:
# Save the cleaned data if needed
df.to_csv('cleaned_data.csv', index=False)


In [77]:
df.head()

Unnamed: 0,dt,AverageTemperature,City,Country,Latitude,Longitude,Year,Month
194,1760-01-01,-4.787,Århus,Denmark,,,1760,1
195,1760-02-01,-0.386,Århus,Denmark,,,1760,2
196,1760-03-01,0.869,Århus,Denmark,,,1760,3
197,1760-04-01,5.651,Århus,Denmark,,,1760,4
198,1760-05-01,11.112,Århus,Denmark,,,1760,5


In [78]:
df.describe()

Unnamed: 0,dt,AverageTemperature,Latitude,Longitude,Year,Month
count,8146208,8146208.0,0.0,0.0,8146208.0,8146208.0
mean,1913-01-02 03:47:04.929981440,16.81198,,,1912.547,6.49482
min,1760-01-01 00:00:00,-42.704,,,1760.0,1.0
25%,1867-07-24 06:00:00,10.45,,,1867.0,3.0
50%,1916-12-01 00:00:00,18.94,,,1916.0,6.0
75%,1965-04-01 00:00:00,25.253,,,1965.0,9.0
max,2013-09-01 00:00:00,39.651,,,2013.0,12.0
std,,10.33583,,,62.40187,3.451076


In [79]:
df.count()

dt                    8146208
AverageTemperature    8146208
City                  8146208
Country               8146208
Latitude                    0
Longitude                   0
Year                  8146208
Month                 8146208
dtype: int64