In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [4]:
## Reading dataset from csv file
df = pd.read_csv('Datasets/temparatures.csv')

In [5]:
## Dropping NaN values from dataset
newdf = df.dropna()

## Convertiong str column to datetime object
newdf['dt'] = pd.to_datetime(newdf['dt'])

## Updating dataframe 1850 and later
newdf = newdf[newdf['dt'] > '1850-01-01']

# Splitting date column
newdf["Year"] = newdf["dt"].dt.year
newdf["Month"] = newdf["dt"].dt.month
newdf["Day"] = newdf["dt"].dt.day

# Create an instance of the LabelEncoder class
encoder = LabelEncoder()

# Fit the encoder to the "Country" column
encoder.fit(newdf["Country"])

# Transform the "Country" column and store the result in a new column
newdf["Country_encoded"] = encoder.transform(newdf["Country"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['dt'] = pd.to_datetime(newdf['dt'])


In [6]:
newdf.reset_index(level=0, inplace=True)
newdf.head()

Unnamed: 0,index,dt,AverageTemperature,AverageTemperatureUncertainty,Country,Year,Month,Day,Country_encoded
0,1275,1850-02-01,-2.309,1.603,Åland,1850,2,1,241
1,1276,1850-03-01,-4.801,3.033,Åland,1850,3,1,241
2,1277,1850-04-01,1.242,2.008,Åland,1850,4,1,241
3,1278,1850-05-01,7.92,0.881,Åland,1850,5,1,241
4,1279,1850-06-01,12.704,0.948,Åland,1850,6,1,241


In [7]:
# Split the dataset into features (X) and target (y)
X = newdf[["Country_encoded", "Year", "Month", "Day"]]
y = newdf["AverageTemperature"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)

# Display the error
print(f"Mean Absolute Error: {mae:.2f}")

Mean Absolute Error: 8.44


In [8]:
# Create a NumPy array with the desired values
data = np.array([[223, year] for year in range(2025, 2501)])

# Create the DataFrame with the desired column names
tempDf = pd.DataFrame(data, columns=["Country_encoded", "Year"])

# Add the "Month" and "Day" columns with a single value repeated in each row
tempDf["Month"] = 1
tempDf["Day"] = 1

turkeyPredictions = model.predict(tempDf)

In [9]:
# Create a DataFrame with the year and value columns
turkeyDf = pd.DataFrame({"Year": range(2025, 2501), "Average Temperature": turkeyPredictions})

# Create the timeseries plot
fig = px.line(turkeyDf, x="Year", y="Average Temperature")

# Show the plot
fig.show()
