In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'statsmodels'

## Loading Data

In [None]:
import os

# Location of the raw CSV. Set the CITY_TEMPERATURE_CSV environment variable to run
# the notebook on another machine; the original local path remains the default.
DATA_PATH = os.environ.get(
    "CITY_TEMPERATURE_CSV",
    r"C:\Users\DELL\CSV FILES\city_temperature.csv",
)

# low_memory=False parses the file in one pass, which avoids mixed-type inference
# between internal chunks.
data = pd.read_csv(DATA_PATH, low_memory=False)

## Information About Data

In [None]:
# Preview the first rows (head() shows 5 rows by default)
data.head()

*The first 5 rows of the dataset.*

In [None]:
# Preview the last rows (tail() shows 5 rows by default)
data.tail()

*The last 5 rows of the dataset.*

In [None]:
# (number of rows, number of columns) of the raw frame
data.shape

*There are 2,906,327 rows (observations) and 8 columns.*

In [None]:
# Column labels of the raw frame
data.columns 

*These are the column names (attributes).*

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage
data.info()

*These are the data types of the attributes.*

In [None]:
# Distinct region labels present in the data
data['Region'].unique()

***Quick Overview Of Data***

In [None]:
# Render floats with two decimals so describe() is not shown in scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)
data.describe()

*This is the Descriptive Statistics of the Dataset*

## Data Preprocessing 

***Missing Value Check***

In [None]:
# Per-column tally of missing entries
missing_value_counts = data.isna().sum()

# Show the heading, the tally and a trailing blank line
print("Missing value counts:", missing_value_counts, "", sep="\n")


*After checking, it is found that there are 1,450,990 missing values in the State column.*

***Duplicate Data Check***

In [None]:
# duplicated() flags every repeat of an earlier row; summing the flags counts them
duplicated_rows_count = int(data.duplicated().sum())

# Report the tally
print(f"The number of duplicated rows: {duplicated_rows_count}")

*The dataset contains 20,715 duplicated rows.*

***Handling Missing Value***

In [None]:
# Remove the 'State' column (roughly half of its values are missing).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns=['State'], errors='ignore')

# Exclude every row recorded for Burundi.
# NOTE(review): the reason for excluding Burundi is not recorded in this notebook - confirm.
data = data[data['Country'] != 'Burundi']


*In order to handle the missing values in the 'State' column, we have decided to remove the column entirely. This decision was made because approximately half of the entries in this column are missing. Because 'State' is a text column, it has no mean or median to fill the gaps with, and any other imputation may introduce more errors and potentially distort the analysis. Therefore, it was deemed more appropriate to remove the column from the dataset altogether. The same cell also removes all rows recorded for Burundi.*

***Handling Duplicate Value***

In [None]:

# Keep only the first occurrence of every row
data = data.drop_duplicates()

# Re-count to confirm that no repeated rows are left
duplicated_rows_count = int(data.duplicated().sum())
print(f"The number of duplicated rows: {duplicated_rows_count}")

*After removing the duplicates, no duplicated rows remain in the dataset.*

## Exploratory Data Analysis 

***Outlier Detection***

In [None]:
# One box plot per numeric column to expose outliers.
boxplot_color = 'green'
for column_name in ['Month', 'Day', 'Year', 'AvgTemperature']:
    fig, ax = plt.subplots()
    # Pass the values by keyword: the first positional argument of sns.boxplot is
    # interpreted as `x` by seaborn <= 0.11 but as `data` by seaborn >= 0.12.
    sns.boxplot(x=data[column_name], color=boxplot_color,
                boxprops=dict(color=boxplot_color), ax=ax)
    ax.set_title(f'Distribution of {column_name}')

*From the above boxplot, it is evident that there are outliers in the 'year' and 'AvgTemperature' variables.*

In [None]:
# Number of non-null AvgTemperature records per year
data.groupby('Year')['AvgTemperature'].count()

*The counts above show that the years 200, 201, and 2020 have unusually few temperature records, so it has been decided to remove these three years from the dataset. This step is taken to mitigate potential errors in future analyses and modeling.*

In [None]:
# Rows to keep: years 1995-2019 (both inclusive) and a temperature reading above -80
year_in_range = data['Year'].between(1995, 2019)
temperature_above_floor = data['AvgTemperature'] > -80
data = data[year_in_range & temperature_above_floor]

# Shape of the filtered frame
data.shape

*After performing data preprocessing steps, including the removal of null values, duplicate entries, and outliers, the dataset now consists of 2,766,880 rows or data points and 7 columns. These steps were undertaken to ensure the quality and reliability of the dataset for subsequent analysis and modeling.*

***Changing The Scale Of AvgTemperature***

In [None]:
# Convert AvgTemperature from Fahrenheit to Celsius.
# assign() builds a new frame instead of writing a column into the filtered `data`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE: run this cell once per kernel session - a second run would convert the
# already-converted values again.
data = data.assign(AvgTemperature=(data['AvgTemperature'] - 32) * (5 / 9))

*In order to enhance the understanding of temperature, the scale of the 'AvgTemperature' variable has been converted from Fahrenheit to Celsius. This conversion allows for a more intuitive interpretation and analysis of the temperature values in the dataset.*

***Visualization***

In [None]:
# Number of distinct values in every column
for column_name, distinct_count in data.nunique().items():
    print(column_name, ': ', distinct_count)

*In the dataset, there are 7 unique regions, 124 unique countries, 320 unique cities, 25 unique years, and a total of 1,514 average temperature values. These figures provide an overview of the diversity and granularity of the data across different geographic regions, time periods, and temperature measurements.*

In [None]:
# Pairwise correlation of the numeric columns only. Region/Country/City hold text,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_val = data.select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(6, 6))
# `linewidths` is the keyword documented by seaborn.heatmap for the cell borders
sns.heatmap(corr_val, annot=True, linewidths=0.5, linecolor="White", fmt=".2f", ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
# Mean temperature per year over all rows, returned with 'Year' as a regular column
global_avg_temp = data.groupby('Year', as_index=False)['AvgTemperature'].mean()

# Line plot of the yearly means
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(global_avg_temp['Year'], global_avg_temp['AvgTemperature'], marker='o', linewidth=2)
ax.set_title('Global Average Temperature Trend')
ax.set_xlabel('Year')
ax.set_ylabel('Average Temperature (°C)')
ax.grid(True)
plt.show()

*From the above graph, it is evident that the global average temperature has been steadily increasing from the year 1995 to 2019. This observation indicates a clear upward trend in global temperatures over the analyzed period.*

In [None]:
# Yearly mean temperature for every region
region_temp_trend = data.groupby(['Region', 'Year'], as_index=False)['AvgTemperature'].mean()

# Scatter plus one fitted regression line per region
sns.lmplot(x='Year', y='AvgTemperature', hue='Region', data=region_temp_trend, height=6)
plt.title('Average temperature trend by Region over the years', fontsize=16)
plt.xlabel('Year')
plt.ylabel('Average temperature (°C)')
plt.show()


*From the above graph, it is evident that the average temperature is increasing over the years. This trend suggests a long-term rise in global temperatures, indicating the phenomenon of global warming.*

In [None]:
# Region labels remaining after cleaning; used to pick the per-region subsets below
data['Region'].unique()

In [None]:
# Rows that belong to the "Asia" region
Asia_data = data.loc[data['Region'] == 'Asia']

# Yearly mean temperature of each country in the region
country_avg_temp = Asia_data.groupby(['Country', 'Year'], as_index=False)['AvgTemperature'].mean()

# One line per country
plt.figure(figsize=(6, 3))
sns.lineplot(
    data=country_avg_temp, x='Year', y='AvgTemperature',
    hue='Country', units='Country', estimator=None,
    marker='^', dashes=False, lw=1,
)

plt.title("Daily Average Temperature in every Country in Asia")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(title='Countries In Asia', bbox_to_anchor=(1, 1), shadow=True, fontsize='small')
plt.show()


In [None]:
# Rows that belong to the "Africa" region
Africa_data = data.loc[data['Region'] == 'Africa']

# Yearly mean temperature of each country in the region
country_avg_temp = Africa_data.groupby(['Country', 'Year'], as_index=False)['AvgTemperature'].mean()

# One line per country
plt.figure(figsize=(6, 3))
sns.lineplot(
    data=country_avg_temp, x='Year', y='AvgTemperature',
    hue='Country', units='Country', estimator=None,
    marker='^', dashes=False, lw=1,
)

plt.title("Daily Average Temperature in every Country in Africa")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(title='Countries In Africa', bbox_to_anchor=(1, 1), shadow=True, fontsize='small')
plt.show()


In [None]:

# Rows that belong to the "South/Central America & Carribean" region
America_Carribean_data = data.loc[data['Region'] == 'South/Central America & Carribean']

# Yearly mean temperature of each country in the region
country_avg_temp = America_Carribean_data.groupby(['Country', 'Year'], as_index=False)['AvgTemperature'].mean()

# One line per country
plt.figure(figsize=(6, 3))
sns.lineplot(
    data=country_avg_temp, x='Year', y='AvgTemperature',
    hue='Country', units='Country', estimator=None,
    marker='^', dashes=False, lw=1,
)

plt.title("Daily Average Temperature in every Country in South/Central America & Carribean")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(title='Countries In South/Central America & Carribean', bbox_to_anchor=(1, 1),
           shadow=True, fontsize='small')
plt.show()


In [None]:

# Filter the data for the "Europe" region.
# Stored as Europe_data: the rows are European, so the previous name `africa_data`
# was misleading.
Europe_data = data[data['Region'] == 'Europe']

# Group the data by country and year, and calculate the average temperature
country_avg_temp = Europe_data.groupby(['Country', 'Year'])['AvgTemperature'].mean().reset_index()

# Plot the yearly mean temperature of every country in Europe, one line per country
plt.figure(figsize=(10, 5))
sns.lineplot(x='Year', y='AvgTemperature', hue='Country', units='Country', markers=True, dashes=False,
             estimator=None, lw=1, data=country_avg_temp)

plt.title("Daily Average Temperature in every Country in Europe")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(bbox_to_anchor = (1, 1), shadow = True, fontsize = 'small', title = 'Countries In Europe')
plt.show()


***The trend analysis by region over the years did not provide a clear visualization of the temperature increase.***

*To obtain a clearer visualization of the temperature trend and understand how the average temperatures are changing, it would be beneficial to explore the average temperature trends by individual countries and cities. By analyzing the average temperature variations within specific geographic locations, we can gain more detailed insights into temperature patterns and potentially observe clearer evidence of temperature increase or other trends at a more localized level.*

In [None]:
# Countries recorded for the Europe region
Europe_data = data.loc[data['Region'] == 'Europe']
Europe_data['Country'].unique()

In [None]:
# Yearly mean of the numeric columns for Austria.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Austria = data[data['Country'] == 'Austria'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Austria")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Austria.index, avg_temp_Austria.AvgTemperature, color='green', fmt='--', label='Data')
plt.plot(avg_temp_Austria.index, avg_temp_Austria.AvgTemperature, color='black', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for Macedonia.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Macedonia = data[data['Country'] == 'Macedonia'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Macedonia")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Macedonia.index, avg_temp_Macedonia.AvgTemperature, color='red', fmt='--', label='Data')
plt.plot(avg_temp_Macedonia.index, avg_temp_Macedonia.AvgTemperature, color='black', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for Cyprus.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Cyprus = data[data['Country'] == 'Cyprus'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Cyprus")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Cyprus.index, avg_temp_Cyprus.AvgTemperature, color='red', fmt='--', label='Data')
plt.plot(avg_temp_Cyprus.index, avg_temp_Cyprus.AvgTemperature, color='blue', linewidth=2, label='Line')
plt.legend()
plt.show()


In [None]:
# Yearly mean of the numeric columns for Iceland.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Iceland = data[data['Country'] == 'Iceland'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Iceland")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Iceland.index, avg_temp_Iceland.AvgTemperature, color='red', fmt='--', label='Data')
plt.plot(avg_temp_Iceland.index, avg_temp_Iceland.AvgTemperature, color='green', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Cities recorded for the Europe region
Europe_data = data.loc[data['Region'] == 'Europe']
Europe_data['City'].unique()

In [None]:
# Yearly mean of the numeric columns for the city of Vienna.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Vienna = data[data['City'] == 'Vienna'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Vienna")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Vienna.index, avg_temp_Vienna.AvgTemperature, color='black', fmt='*', label='Data')
plt.plot(avg_temp_Vienna.index, avg_temp_Vienna.AvgTemperature, color='magenta', linewidth=3, label='Line')
plt.legend()
plt.show()


In [None]:
# Yearly mean of the numeric columns for the city of Belgrade.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Belgrade = data[data['City'] == 'Belgrade'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Belgrade")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Belgrade.index, avg_temp_Belgrade.AvgTemperature, color='red', fmt='*', label='Data')
plt.plot(avg_temp_Belgrade.index, avg_temp_Belgrade.AvgTemperature, color='black', linewidth=3, label='Line')
plt.legend()
plt.show()


In [None]:
# Yearly mean of the numeric columns for the city of Moscow.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Moscow = data[data['City'] == 'Moscow'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Moscow")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Moscow.index, avg_temp_Moscow.AvgTemperature, color='red', fmt='*', label='Data')
plt.plot(avg_temp_Moscow.index, avg_temp_Moscow.AvgTemperature, color='green', linewidth=3, label='Line')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Yearly mean of the numeric columns for the city of Warsaw.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Warsaw = data[data['City'] == 'Warsaw'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Warsaw")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Warsaw.index, avg_temp_Warsaw.AvgTemperature, color='black', fmt='*', label='Data')
plt.plot(avg_temp_Warsaw.index, avg_temp_Warsaw.AvgTemperature, color='blue', linewidth=2.5, label='Line')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the data for the "Asia" region.
# Stored as Asia_data: the rows are Asian, so the previous name `africa_data`
# was misleading.
Asia_data = data[data['Region'] == 'Asia']

# Group the data by country and year, and calculate the average temperature
country_avg_temp = Asia_data.groupby(['Country', 'Year'])['AvgTemperature'].mean().reset_index()

# Plot the yearly mean temperature of every country in Asia, one line per country
plt.figure(figsize=(8, 5))
sns.lineplot(x='Year', y='AvgTemperature', hue='Country', units='Country', markers=True, dashes=False,
             estimator=None, lw=1, data=country_avg_temp)

plt.title("Daily Average Temperature in every Country in Asia")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(bbox_to_anchor = (1, 1), shadow = True, fontsize = 'small', title = 'Countries In Asia')
plt.show()


*Based on the above figure of daily average temperature in every country in Asia, it can be inferred, although it is not clearly visible, that the temperatures in Asian countries are increasing over time. This suggests a possible trend of rising temperatures in the region. However, further analysis and visualization may be required to establish a more conclusive understanding of temperature patterns in Asian countries.*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows that belong to the "Middle East" region
Middle_East_data = data.loc[data['Region'] == 'Middle East']

# Yearly mean temperature of each country in the region
country_avg_temp_Middle_East = Middle_East_data.groupby(['Country', 'Year'], as_index=False)['AvgTemperature'].mean()

# One line per country
plt.figure(figsize=(6, 3))
sns.lineplot(
    data=country_avg_temp_Middle_East, x='Year', y='AvgTemperature',
    hue='Country', units='Country', estimator=None,
    marker='^', dashes=False, lw=1,
)

plt.title("Daily Average Temperature in every Country in Middle East")
plt.xlabel("Year")
plt.ylabel("Average Temperature")
plt.legend(title='Countries In Middle East', bbox_to_anchor=(1, 1), shadow=True, fontsize='large')
plt.show()


In [None]:
# Yearly mean of the numeric columns for Kuwait.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Kuwait = data[data['Country'] == 'Kuwait'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Kuwait")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as triangle markers
plt.errorbar(avg_temp_Kuwait.index, avg_temp_Kuwait.AvgTemperature, color='green', fmt='^', label='Data')
plt.plot(avg_temp_Kuwait.index, avg_temp_Kuwait.AvgTemperature, color='magenta', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for Turkey.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Turkey = data[data['Country'] == 'Turkey'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Turkey")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as triangle markers
plt.errorbar(avg_temp_Turkey.index, avg_temp_Turkey.AvgTemperature, color='red', fmt='^', label='Data')
plt.plot(avg_temp_Turkey.index, avg_temp_Turkey.AvgTemperature, color='black', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for Saudi Arabia.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Arabia = data[data['Country'] == 'Saudi Arabia'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Saudi Arabia")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as triangle markers
plt.errorbar(avg_temp_Arabia.index, avg_temp_Arabia.AvgTemperature, color='red', fmt='^', label='Data')
plt.plot(avg_temp_Arabia.index, avg_temp_Arabia.AvgTemperature, color='grey', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for Bahrain.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Bahrain = data[data['Country'] == 'Bahrain'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Bahrain")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as triangle markers
plt.errorbar(avg_temp_Bahrain.index, avg_temp_Bahrain.AvgTemperature, color='blue', fmt='^', label='Data')
plt.plot(avg_temp_Bahrain.index, avg_temp_Bahrain.AvgTemperature, color='green', linewidth=3, label='Line')
plt.legend()
plt.show()

In [None]:
# Cities recorded for the Middle East region.
# Stored under its own name so the Europe subset held in `Europe_data` is not
# overwritten with Middle East rows.
Middle_East_data = data[data['Region'] == 'Middle East']
Middle_East_data['City'].unique()

In [None]:
# Yearly mean of the numeric columns for the city of Riyadh.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Riyadh = data[data['City'] == 'Riyadh'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Riyadh")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second line
plt.errorbar(avg_temp_Riyadh.index, avg_temp_Riyadh.AvgTemperature, color='white', fmt='-', label='Data')
plt.plot(avg_temp_Riyadh.index, avg_temp_Riyadh.AvgTemperature, color='black', linewidth=3, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Dhahran.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Dhahran = data[data['City'] == 'Dhahran'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Dhahran")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second line
plt.errorbar(avg_temp_Dhahran.index, avg_temp_Dhahran.AvgTemperature, color='cyan', fmt='-', label='Data')
plt.plot(avg_temp_Dhahran.index, avg_temp_Dhahran.AvgTemperature, color='black', linewidth=3, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Dubai.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Dubai = data[data['City'] == 'Dubai'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Dubai")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second line
plt.errorbar(avg_temp_Dubai.index, avg_temp_Dubai.AvgTemperature, color='yellow', fmt='-', label='Data')
plt.plot(avg_temp_Dubai.index, avg_temp_Dubai.AvgTemperature, color='black', linewidth=3, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Istanbul.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Istanbul = data[data['City'] == 'Istanbul'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Istanbul")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second line
plt.errorbar(avg_temp_Istanbul.index, avg_temp_Istanbul.AvgTemperature, color='green', fmt='-', label='Data')
plt.plot(avg_temp_Istanbul.index, avg_temp_Istanbul.AvgTemperature, color='black', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Yearly mean of the numeric columns for Australia.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Australia = data[data['Country'] == 'Australia'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Australia")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Australia.index, avg_temp_Australia.AvgTemperature, color='blue', fmt='--', label='Data')
plt.plot(avg_temp_Australia.index, avg_temp_Australia.AvgTemperature, color='green', linewidth=2, label='Line')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Yearly mean of the numeric columns for India.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_India = data[data['Country'] == 'India'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in India")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_India.index, avg_temp_India.AvgTemperature, color='black', fmt='--', label='Data')
plt.plot(avg_temp_India.index, avg_temp_India.AvgTemperature, color='green', linewidth=2, label='Line')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Yearly mean of the numeric columns for China.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_China = data[data['Country'] == 'China'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in China")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_China.index, avg_temp_China.AvgTemperature, color='black', fmt='--', label='Data')
plt.plot(avg_temp_China.index, avg_temp_China.AvgTemperature, color='red', linewidth=2, label='Line')
plt.legend()
plt.show()


In [None]:
# Array of distinct city names for each region
data.groupby('Region')['City'].unique()

In [None]:
# Cities recorded for the South/Central America & Carribean region.
# Stored under the region's own name; the previous name `africa_data` was misleading.
America_Carribean_data = data[data['Region'] == 'South/Central America & Carribean']
America_Carribean_data['City'].unique()

In [None]:
# Yearly mean of the numeric columns for the city of Brasilia.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Brasilia = data[data['City'] == 'Brasilia'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Brasilia")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Brasilia.index, avg_temp_Brasilia.AvgTemperature, color='grey', fmt='*', label='Data')
plt.plot(avg_temp_Brasilia.index, avg_temp_Brasilia.AvgTemperature, color='red', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Honolulu.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Honolulu = data[data['City'] == 'Honolulu'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Honolulu")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as circle markers
plt.errorbar(avg_temp_Honolulu.index, avg_temp_Honolulu.AvgTemperature, color='black', fmt='o', label='Data')
plt.plot(avg_temp_Honolulu.index, avg_temp_Honolulu.AvgTemperature, color='grey', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Brisbane.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Brisbane = data[data['City'] == 'Brisbane'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Brisbane")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Brisbane.index, avg_temp_Brisbane.AvgTemperature, color='pink', fmt='--', label='Data')
plt.plot(avg_temp_Brisbane.index, avg_temp_Brisbane.AvgTemperature, color='green', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Moscow.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Moscow = data[data['City'] == 'Moscow'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Moscow")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as star markers
plt.errorbar(avg_temp_Moscow.index, avg_temp_Moscow.AvgTemperature, color='black', fmt='*', label='Data')
plt.plot(avg_temp_Moscow.index, avg_temp_Moscow.AvgTemperature, color='purple', linewidth=2, label='Line')
plt.legend()
plt.show()


In [None]:
# Yearly mean of the numeric columns for the city of Zagreb.
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Zagreb = data[data['City'] == 'Zagreb'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Zagreb")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Zagreb.index, avg_temp_Zagreb.AvgTemperature, color='black', fmt='--', label='Data')
plt.plot(avg_temp_Zagreb.index, avg_temp_Zagreb.AvgTemperature, color='blue', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Yearly mean of the numeric columns for the city of Chennai (Madras).
# numeric_only=True skips the text columns, which mean() refuses to average in pandas >= 2.0.
avg_temp_Chennai = data[data['City'] == 'Chennai (Madras)'].groupby('Year').mean(numeric_only=True)

plt.title("Temperature in every year in Chennai (Madras)")

plt.xlabel('Years')
plt.ylabel('Temperatures (C)')
# No yerr is given, so errorbar() only draws the yearly means as a second series
plt.errorbar(avg_temp_Chennai.index, avg_temp_Chennai.AvgTemperature, color='black', fmt='--', label='Data')
plt.plot(avg_temp_Chennai.index, avg_temp_Chennai.AvgTemperature, color='green', linewidth=2, label='Line')
plt.legend()
plt.show()

In [None]:
# Draw a 2,000-row random subset of the cleaned data for modelling.
# The fixed random_state makes the draw, and therefore every metric computed from
# it below, reproducible across kernel restarts.
df = data.sample(n=2000, random_state=42)
df

## Feature Selection 

In [None]:
# Predictors: the location and date columns
feature_columns = ["Region", "Country", "City", "Month", "Day", "Year"]
df_x = df[feature_columns]

# Target, kept as a one-column DataFrame
df_y = df[["AvgTemperature"]]

## Feature Engineering

***Get Dummy variable***

In [None]:
# One-hot encode each predictor. The prefix gives every dummy column a unique string
# name: without it the Day (1-31) and Month (1-12) dummies share integer labels, and
# integer labels mixed with the string Region labels are rejected as feature names
# by scikit-learn >= 1.2.
region_oht = pd.get_dummies(df_x["Region"], prefix="Region")
country_oht = pd.get_dummies(df_x["Country"], prefix="Country")
city_oht = pd.get_dummies(df_x["City"], prefix="City")
month_oht = pd.get_dummies(df_x["Month"], prefix="Month")
day_oht = pd.get_dummies(df_x["Day"], prefix="Day")
year_oht = pd.get_dummies(df_x["Year"], prefix="Year")

# (rows, number of dummy columns) of each encoded block
print("Region OHT Shape: ", region_oht.shape)
print("Country OHT Shape: ", country_oht.shape)
print("City OHT Shape: ", city_oht.shape)
print("Month OHT Shape: ", month_oht.shape)
print("Day OHT Shape: ", day_oht.shape)
print("Year OHT Shape: ", year_oht.shape)

In [None]:
# Assemble the design matrix side by side; the Country and City dummies are left out
encoded_parts = [region_oht, day_oht, month_oht, year_oht]
df_x_oht = pd.concat(encoded_parts, axis="columns")

# Random peek at five encoded rows
df_x_oht.sample(5)

In [None]:
# Shapes of the raw predictors, the encoded predictors and the target
for shape_label, frame in (
    ("Input Shape: ", df_x),
    ("Input Shape with OneHotEncoder: ", df_x_oht),
    ("Output Shape: ", df_y),
):
    print(shape_label, frame.shape)

***Splitting The Data Into Train Test***

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed random_state keeps the split,
# and every score computed from it, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x_oht, df_y, test_size=0.30, random_state=42
)

In [None]:
# Shapes of the four pieces produced by the split
for split_label, split_part in (
    ("x_train Shape ->", x_train),
    ("x_test Shape ->", x_test),
    ("y_train Shape ->", y_train),
    ("y_test Shape ->", y_test),
):
    print(split_label, split_part.shape)

## Model Fitting 

***Linear Regression***

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit ordinary least squares on the training split (fit() returns the estimator)
lreg = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out rows
y_pred = lreg.predict(x_test)

In [None]:
# One-dimensional copies of the actual test targets and the linear-regression predictions
df_y_flat = np.asarray(y_test).flatten()
lr_pred_flat = np.asarray(y_pred).flatten()

In [None]:
# Actual values next to the linear-regression predictions
result_df_lr = pd.DataFrame(dict(Actual=df_y_flat, Predicted=lr_pred_flat))
result_df_lr.head(10)

In [None]:
# Test-set metrics of the linear model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report the three metrics in the same order
for metric_label, metric_value in (
    ("R-squared:", r2),
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
):
    print(metric_label, metric_value)


In [None]:
# Fitted parameters of the linear model
coefficients, intercept = lreg.coef_, lreg.intercept_

# Show them
print("Coefficients:", coefficients)
print("Intercept:", intercept)

***Support Vector Regression (SVR)***

In [None]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel and otherwise default settings
svr_reg = SVR(kernel="rbf")

# SVR expects a 1-D target. y_train is a one-column DataFrame, so it is flattened
# here instead of relying on scikit-learn's implicit conversion (DataConversionWarning).
svr_reg.fit(x_train, np.ravel(y_train))

# Predictions for the held-out rows
svr_pred = svr_reg.predict(x_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set metrics of the SVR model, each printed right after it is computed
mse = mean_squared_error(y_test, svr_pred)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, svr_pred)
print("Mean Absolute Error:", mae)

r2 = r2_score(y_test, svr_pred)
print("R-squared Score:", r2)

In [None]:
# One-dimensional copies of the actual test targets and the SVR predictions
df_y_flat = np.asarray(y_test).flatten()
svr_pred_flat = np.asarray(svr_pred).flatten()

In [None]:
# Actual values next to the SVR predictions
result_df_svr = pd.DataFrame(dict(Actual=df_y_flat, Predicted=svr_pred_flat))
result_df_svr.head(10)

***Random Forest Decision Tree Regression***

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 11 trees; random_state=0 fixes the bootstrap and feature sampling
mdl_rf = RandomForestRegressor(n_estimators=11, random_state=0)

# The forest expects a 1-D target. y_train is a one-column DataFrame, so it is
# flattened here instead of relying on scikit-learn's implicit conversion
# (DataConversionWarning).
mdl_rf.fit(x_train, np.ravel(y_train))

# Predictions for the held-out rows
rf_pred = mdl_rf.predict(x_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Predict once and reuse the result for all three metrics instead of running the
# forest over x_test three times.
rf_test_pred = mdl_rf.predict(x_test)

# Calculate R-squared score
r2_score_rfreg = r2_score(y_test, rf_test_pred)
print("R-squared Score:", r2_score_rfreg)

# Calculate Mean Squared Error (MSE)
mse_rfreg = mean_squared_error(y_test, rf_test_pred)
print("Mean Squared Error:", mse_rfreg)

# Calculate Mean Absolute Error (MAE)
mae_rfreg = mean_absolute_error(y_test, rf_test_pred)
print("Mean Absolute Error:", mae_rfreg)

In [None]:
import pandas as pd
import numpy as np

# One-dimensional copy of the actual test targets
y_test_flat = np.asarray(y_test).flatten()

# Actual values next to the random-forest predictions
result_df_rf = pd.DataFrame(dict(Actual=y_test_flat, Predicted=rf_pred))

result_df_rf.head(10)

***Polynomial Regression***

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the one-hot features, fitted on the training split
poly_2 = PolynomialFeatures(degree=2)

x_poly_2 = poly_2.fit_transform(x_train)

# Linear regression on the expanded features
lr_mdl_2 = LinearRegression()

lr_mdl_2.fit(x_poly_2, y_train)

# Only transform() the test split: a transformer is fitted once, on the training
# data, and then applied unchanged to unseen rows.
lr2_pred = lr_mdl_2.predict(poly_2.transform(x_test))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Score the polynomial model on the held-out test split, like the other models in
# this notebook. Scoring on the training rows overstates the fit and is not
# comparable with the other scores.
lr2_test_pred = lr_mdl_2.predict(poly_2.transform(x_test))

# Calculate R-squared score
r2Score_lr2 = r2_score(y_test, lr2_test_pred)
print("R-squared Score:", r2Score_lr2)

# Calculate Mean Squared Error (MSE)
mse_lr2 = mean_squared_error(y_test, lr2_test_pred)
print("Mean Squared Error:", mse_lr2)

# Calculate Mean Absolute Error (MAE)
mae_lr2 = mean_absolute_error(y_test, lr2_test_pred)
print("Mean Absolute Error:", mae_lr2)

In [None]:
import numpy as np

# One-dimensional copies of the actual test targets and the polynomial-model predictions
df_y_flat = np.asarray(y_test).flatten()
lr2_pred_flat = np.asarray(lr2_pred).flatten()

In [None]:
import pandas as pd

# Actual values next to the polynomial-regression predictions
result_df_poly = pd.DataFrame(dict(Actual=df_y_flat, Predicted=lr2_pred_flat))

result_df_poly.head(10)

In [None]:
import pickle

In [None]:
# Save the trained polynomial-regression model.
# The context manager closes the file handle even if pickling fails.
# NOTE: only the regressor is saved; the fitted `poly_2` transformer needed to
# prepare its inputs is not part of this file.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(lr_mdl_2, model_file)

In [None]:
import numpy as np

# Load the saved model; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code stored in the file - only load
# model files you created yourself.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Prepare the input data
# NOTE(review): the saved model was fitted on poly_2-transformed one-hot features, so
# this raw 21-value row presumably fails the feature-count check below; a real input
# has to be encoded and transformed the same way as x_train - confirm.
input_data = np.array([[77, 821, 0, 5, 1, 4, 5, 4, 4, 5, 4, 4, 2, 4, 4, 3, 1, 1, 0, 0, 1]])

# Number of features in the input row
num_features = input_data.shape[1]

# coef_ is 1-D for a 1-D target and (n_targets, n_features) for a 2-D target; the
# last axis is the feature count in both cases.
expected_features = loaded_model.coef_.shape[-1]

# Validate the number of features before predicting
if num_features != expected_features:
    print(f"Error: The input data has {num_features} features, but the model expects {expected_features} features.")
else:
    # Make the prediction
    prediction = loaded_model.predict(input_data)
    print(prediction[0])
