# Importing the data, computing summary statistics, and creating the test set

In [41]:
# Importing dependencies

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the climate-change CSV into a DataFrame
climate_df = pd.read_csv(filepath_or_buffer='climate_change_dataset.csv')

# Preview the first five rows
climate_df.head(n=5)

Unnamed: 0,Year,Country,Avg Temperature (°C),CO2 Emissions (Tons/Capita),Sea Level Rise (mm),Rainfall (mm),Population,Renewable Energy (%),Extreme Weather Events,Forest Area (%)
0,2006,UK,8.9,9.3,3.1,1441,530911230,20.4,14,59.8
1,2019,USA,31.0,4.8,4.2,2407,107364344,49.2,8,31.0
2,2014,France,33.9,2.8,2.2,1241,441101758,33.3,9,35.5
3,2010,Argentina,5.9,1.8,3.2,1892,1069669579,23.7,7,17.7
4,2007,Germany,26.9,5.6,2.4,1743,124079175,12.5,4,17.4


In [42]:
climate_df.info() # Row count, per-column dtypes, and non-null counts (i.e. a check for missing values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Year                         1000 non-null   int64  
 1   Country                      1000 non-null   object 
 2   Avg Temperature (°C)         1000 non-null   float64
 3   CO2 Emissions (Tons/Capita)  1000 non-null   float64
 4   Sea Level Rise (mm)          1000 non-null   float64
 5   Rainfall (mm)                1000 non-null   int64  
 6   Population                   1000 non-null   int64  
 7   Renewable Energy (%)         1000 non-null   float64
 8   Extreme Weather Events       1000 non-null   int64  
 9   Forest Area (%)              1000 non-null   float64
dtypes: float64(5), int64(4), object(1)
memory usage: 78.3+ KB


In [43]:
climate_df.describe() # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,Year,Avg Temperature (°C),CO2 Emissions (Tons/Capita),Sea Level Rise (mm),Rainfall (mm),Population,Renewable Energy (%),Extreme Weather Events,Forest Area (%)
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2011.432,19.8831,10.4258,3.0096,1738.761,705383000.0,27.3005,7.291,40.572
std,7.147199,8.542897,5.614665,1.146081,708.976616,409391000.0,12.970808,4.422655,17.398998
min,2000.0,5.0,0.5,1.0,501.0,3660891.0,5.1,0.0,10.1
25%,2005.0,12.175,5.575,2.0,1098.75,343624200.0,16.1,3.0,25.6
50%,2012.0,20.1,10.7,3.0,1726.0,713116600.0,27.15,8.0,41.15
75%,2018.0,27.225,15.4,4.0,2362.5,1073868000.0,38.925,11.0,55.8
max,2023.0,34.9,20.0,5.0,2999.0,1397016000.0,50.0,14.0,70.0


In [44]:
# Build the modelling frame `df` from the raw frame in a single chained expression.
# `climate_df` is deliberately NOT reassigned: the cell can therefore be re-run
# without a KeyError (the dropped columns are still present in the raw frame), and
# the raw data shown by the earlier cells stays available for inspection.
df = (
    climate_df
    .drop(columns=['Year', 'Country'])  # treated as non-predictive columns here
    .rename(columns={
        'Avg Temperature (°C)': 'avg_temperature',
        'CO2 Emissions (Tons/Capita)': 'co2_emissions',
        'Sea Level Rise (mm)': 'sea_level',
        'Rainfall (mm)': 'rainfall',
        'Population': 'population',
        'Renewable Energy (%)': 'renewable_energy_pct',
        'Extreme Weather Events': 'extreme_weather_events',
        'Forest Area (%)': 'forest_area_pct',
    })  # short snake_case labels for easier access
)


In [45]:
# Display the column labels of `df` to confirm the renamed, snake_case names
df.columns

Index(['avg_temperature', 'co2_emissions', 'sea_level', 'rainfall',
       'population', 'renewable_energy_pct', 'extreme_weather_events',
       'forest_area_pct'],
      dtype='object')

In [None]:
# Separate the target (average temperature) from the remaining feature columns
y = df['avg_temperature']
X = df.drop(columns=['avg_temperature'])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist the held-out split to disk as CSV files, omitting the row index
X_test.to_csv(path_or_buf="test_set_X.csv", index=False)  # test features
y_test.to_csv(path_or_buf="test_set_y.csv", index=False)  # test target