# <center>Working with CSV Files</center>

In [12]:
import os
import pandas as pd

# Resolve data/temperatures.csv relative to the current working directory
base_dir = os.getcwd()
file_path = os.path.join(base_dir, "data", "temperatures.csv")

# Use the first CSV column as the DataFrame index
temperature1 = pd.read_csv(file_path, index_col=0)
temperature1.head()

Unnamed: 0,date,city,country,avg_temp_c
0,2000-01-01,Abidjan,Côte D'Ivoire,27.293
1,2000-02-01,Abidjan,Côte D'Ivoire,27.685
2,2000-03-01,Abidjan,Côte D'Ivoire,29.061
3,2000-04-01,Abidjan,Côte D'Ivoire,28.162
4,2000-05-01,Abidjan,Côte D'Ivoire,27.547


In [9]:
# Inspect column dtypes and non-null counts; "date" is read as a plain object column here
temperature1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16500 entries, 0 to 16499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        16500 non-null  object 
 1   city        16500 non-null  object 
 2   country     16500 non-null  object 
 3   avg_temp_c  16407 non-null  float64
dtypes: float64(1), object(3)
memory usage: 644.5+ KB


In [10]:
# Re-read the file with parse_dates so the "date" column is converted to datetime on load
temperature = pd.read_csv(filepath_or_buffer=file_path, index_col=0, parse_dates=['date'])
# info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
temperature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16500 entries, 0 to 16499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        16500 non-null  datetime64[ns]
 1   city        16500 non-null  object        
 2   country     16500 non-null  object        
 3   avg_temp_c  16407 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 644.5+ KB
None


In [11]:
# Derive a "year" column from the datetime "date" column via the .dt accessor
temperature = temperature.assign(year=temperature["date"].dt.year)
temperature.head()

Unnamed: 0,date,city,country,avg_temp_c,year
0,2000-01-01,Abidjan,Côte D'Ivoire,27.293,2000
1,2000-02-01,Abidjan,Côte D'Ivoire,27.685,2000
2,2000-03-01,Abidjan,Côte D'Ivoire,29.061,2000
3,2000-04-01,Abidjan,Côte D'Ivoire,28.162,2000
4,2000-05-01,Abidjan,Côte D'Ivoire,27.547,2000


In [13]:
# Total "avg_temp_c" for every (city, year) pair; the result is a Series
# indexed by city and year that keeps the name "avg_temp_c"
# NOTE(review): this sums the averages within each year - confirm a sum (not a mean) is intended
temperature_new = (
    temperature
    .groupby(by=["city", "year"])["avg_temp_c"]
    .sum()
)
temperature_new.head()

city     year
Abidjan  2000    321.037
         2001    320.323
         2002    322.092
         2003    324.608
         2004    323.832
Name: avg_temp_c, dtype: float64

In [14]:
# Attach the per-(city, year) total to every row of the original frame.
# Both sides carry a column named "avg_temp_c", so pandas applies the suffixes:
# "avg_temp_c_x" is the row's own value, "avg_temp_c_y" is the yearly total.
# NOTE: this rebinds `temperature`, so re-running this cell on its own merges
# the totals onto the already merged frame.
temperature = temperature.merge(
    right=temperature_new,
    how="inner",
    on=["city", "year"],
    suffixes=("_x", "_y"),
)

In [15]:
# Preview the merged frame: avg_temp_c_x is the per-row value, avg_temp_c_y the (city, year) total
temperature.head()

Unnamed: 0,date,city,country,avg_temp_c_x,year,avg_temp_c_y
0,2000-01-01,Abidjan,Côte D'Ivoire,27.293,2000,321.037
1,2000-02-01,Abidjan,Côte D'Ivoire,27.685,2000,321.037
2,2000-03-01,Abidjan,Côte D'Ivoire,29.061,2000,321.037
3,2000-04-01,Abidjan,Côte D'Ivoire,28.162,2000,321.037
4,2000-05-01,Abidjan,Côte D'Ivoire,27.547,2000,321.037


In [16]:
# Write the merged frame as CSV to data/output.csv under the working directory
out_file_path = os.path.join(base_dir, "data", "output.csv")
temperature.to_csv(path_or_buf=out_file_path)