In [1]:
import pandas as pd
# Notebook-wide pandas display settings:
#  - floats are rendered with a thousands separator and two decimals
#  - printed output wraps at 85 characters
#  - at most 8 columns are shown before the frame is elided with '...'
# (No DataFrame is displayed in this cell: nothing has been loaded yet, so
# referencing `landtemps` here fails with NameError on a fresh kernel.)
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)

In [2]:
# Load the land temperature sample from CSV.
landtemps = pd.read_csv('data/landtempssample.csv',
                        # explicit column labels, used instead of the file's header
                        names=['stationid', 'year', 'month', 'avgtemp', 'latitude',
                               'longitude', 'elevation', 'station', 'countryid', 'country'],
                        # skip the header row, since `names` supplies the labels
                        skiprows=1,
                        # parse the file in one pass instead of in internal chunks:
                        # this uses MORE memory during the import, but avoids
                        # mixed-type inference within a column
                        low_memory=False)
# Combine month and year into a single datetime column (first day of the month),
# placed first in the frame. This is done after the read because combining
# columns through a nested `parse_dates` list is deprecated in pandas 2.x.
landtemps.insert(0, 'month_year',
                 pd.to_datetime(landtemps[['year', 'month']].assign(day=1)))
# Drop the two source columns, as the nested `parse_dates` form did.
landtemps = landtemps.drop(columns=['year', 'month'])

In [5]:
# Preview the first seven rows of the loaded frame
landtemps.head(n=7)

Unnamed: 0,month_year,stationid,avgtemp,latitude,...,elevation,station,countryid,country
0,2000-04-01,USS0010K01S,5.27,39.9,...,2773.7,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,...,58.0,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.37,...,61.0,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.28,...,65.5,BERRI_IRRIGATION,AS,Australia
4,2001-11-01,ASN00028007,,-14.78,...,79.4,MUSGRAVE,AS,Australia
5,1991-04-01,USW00024151,5.59,42.15,...,1362.5,MALAD_CITY,US,United States
6,1993-12-01,RSM00022641,-10.17,63.9,...,13.0,ONEGA,RS,Russia


In [6]:
# List each column with the dtype pandas assigned to it during the import
landtemps.dtypes

month_year    datetime64[ns]
stationid             object
avgtemp              float64
latitude             float64
longitude            float64
elevation            float64
station               object
countryid             object
country               object
dtype: object

In [8]:
# Give the combined date column a more descriptive name.
# The result is reassigned rather than modified with inplace=True: inplace
# brings no performance benefit and prevents method chaining.
# NOTE(review): the new name is spelled 'mesureday' (sic); it is kept as-is
# because the rest of the notebook may refer to it.
landtemps = landtemps.rename(columns={'month_year': 'mesureday'})

In [9]:
# List the columns and dtypes again to check the renamed date column
landtemps.dtypes

mesureday    datetime64[ns]
stationid            object
avgtemp             float64
latitude            float64
longitude           float64
elevation           float64
station              object
countryid            object
country              object
dtype: object

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for avgtemp
landtemps['avgtemp'].describe()

count   85,554.00
mean        10.92
std         11.52
min        -70.70
25%          3.46
50%         12.22
75%         19.57
max         39.95
Name: avgtemp, dtype: float64

In [14]:
# Look for missing values: count the nulls in every column.
# `isnull().sum()` is an example of method chaining. There is no fixed rule
# about when to chain, but it helps when the calls read as a single step, and
# it avoids having to name the intermediate objects.
print(landtemps.isnull().sum())
print(landtemps.shape)
# Drop the rows whose avgtemp is NaN; `subset` restricts the NaN check to the
# listed columns. The result is reassigned rather than dropped with
# inplace=True, which brings no performance benefit and prevents chaining.
landtemps = landtemps.dropna(subset=['avgtemp'])
landtemps.shape

mesureday    0
stationid    0
avgtemp      0
latitude     0
longitude    0
elevation    0
station      0
countryid    0
country      2
dtype: int64
(85554, 9)


(85554, 9)

In [15]:
# read_csv can also read a CSV stored inside a ZIP archive.
landtemps = pd.read_csv('data/landtempssample.zip',
                        compression='zip',
                        # explicit column labels, used instead of the file's header
                        names=['stationid', 'year', 'month', 'avgtemp', 'latitude',
                               'longitude', 'elevation', 'station', 'countryid', 'country'],
                        # skip the header row, since `names` supplies the labels
                        skiprows=1,
                        # parse the file in one pass instead of in internal chunks:
                        # this uses MORE memory during the import, but avoids
                        # mixed-type inference within a column
                        low_memory=False)
# Combine month and year into a single datetime column (first day of the month),
# placed first in the frame. This is done after the read because combining
# columns through a nested `parse_dates` list is deprecated in pandas 2.x.
landtemps.insert(0, 'month_year',
                 pd.to_datetime(landtemps[['year', 'month']].assign(day=1)))
# Drop the two source columns, as the nested `parse_dates` form did.
landtemps = landtemps.drop(columns=['year', 'month'])
landtemps

Unnamed: 0,month_year,stationid,avgtemp,latitude,...,elevation,station,countryid,country
0,2000-04-01,USS0010K01S,5.27,39.90,...,2773.70,INDIAN_CANYON,US,United States
1,1940-05-01,CI000085406,18.04,-18.35,...,58.00,ARICA,CI,Chile
2,2013-12-01,USC00036376,6.22,34.37,...,61.00,SAINT_CHARLES,US,United States
3,1963-02-01,ASN00024002,22.93,-34.28,...,65.50,BERRI_IRRIGATION,AS,Australia
4,2001-11-01,ASN00028007,,-14.78,...,79.40,MUSGRAVE,AS,Australia
...,...,...,...,...,...,...,...,...,...
99995,1991-04-01,MXXLT347415,29.02,17.78,...,65.00,VALLE_NACIONAL_VALLE,MX,Mexico
99996,1991-11-01,RSM00032287,-2.81,57.08,...,3.00,UST_HAIRYUZOVO,RS,Russia
99997,1937-04-01,ARM00087166,19.42,-27.45,...,61.90,CORRIENTES,AR,Argentina
99998,1958-10-01,CA006137361,10.02,42.78,...,236.00,ST_THOMAS,CA,Canada
