## Import Necessary Libraries

In [1]:
import pandas as pd

## Importing CSV Data into a pandas DataFrame

In [2]:
# Load the land-temperature sample CSV into a pandas DataFrame
csv_path = "./raw_data/landtempssample.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the loaded data (head() defaults to 5 rows)
df.head()

Unnamed: 0,locationid,year,month,temp,latitude,longitude,stnelev,station,countryid,country
0,USS0010K01S,2000,4,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1,CI000085406,1940,5,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2,USC00036376,2013,12,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States
3,ASN00024002,1963,2,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia
4,ASN00028007,2001,11,,-14.7803,143.5036,79.4,MUSGRAVE,AS,Australia


In [5]:
# Display the column labels of the DataFrame (returned as a pandas Index)

df.columns

Index(['locationid', 'year', 'month', 'temp', 'latitude', 'longitude',
       'stnelev', 'station', 'countryid', 'country'],
      dtype='object')

## Data Cleaning

In [6]:
# Rename the locationid, temp and stnelev columns to more descriptive names.
# The result is reassigned instead of using inplace=True, which keeps the
# step chainable and makes the update of `df` explicit.
df = df.rename(
    columns={
        "locationid": "station_id",
        "temp": "avg_temp",
        "stnelev": "elevation",
    }
)



In [7]:
# Preview the first five rows to confirm the renamed columns
df.head()

Unnamed: 0,station_id,year,month,avg_temp,latitude,longitude,elevation,station,countryid,country
0,USS0010K01S,2000,4,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States
1,CI000085406,1940,5,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile
2,USC00036376,2013,12,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States
3,ASN00024002,1963,2,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia
4,ASN00028007,2001,11,,-14.7803,143.5036,79.4,MUSGRAVE,AS,Australia


In [8]:
# Print a concise summary: index range, per-column non-null counts and dtypes, and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   station_id  100000 non-null  object 
 1   year        100000 non-null  int64  
 2   month       100000 non-null  int64  
 3   avg_temp    85554 non-null   float64
 4   latitude    100000 non-null  float64
 5   longitude   100000 non-null  float64
 6   elevation   100000 non-null  float64
 7   station     100000 non-null  object 
 8   countryid   100000 non-null  object 
 9   country     99995 non-null   object 
dtypes: float64(4), int64(2), object(4)
memory usage: 7.6+ MB


In [9]:
# Report the DataFrame dimensions as a (rows, columns) tuple

df.shape

(100000, 10)

In [12]:
# Create a new "date" column from the "year" and "month" columns, using the
# first day of each month. pd.to_datetime assembles the timestamps directly
# from the numeric year/month/day components, so no intermediate strings have
# to be built and re-parsed.
df["date"] = pd.to_datetime(df[["year", "month"]].assign(day=1))



In [13]:
# Preview the first rows to verify the newly added "date" column
df.head()

Unnamed: 0,station_id,year,month,avg_temp,latitude,longitude,elevation,station,countryid,country,date
0,USS0010K01S,2000,4,5.27,39.9,-110.75,2773.7,INDIAN_CANYON,US,United States,2000-04-01
1,CI000085406,1940,5,18.04,-18.35,-70.333,58.0,ARICA,CI,Chile,1940-05-01
2,USC00036376,2013,12,6.22,34.3703,-91.1242,61.0,SAINT_CHARLES,US,United States,2013-12-01
3,ASN00024002,1963,2,22.93,-34.2833,140.6,65.5,BERRI_IRRIGATION,AS,Australia,1963-02-01
4,ASN00028007,2001,11,,-14.7803,143.5036,79.4,MUSGRAVE,AS,Australia,2001-11-01


In [14]:
# Summary statistics for the numerical columns only.
# The numeric dtypes are selected explicitly because `df` now also holds the
# datetime "date" column, and recent pandas releases (2.0+) summarise datetime
# columns by default, which would add "date" to this table.
df.select_dtypes(include="number").describe()

Unnamed: 0,year,month,avg_temp,latitude,longitude,elevation
count,100000.0,100000.0,85554.0,100000.0,100000.0,100000.0
mean,1969.45803,6.49464,10.92077,35.075456,-38.123732,565.562545
std,35.836832,3.446463,11.522444,23.545646,84.297049,1073.647214
min,1720.0,1.0,-70.7,-90.0,-179.983,-350.0
25%,1949.0,3.0,3.46,33.190675,-101.657925,64.0
50%,1975.0,6.0,12.22,40.79305,-79.5683,240.8
75%,1998.0,9.0,19.57,47.4247,17.5331,644.7
max,2020.0,12.0,39.95,82.5167,179.75,9999.0


In [15]:
# Summary statistics for the single column "avg_temp" (selected with bracket indexing);
# the count only includes the non-null values

df["avg_temp"].describe()

count    85554.000000
mean        10.920770
std         11.522444
min        -70.700000
25%          3.460000
50%         12.220000
75%         19.570000
max         39.950000
Name: avg_temp, dtype: float64

In [16]:
# Same summary for "avg_temp", this time selecting the column via attribute access
df.avg_temp.describe()

count    85554.000000
mean        10.920770
std         11.522444
min        -70.700000
25%          3.460000
50%         12.220000
75%         19.570000
max         39.950000
Name: avg_temp, dtype: float64

In [17]:
# Count the missing (null/NaN) values in each column of the DataFrame
df.isna().sum()

station_id        0
year              0
month             0
avg_temp      14446
latitude          0
longitude         0
elevation         0
station           0
countryid         0
country           5
date              0
dtype: int64

In [18]:
# Dimensions before the rows with missing values are dropped
df.shape

(100000, 11)

In [19]:
# Drop (delete) every row that is missing a value in "avg_temp" or "country".
# The result is reassigned instead of using inplace=True, which keeps the
# step chainable and makes the update of `df` explicit.
df = df.dropna(subset=["avg_temp", "country"], how="any")

In [20]:
# Dimensions after dropping the rows with a missing "avg_temp" or "country"
df.shape

(85552, 11)