In [1]:
#importing CSV File and Reading it
import pandas as pd
import csv
import pyspark
from pyspark import SparkContext, SparkConf
import plotly.plotly as py

In [9]:
conf=SparkConf().setAppName("CSE545 Project").set("spark.driver.memory", "12g").set("spark.executor.memory", "6g").set("spark.driver.maxResultSize", "6g")
sc=SparkContext(conf=conf)

In [24]:
#Loading the Climate Dataset
GST_rdd = sc.textFile("GlobalLandTemperaturesByCountry.csv").map(lambda line: line.split(","))

In [25]:
#Filtering out too old records
GST_rdd=GST_rdd.map(lambda x: [w.encode("utf-8") for w in x]).filter(lambda x: x[0] =='dt' or int(x[0][:4])>1960)

In [26]:
#Reformatting the dataset for desired attributes
def reformat(x):
    if x[0]!='dt':
        x[0]=int(x[0][:4])
        
    return [x[3],x[0],x[1]]

GST_rdd2=GST_rdd.map(lambda x: reformat(x))

#print(GST_rdd2.collect())

In [27]:
#COnverting String Values to Integer Values
def conv_x(x):
    if x[0] != "Country":
        if x[2] == "":
            x[2] = str(0)
        x[2] = round(float(x[2]),2)
    return x
            

In [28]:
#Call to the conversion function
GST_rdd3=GST_rdd2.map(lambda x: conv_x(x))
print(GST_rdd3.collect())

[['Country', 'dt', 'AverageTemperature'], ['\xc3\x85land', 1961, -1.46], ['\xc3\x85land', 1961, 1.18], ['\xc3\x85land', 1961, 2.4], ['\xc3\x85land', 1961, 3.05], ['\xc3\x85land', 1961, 7.5], ['\xc3\x85land', 1961, 14.42], ['\xc3\x85land', 1961, 14.77], ['\xc3\x85land', 1961, 14.18], ['\xc3\x85land', 1961, 12.18], ['\xc3\x85land', 1961, 11.14], ['\xc3\x85land', 1961, 4.76], ['\xc3\x85land', 1961, -2.12], ['\xc3\x85land', 1962, -0.88], ['\xc3\x85land', 1962, -2.09], ['\xc3\x85land', 1962, -5.91], ['\xc3\x85land', 1962, 2.79], ['\xc3\x85land', 1962, 6.05], ['\xc3\x85land', 1962, 10.92], ['\xc3\x85land', 1962, 13.63], ['\xc3\x85land', 1962, 13.36], ['\xc3\x85land', 1962, 10.57], ['\xc3\x85land', 1962, 8.22], ['\xc3\x85land', 1962, 3.15], ['\xc3\x85land', 1962, -2.33], ['\xc3\x85land', 1963, -6.29], ['\xc3\x85land', 1963, -8.08], ['\xc3\x85land', 1963, -5.51], ['\xc3\x85land', 1963, 1.45], ['\xc3\x85land', 1963, 9.56], ['\xc3\x85land', 1963, 12.57], ['\xc3\x85land', 1963, 15.55], ['\xc3\x85

In [29]:
#Filling in Missing Values
GST_rdd4=GST_rdd3.map(lambda x: (x[0],x[1]))
GST_rdd5=GST_rdd3.map(lambda x: x[2]).collect()

for i in range(len(GST_rdd5)):
    if(i==len(GST_rdd5)-1 and GST_rdd5[i]==0):
        GST_rdd5[i]=round((2*GST_rdd5[i-1]-GST_rdd5[i-2]),2)
    elif(GST_rdd5[i]==0):
        GST_rdd5[i]=round(((GST_rdd5[i-1]+GST_rdd5[i+1])/2),2)
    

GST_rdd5=sc.parallelize(GST_rdd5)

GST_rdd4 = GST_rdd4.zipWithIndex().map(lambda x: (x[1],x[0]))
GST_rdd5 = GST_rdd5.zipWithIndex().map(lambda x: (x[1],x[0]))
GST_rdd6 = GST_rdd4.join(GST_rdd5).map(lambda x: x[1]).groupByKey().mapValues(list)

print(GST_rdd6.collect())

[(('Namibia', 1978), [15.25, 24.54, 23.73, 15.45, 22.77, 17.53, 22.74, 19.86, 19.75, 21.43, 17.9, 24.0]), (('Hungary', 2001), [17.72, -4.63, 0.79, 21.42, 2.7, 22.13, 7.9, 14.47, 10.3, 13.11, 17.49, 2.88]), (('Baker Island', 1964), [25.22, 24.95, 26.24, 25.09, 26.15, 25.49, 26.23, 25.3, 26.22, 25.1, 25.93, 24.98]), (('Fiji', 1974), [26.33, 24.98, 25.17, 25.82, 24.88, 26.61, 24.58, 26.38, 26.98, 24.05, 26.9, 23.95]), (('Barbados', 1974), [27.2, 25.42, 25.42, 27.4, 25.49, 27.6, 25.4, 27.37, 25.86, 26.74, 26.84, 26.84]), (('Oceania', 1991), [17.14, 26.55, 27.92, 15.26, 27.72, 16.65, 25.72, 19.94, 22.34, 23.86, 18.8, 25.25]), (('Costa Rica', 1961), [26.75, 25.03, 26.8, 25.11, 26.69, 24.94, 25.72, 25.28, 25.9, 25.54, 26.09, 25.86]), (('Burma', 1986), [26.92, 19.98, 18.8, 25.86, 21.25, 26.02, 23.88, 25.49, 26.56, 24.06, 26.82, 22.59]), (('Japan', 1975), [4.42, 22.16, 10.82, 14.8, 14.95, 9.65, 19.05, 2.97, 0.78, 23.15, 0.62, 24.68]), (('Equatorial Guinea', 2005), [26.82, 25.29, 26.76, 25.22, 2

In [30]:
#Take Average of temperature of 12 months to get get temperature for a particular year
def take_average(x):
    avg=0
    if(x[1][0]!="AverageTemperature"):
        avg=round((sum(x[1])/len(x[1])),2)
    else:
        return [x[0][0],x[0][1],x[1][0]]
    
    return [x[0][0],x[0][1],avg]

In [31]:
#call to the take_average function
GST_rdd7=GST_rdd6.map(lambda x: take_average(x))

print(GST_rdd7.collect())

[['Namibia', 1978, 20.41], ['Hungary', 2001, 10.52], ['Baker Island', 1964, 25.58], ['Fiji', 1974, 25.55], ['Barbados', 1974, 26.46], ['Oceania', 1991, 22.26], ['Costa Rica', 1961, 25.81], ['Burma', 1986, 24.02], ['Japan', 1975, 12.34], ['Equatorial Guinea', 2005, 25.62], ['North America', 1964, 1.93], ['Suriname', 1978, 26.51], ['Sri Lanka', 1966, 27.42], ['Virgin Islands', 1977, 26.73], ['Palmyra Atoll', 2001, 27.86], ['France', 1969, 12.83], ['Ireland', 1972, 9.24], ['Eritrea', 2007, 27.49], ['Federated States Of Micronesia', 1999, 27.07], ['Bahrain', 1962, 26.49], ['Antarctica', 2000, 0.01], ['Madagascar', 2010, 23.85], ['Solomon Islands', 1998, 27.37], ['Laos', 2009, 24.37], ['Jamaica', 2007, 27.01], ['Albania', 2007, 13.89], ['British Virgin Islands', 1994, 27.04], ['Estonia', 1986, 4.89], ['Guadeloupe', 1977, 26.72], ['Moldova', 1993, 8.87], ['Morocco', 1965, 17.63], ['Lebanon', 1984, 17.77], ['Oman', 2009, 28.34], ['Saudi Arabia', 1972, 25.27], ['Germany', 1984, 8.12], ['Heard 

In [32]:
#Transorming rdd to pandas dataframes for future ML and visualizations
headers = ["Country","Year","Average Temperature"]

GST_rdd7=GST_rdd7.filter(lambda x: x[0]!='Country')
GST_rdd7=GST_rdd7.sortBy(lambda x: (x[0],x[1]))

df = pd.DataFrame(GST_rdd7.collect(), columns=headers)

In [33]:
df

Unnamed: 0,Country,Year,Average Temperature
0,"""Bonaire",1961,27.39
1,"""Bonaire",1962,27.47
2,"""Bonaire",1963,27.57
3,"""Bonaire",1964,27.42
4,"""Bonaire",1965,27.23
5,"""Bonaire",1966,27.52
6,"""Bonaire",1967,27.19
7,"""Bonaire",1968,27.34
8,"""Bonaire",1969,27.86
9,"""Bonaire",1970,27.59


In [34]:
len(df.Country.unique())

243

In [35]:
data = [ dict(
        type = 'choropleth',
        locations = df['Country'],
        z = df['Average Temperature'],
        text = df['Country'],
        colorscale = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
            [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            autotick = False,
            tickprefix = '$',
            title = 'Temperature (in degree celsius)'),
      ) ]

layout = dict(
    title = 'Global Surface Temperature',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False, filename='d3-world-map' )


Aw, snap! We don't have an account for ''. Want to try again? You can authenticate with your email address or username. Sign in is not case sensitive.

Don't have an account? plot.ly

Questions? support@plot.ly


PlotlyError: Because you didn't supply a 'file_id' in the call, we're assuming you're trying to snag a figure from a url. You supplied the url, '', we expected it to start with 'https://plot.ly'.
Run help on this function for more information.