In [1]:
from IPython.display import clear_output
clear_output()

In [2]:
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
import warnings

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from IPython.core.display import HTML

In [3]:
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
%matplotlib inline

warnings.filterwarnings("ignore")

spark = SparkSession.builder\
            .master("local[*]")\
            .appName("World Happiness Analysis")\
            .getOrCreate()

In [5]:
spark.sparkContext.setLogLevel("ERROR")
clear_output()
spark

In [6]:
hdfspath ="hdfs://localhost:9000/module6/Kaggle/dataset/world_happiness_report/"
spark_df_15 = spark.read.csv(hdfspath+"2015.csv",inferSchema=True,header=True)
spark_df_16 = spark.read.csv(hdfspath+"2016.csv",inferSchema=True,header=True)
spark_df_17 = spark.read.csv(hdfspath+"2017.csv",inferSchema=True,header=True)
spark_df_18 = spark.read.csv(hdfspath+"2018.csv",inferSchema=True,header=True)
spark_df_19 = spark.read.csv(hdfspath+"2019.csv",inferSchema=True,header=True)

In [7]:
pandas_df_15 = spark_df_15.toPandas()
pandas_df_16 = spark_df_16.toPandas()
pandas_df_17 = spark_df_17.toPandas()
pandas_df_18 = spark_df_18.toPandas()
pandas_df_19 = spark_df_19.toPandas()


In [8]:
type(pandas_df_15)

pandas.core.frame.DataFrame

In [9]:
spark_df_15.registerTempTable("spark_tbl_15")
spark_df_16.registerTempTable("spark_tbl_16")
spark_df_17.registerTempTable("spark_tbl_17")
spark_df_18.registerTempTable("spark_tbl_18")
spark_df_19.registerTempTable("spark_tbl_19")

In [10]:
spark_df_15.rdd.getNumPartitions()

1

In [11]:
type(pandas_df_15)

pandas.core.frame.DataFrame

In [12]:
type(spark_df_15)

pyspark.sql.dataframe.DataFrame

In [13]:
pandas_df_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int32  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int32(1), object(2)
memory usage: 1

In [14]:
spark_df_15.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Happiness Rank: integer (nullable = true)
 |-- Happiness Score: double (nullable = true)
 |-- Standard Error: double (nullable = true)
 |-- Economy (GDP per Capita): double (nullable = true)
 |-- Family: double (nullable = true)
 |-- Health (Life Expectancy): double (nullable = true)
 |-- Freedom: double (nullable = true)
 |-- Trust (Government Corruption): double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Dystopia Residual: double (nullable = true)



In [15]:
spark_df_15.describe().show(truncate=False)

+-------+-----------+-------------------------+-----------------+------------------+--------------------+------------------------+------------------+------------------------+------------------+-----------------------------+-------------------+------------------+
|summary|Country    |Region                   |Happiness Rank   |Happiness Score   |Standard Error      |Economy (GDP per Capita)|Family            |Health (Life Expectancy)|Freedom           |Trust (Government Corruption)|Generosity         |Dystopia Residual |
+-------+-----------+-------------------------+-----------------+------------------+--------------------+------------------------+------------------+------------------------+------------------+-----------------------------+-------------------+------------------+
|count  |158        |158                      |158              |158               |158                 |158                     |158               |158                     |158               |158               

In [16]:
spark_df_15.describe().toPandas().set_index('summary').transpose()

summary,count,mean,stddev,min,max
Country,158,,,Afghanistan,Zimbabwe
Region,158,,,Australia and New Zealand,Western Europe
Happiness Rank,158,79.49367088607595,45.75436310480852,1,158
Happiness Score,158,5.375734177215191,1.145010134952066,2.839,7.587
Standard Error,158,0.047884746835443,0.0171461785569693,0.01848,0.13693
Economy (GDP per Capita),158,0.8461372151898726,0.4031207785379107,0.0,1.69042
Family,158,0.9910459493670888,0.2723690860079153,0.0,1.40223
Health (Life Expectancy),158,0.6302593670886079,0.2470777663021721,0.0,1.02525
Freedom,158,0.4286149367088611,0.150692783937678,0.0,0.66973
Trust (Government Corruption),158,0.1434218354430379,0.1200340735745592,0.0,0.55191


In [17]:
spark_df_15.describe().toPandas()

Unnamed: 0,summary,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,count,158,158,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
1,mean,,,79.49367088607595,5.375734177215191,0.047884746835443,0.8461372151898726,0.9910459493670888,0.6302593670886079,0.4286149367088611,0.1434218354430379,0.237295506329114,2.098976772151899
2,stddev,,,45.75436310480852,1.145010134952066,0.0171461785569693,0.4031207785379107,0.2723690860079153,0.2470777663021721,0.150692783937678,0.1200340735745592,0.1266849340202053,0.5535497923037985
3,min,Afghanistan,Australia and New Zealand,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
4,max,Zimbabwe,Western Europe,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [18]:
pandas_df_15.describe()

Unnamed: 0,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,79.493671,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,45.754363,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,1.0,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.25,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,79.5,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,118.75,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,158.0,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


In [19]:
pd. __version__

'1.5.2'

In [20]:
pandas_df_15.describe(include='O')

Unnamed: 0,Country,Region
count,158,158
unique,158,10
top,Switzerland,Sub-Saharan Africa
freq,1,40


In [21]:
print("pandas_df_15 ->",pandas_df_15.columns.to_list())
print("pandas_df_16 ->",pandas_df_16.columns.to_list())
print("pandas_df_17 ->",pandas_df_17.columns.to_list())
print("pandas_df_18 ->",pandas_df_18.columns.to_list())
print("pandas_df_19 ->",pandas_df_19.columns.to_list())

pandas_df_15 -> ['Country', 'Region', 'Happiness Rank', 'Happiness Score', 'Standard Error', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
pandas_df_16 -> ['Country', 'Region', 'Happiness Rank', 'Happiness Score', 'Lower Confidence Interval', 'Upper Confidence Interval', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual']
pandas_df_17 -> ['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high', 'Whisker.low', 'Economy..GDP.per.Capita.', 'Family', 'Health..Life.Expectancy.', 'Freedom', 'Generosity', 'Trust..Government.Corruption.', 'Dystopia.Residual']
pandas_df_18 -> ['Overall rank', 'Country or region', 'Score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
pandas_df_19 -> ['Overall rank', 'Country 

In [223]:
spark.sql("""select `Overall rank` Overall_rank, `Country or region` Country_or_region,
            rank() over (order by `GDP per capita` desc) as GDP_per_capita_Rank,
            rank() over (order by `Social support` desc) as Social_support_Rank,
            rank() over (order by `Healthy life expectancy` desc) as Healthy_life_expectancy_Rank,
            rank() over (order by `Freedom to make life choices` desc) as Freedom_to_make_life_choices_Rank,
            rank() over (order by `Generosity` desc) as Generosity_Rank,
            rank() over (order by `Perceptions of corruption` desc) as Perceptions_of_corruption_Rank
            from spark_tbl_19
            order by `Overall_rank` 
            limit 10
""").show(5)

+------------+-----------------+-------------------+-------------------+----------------------------+---------------------------------+---------------+------------------------------+
|Overall_rank|Country_or_region|GDP_per_capita_Rank|Social_support_Rank|Healthy_life_expectancy_Rank|Freedom_to_make_life_choices_Rank|Generosity_Rank|Perceptions_of_corruption_Rank|
+------------+-----------------+-------------------+-------------------+----------------------------+---------------------------------+---------------+------------------------------+
|           1|          Finland|                 23|                  2|                          28|                                5|             92|                             4|
|           2|          Denmark|                 14|                  4|                          25|                                6|             37|                             3|
|           3|           Norway|                  7|                  3|             

In [23]:
spark.sql("select * from spark_tbl_19").show()

+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Overall rank|Country or region|Score|GDP per capita|Social support|Healthy life expectancy|Freedom to make life choices|Generosity|Perceptions of corruption|
+------------+-----------------+-----+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|           1|          Finland|7.769|          1.34|         1.587|                  0.986|                       0.596|     0.153|                    0.393|
|           2|          Denmark|  7.6|         1.383|         1.573|                  0.996|                       0.592|     0.252|                     0.41|
|           3|           Norway|7.554|         1.488|         1.582|                  1.028|                       0.603|     0.271|                    0.341|
|           4|          Iceland|7.494|        

In [24]:
spark.sql("""select `Overall rank` Overall_rank, `Country or region` Country_or_region,
            rank() over (order by `GDP per capita` desc) as GDP_per_capita_Rank,
            `GDP per capita`
            from spark_tbl_19

            order by `GDP per capita` desc
            
            """).show(25)

+------------+--------------------+-------------------+--------------+
|Overall_rank|   Country_or_region|GDP_per_capita_Rank|GDP per capita|
+------------+--------------------+-------------------+--------------+
|          29|               Qatar|                  1|         1.684|
|          14|          Luxembourg|                  2|         1.609|
|          34|           Singapore|                  3|         1.572|
|          21|United Arab Emirates|                  4|         1.503|
|          51|              Kuwait|                  5|           1.5|
|          16|             Ireland|                  6|         1.499|
|           3|              Norway|                  7|         1.488|
|           6|         Switzerland|                  8|         1.452|
|          76|           Hong Kong|                  9|         1.438|
|          19|       United States|                 10|         1.433|
|          28|        Saudi Arabia|                 11|         1.403|
|     

In [25]:
spark.sql("""select `Overall rank` Overall_rank, `Country or region` Country_or_region,
             rank() over (order by `GDP per capita` desc) as GDP_per_capita_Rank,
            `GDP per capita`
            from spark_tbl_19

            order by `GDP per capita` desc
            
            """).show(25)

+------------+--------------------+-------------------+--------------+
|Overall_rank|   Country_or_region|GDP_per_capita_Rank|GDP per capita|
+------------+--------------------+-------------------+--------------+
|          29|               Qatar|                  1|         1.684|
|          14|          Luxembourg|                  2|         1.609|
|          34|           Singapore|                  3|         1.572|
|          21|United Arab Emirates|                  4|         1.503|
|          51|              Kuwait|                  5|           1.5|
|          16|             Ireland|                  6|         1.499|
|           3|              Norway|                  7|         1.488|
|           6|         Switzerland|                  8|         1.452|
|          76|           Hong Kong|                  9|         1.438|
|          19|       United States|                 10|         1.433|
|          28|        Saudi Arabia|                 11|         1.403|
|     

In [26]:
pandas_df_19

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [27]:
pandas_df_15

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.03880,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
...,...,...,...,...,...,...,...,...,...,...,...,...
153,Rwanda,Sub-Saharan Africa,154,3.465,0.03464,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,0.67042
154,Benin,Sub-Saharan Africa,155,3.340,0.03656,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,1.63328
155,Syria,Middle East and Northern Africa,156,3.006,0.05015,0.66320,0.47489,0.72193,0.15684,0.18906,0.47179,0.32858
156,Burundi,Sub-Saharan Africa,157,2.905,0.08658,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,1.83302


In [30]:
temp_res= pd.concat([
    pandas_df_15[['Country','Happiness Rank']].assign(Year='2015').rename(columns={'Happiness Rank':'Rank'}),
    pandas_df_16[['Country','Happiness Rank']].assign(Year='2016').rename(columns={'Happiness Rank':'Rank'}),
    pandas_df_17[['Country','Happiness.Rank']].assign(Year='2017').rename(columns={'Happiness.Rank':'Rank'}),
    pandas_df_18[['Country or region','Overall rank']].assign(Year='2018').rename(columns={'Country or region':'Country','Overall rank':'Rank'}),
    pandas_df_19[['Country or region','Overall rank']].assign(Year='2019').rename(columns={'Country or region':'Country','Overall rank':'Rank'})
])

In [35]:
pandas_df_17[['Country','Happiness.Rank']].assign(Year='2017').rename(columns={'Happiness.Rank':'Rank'})


Unnamed: 0,Country,Rank,Year
0,Norway,1,2017
1,Denmark,2,2017
2,Iceland,3,2017
3,Switzerland,4,2017
4,Finland,5,2017
5,Netherlands,6,2017
6,Canada,7,2017
7,New Zealand,8,2017
8,Sweden,9,2017
9,Australia,10,2017


In [35]:
temp_res[temp_res['Rank']<=5]\
    .pivot(index="Year", columns='Rank', values='Country')\
    .rename(columns={1:'Rank_1', 2:'Rank_2', 3:'Rank_3', 4:'Rank_4', 5:'Rank_5'})\
    .rename_axis(None, axis=1)

Unnamed: 0_level_0,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,Switzerland,Iceland,Denmark,Norway,Canada
2016,Denmark,Switzerland,Iceland,Norway,Finland
2017,Norway,Denmark,Iceland,Switzerland,Finland
2018,Finland,Norway,Denmark,Iceland,Switzerland
2019,Finland,Denmark,Norway,Iceland,Netherlands


### Using spark DataFrames

In [51]:
spark_df_15['Country','Happiness Rank'].withColumnRenamed('Happiness Rank','Rank').withColumn('Year',F.lit(2015))\
.union(spark_df_16['Country','Happiness Rank'].withColumnRenamed('Happiness ank','Rank').withColumn('Year',F.lit(2016))).count()

315

In [93]:
spark_df_15['Country','Happiness Rank'].withColumnRenamed('Happiness Rank','Rank').withColumn('Year',F.lit(2015))\
.union(spark_df_16['Country','Happiness Rank'].withColumnRenamed('Happiness Rank','Rank').withColumn('Year',F.lit(2016)))\
.union(spark_df_17['Country','`Happiness.Rank`'].withColumnRenamed('`Happiness.Rank`','Rank').withColumn('Year',F.lit(2017)))\
.union(spark_df_18.select(F.col('Country or region').alias('Country'),F.col('Overall rank').alias('Rank'),F.lit(2018).alias('Year')))\
.union(spark_df_19.select(F.col('Country or region').alias('Country'),F.col('Overall rank').alias('Rank'),F.lit(2019).alias('Year')))\
.filter(F.col("Rank")<=5)\
.groupBy('Year').pivot('Rank').agg(F.first('Country'))\
.withColumnRenamed('1','Rank_1').withColumnRenamed('2','Rank_2').withColumnRenamed('3','Rank3')\
.withColumnRenamed('4','Rank_4').withColumnRenamed('5','Rank_5')\
.show()

+----+-----------+-----------+-------+-----------+-----------+
|Year|     Rank_1|     Rank_2|  Rank3|     Rank_4|     Rank_5|
+----+-----------+-----------+-------+-----------+-----------+
|2015|Switzerland|    Iceland|Denmark|     Norway|     Canada|
|2016|    Denmark|Switzerland|Iceland|     Norway|    Finland|
|2017|     Norway|    Denmark|Iceland|Switzerland|    Finland|
|2018|    Finland|     Norway|Denmark|    Iceland|Switzerland|
|2019|    Finland|    Denmark| Norway|    Iceland|Netherlands|
+----+-----------+-----------+-------+-----------+-----------+



In [137]:
spark.sql("""select t.Year,t.Country,t.Rank from (select Country,`Happiness Rank` Rank, 2015  `Year` from spark_tbl_15
            union select Country,`Happiness Rank` Rank, 2016  `Year` from spark_tbl_16
            union select Country,`Happiness.Rank` Rank, 2017  `Year`  from spark_tbl_17
            union select `Country or region` Country,`Overall rank` Rank, 2018 `Year` from spark_tbl_18
            union select `Country or region` Country,`Overall rank` Rank, 2019 `Year` from spark_tbl_19
            ) t
            where t.Rank<=5
            group by t.Year,t.Country,t.Rank

""").show()

+----+-----------+----+
|Year|    Country|Rank|
+----+-----------+----+
|2015|    Denmark|   3|
|2015|    Iceland|   2|
|2015|     Canada|   5|
|2015|     Norway|   4|
|2015|Switzerland|   1|
|2016|     Norway|   4|
|2016|Switzerland|   2|
|2016|    Finland|   5|
|2016|    Denmark|   1|
|2016|    Iceland|   3|
|2017|    Denmark|   2|
|2017|Switzerland|   4|
|2017|    Iceland|   3|
|2017|     Norway|   1|
|2017|    Finland|   5|
|2018|     Norway|   2|
|2018|    Iceland|   4|
|2018|    Finland|   1|
|2018|Switzerland|   5|
|2018|    Denmark|   3|
+----+-----------+----+
only showing top 20 rows



In [198]:
spark.sql("""select t.Year,case when t.Rank=1 then t.Country END `Rank1`,
            case when t.Rank=2 then t.Country END `Rank2`,
            case when t.Rank=3 then t.Country END `Rank3`,
            case when t.Rank=4 then t.Country END `Rank4`,
            case when t.Rank=5 then t.Country END `Rank5`
            from 
            (select Country,`Happiness Rank` Rank, 2015  `Year` from spark_tbl_15
            union select Country,`Happiness Rank` Rank, 2016  `Year` from spark_tbl_16
            union select Country,`Happiness.Rank` Rank, 2017  `Year`  from spark_tbl_17
            union select `Country or region` Country,`Overall rank` Rank, 2018 `Year` from spark_tbl_18
            union select `Country or region` Country,`Overall rank` Rank, 2019 `Year` from spark_tbl_19
            ) t
            where t.Rank<=5
            group by t.Year,t.Country,t.Rank
            

""").show()

+----+-----------+-----------+-------+-----------+-----------+
|Year|      Rank1|      Rank2|  Rank3|      Rank4|      Rank5|
+----+-----------+-----------+-------+-----------+-----------+
|2015|       null|       null|Denmark|       null|       null|
|2015|       null|    Iceland|   null|       null|       null|
|2015|       null|       null|   null|       null|     Canada|
|2015|       null|       null|   null|     Norway|       null|
|2015|Switzerland|       null|   null|       null|       null|
|2016|       null|       null|   null|     Norway|       null|
|2016|       null|Switzerland|   null|       null|       null|
|2016|       null|       null|   null|       null|    Finland|
|2016|    Denmark|       null|   null|       null|       null|
|2016|       null|       null|Iceland|       null|       null|
|2017|       null|    Denmark|   null|       null|       null|
|2017|       null|       null|   null|Switzerland|       null|
|2017|       null|       null|Iceland|       null|     

In [236]:
spark.sql("""with ref_tbl as 
            (select t.Year,t.Country,t.Rank from (select Country,`Happiness Rank` Rank, 2015  `Year` from spark_tbl_15
            union select Country,`Happiness Rank` Rank, 2016  `Year` from spark_tbl_16
            union select Country,`Happiness.Rank` Rank, 2017  `Year`  from spark_tbl_17
            union select `Country or region` Country,`Overall rank` Rank, 2018 `Year` from spark_tbl_18
            union select `Country or region` Country,`Overall rank` Rank, 2019 `Year` from spark_tbl_19
            ) t
            where t.Rank<=5)
            
            select * from ref_tbl
            PIVOT(
                    max(Country)
                    FOR Rank in (1 Rank_1, 2 Rank_2, 3 Rank_3, 4 Rank_4, 5 Rank_5) 
                )
            ORDER BY Year


""").show()

+----+-----------+-----------+-------+-----------+-----------+
|Year|     Rank_1|     Rank_2| Rank_3|     Rank_4|     Rank_5|
+----+-----------+-----------+-------+-----------+-----------+
|2015|Switzerland|    Iceland|Denmark|     Norway|     Canada|
|2016|    Denmark|Switzerland|Iceland|     Norway|    Finland|
|2017|     Norway|    Denmark|Iceland|Switzerland|    Finland|
|2018|    Finland|     Norway|Denmark|    Iceland|Switzerland|
|2019|    Finland|    Denmark| Norway|    Iceland|Netherlands|
+----+-----------+-----------+-------+-----------+-----------+



#### Using Spark SQL

Sor the data by biggest to lowest change between 2015 to  2019

In [247]:
spark.sql("""SELECT t1.Country, t1.`Happiness Rank` as Rank_2015, t2.`Happiness Rank` Rank_2016,
        t3.`Happiness.Rank` Rank_2017, t4.`Overall rank` Rank_2018,t5.`Overall rank` Rank_2019
        from spark_tbl_15 t1
        inner join spark_tbl_16 t2
        on t1.Country= t2.Country and t1.`Happiness Rank` >= t2.`Happiness Rank`
        inner join spark_tbl_17 t3
        on t1.Country= t3.Country and t2.`Happiness Rank` >= t3.`Happiness.Rank`
        inner join spark_tbl_18 t4
        on t1.Country= t4.`Country or region` and t3.`Happiness.Rank` >= t4.`Overall rank`
        inner join spark_tbl_19 t5
        on t1.Country= t5.`Country or region` and t4.`Overall rank` >= t5.`Overall rank`
        
""").show()

+-------------------+---------+---------+---------+---------+---------+
|            Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|
+-------------------+---------+---------+---------+---------+---------+
|            Finland|        6|        5|        5|        1|        1|
|        Netherlands|        7|        7|        6|        6|        5|
|        New Zealand|        9|        8|        8|        8|        8|
|     Czech Republic|       31|       27|       23|       21|       20|
|              Malta|       37|       30|       27|       22|       22|
|           Slovakia|       45|       45|       40|       39|       38|
|              Italy|       50|       50|       48|       47|       36|
|             Poland|       60|       57|       46|       42|       40|
|            Estonia|       73|       72|       66|       63|       55|
|            Romania|       86|       71|       57|       52|       48|
|             Latvia|       89|       68|       54|       53|   

In [249]:
spark.sql("""SELECT t1.Country, t1.`Happiness Rank` as Rank_2015, t2.`Happiness Rank` Rank_2016,
        t3.`Happiness.Rank` Rank_2017, t4.`Overall rank` Rank_2018,t5.`Overall rank` Rank_2019,
        (t1.`Happiness Rank` - t5.`Overall rank`) Increment_factor
        from spark_tbl_15 t1
        inner join spark_tbl_16 t2
        on t1.Country= t2.Country and t1.`Happiness Rank` >= t2.`Happiness Rank`
        inner join spark_tbl_17 t3
        on t1.Country= t3.Country and t2.`Happiness Rank` >= t3.`Happiness.Rank`
        inner join spark_tbl_18 t4
        on t1.Country= t4.`Country or region` and t3.`Happiness.Rank` >= t4.`Overall rank`
        inner join spark_tbl_19 t5
        on t1.Country= t5.`Country or region` and t4.`Overall rank` >= t5.`Overall rank`
        order by Increment_factor desc
        
""").show(200)

+-------------------+---------+---------+---------+---------+---------+----------------+
|            Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Increment_factor|
+-------------------+---------+---------+---------+---------+---------+----------------+
|              Benin|      155|      153|      143|      136|      102|              53|
|        Ivory Coast|      151|      139|      128|      107|       99|              52|
|           Honduras|      105|      104|       91|       72|       59|              46|
|            Hungary|      104|       91|       75|       69|       62|              42|
|            Romania|       86|       71|       57|       52|       48|              38|
|       Burkina Faso|      152|      145|      134|      121|      115|              37|
|           Cameroon|      133|      114|      107|       99|       96|              37|
|           Bulgaria|      134|      129|      105|      100|       97|              37|
|Congo (Brazzaville)|

In [264]:
pd.merge(pandas_df_15[['Country','Happiness Rank']], 
         pandas_df_16[['Country','Happiness Rank']], on='Country')\
        .rename(columns={'Happiness Rank_x':'Rank_2015', 'Happiness Rank_y':'Rank_2016'})\
        .query("Rank_2015 >= Rank_2016")\
        .merge(pandas_df_17[['Country','Happiness.Rank']], on='Country')\
        .rename(columns={'Happiness.Rank':'Rank_2017'})\
        .query("Rank_2016 >= Rank_2017")\
        .merge(pandas_df_18[['Country or region','Overall rank']], left_on='Country', right_on='Country or region')\
        .rename(columns={'Country or region':'Country', 'Overall rank': 'Rank_2018'})\
        .query("Rank_2017 >= Rank_2018")\
        .merge(pandas_df_19[['Country or region','Overall rank']], left_on='Country', right_on='Country or region')\
        .rename(columns={'Country or region':'Country', 'Overall rank': 'Rank_2019'})\
        .query("Rank_2018 >= Rank_2019")

ValueError: The column label 'Country' is not unique.

In [295]:
temp_res = pd.merge(pandas_df_15[['Country','Happiness Rank']], 
         pandas_df_16[['Country','Happiness Rank']], on='Country')\
        .rename(columns={'Happiness Rank_x':'Rank_2015', 'Happiness Rank_y':'Rank_2016'})\
        .query("Rank_2015 >= Rank_2016")\
        .merge(pandas_df_17[['Country','Happiness.Rank']], on='Country')\
        .rename(columns={'Happiness.Rank':'Rank_2017'})\
        .query("Rank_2016 >= Rank_2017")\
        .merge(pandas_df_18[['Country or region','Overall rank']], left_on='Country', right_on='Country or region')\
        .rename(columns={ 'Overall rank': 'Rank_2018'})\
        .query("Rank_2017 >= Rank_2018")\
        .drop(columns = 'Country or region')\
        .merge(pandas_df_19[['Country or region','Overall rank']], left_on="Country", right_on="Country or region")\
        .rename(columns ={'Overall rank':'Rank_2019'})\
        .query("Rank_2018 >= Rank_2019")\
        .drop(columns='Country or region')
        
temp_res.assign(Increment_Factor = temp_res['Rank_2015']-temp_res['Rank_2019'])\
    .sort_values('Increment_Factor',ascending=False)\
    .style.hide_index()

Country,Rank_2015,Rank_2016,Rank_2017,Rank_2018,Rank_2019,Increment_Factor
Benin,155,153,143,136,102,53
Ivory Coast,151,139,128,107,99,52
Honduras,105,104,91,72,59,46
Hungary,104,91,75,69,62,42
Romania,86,71,57,52,48,38
Cameroon,133,114,107,99,96,37
Burkina Faso,152,145,134,121,115,37
Bulgaria,134,129,105,100,97,37
Cambodia,145,140,129,120,109,36
Congo (Brazzaville),139,127,124,114,103,36


In [333]:
spark_df_15['Country','Happiness Rank']\
    .withColumnRenamed("Happiness Rank",'Rank_2015')\
    .join(spark_df_16['Country','`Happiness Rank`'].withColumnRenamed('Happiness Rank','Rank_2016'),on='Country')\
    .filter(F.col('Rank_2015')>=F.col('Rank_2016'))\
    .join(spark_df_17['Country','`Happiness.Rank`'].withColumnRenamed('Happiness.Rank','Rank_2017'),on='Country')\
    .filter(F.col('Rank_2016')>=F.col('Rank_2017'))\
    .join(spark_df_18['`Country or region`','`Overall rank`'].withColumnRenamed('Country or region','Country').withColumnRenamed('Overall rank','Rank_2018'),on='Country')\
    .filter(F.col('Rank_2017')>=F.col('Rank_2017'))\
    .join(spark_df_19['`Country or region`','`Overall rank`'].withColumnRenamed('Country or region','Country').withColumnRenamed('Overall rank','Rank_2019'),on='Country')\
    .filter(F.col('Rank_2018')>=F.col('Rank_2019'))\
    .withColumn('Increment_Factor',F.col('Rank_2015')-F.col('Rank_2019'))\
    .sort('Increment_Factor',ascending=False)\
    .show(200)

+-------------------+---------+---------+---------+---------+---------+----------------+
|            Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Increment_Factor|
+-------------------+---------+---------+---------+---------+---------+----------------+
|              Benin|      155|      153|      143|      136|      102|              53|
|        Ivory Coast|      151|      139|      128|      107|       99|              52|
|           Honduras|      105|      104|       91|       72|       59|              46|
|            Hungary|      104|       91|       75|       69|       62|              42|
|            Romania|       86|       71|       57|       52|       48|              38|
|           Cameroon|      133|      114|      107|       99|       96|              37|
|           Bulgaria|      134|      129|      105|      100|       97|              37|
|       Burkina Faso|      152|      145|      134|      121|      115|              37|
|           Cambodia|

### Find thecountries which continuously observed drop in rank along with year wise rank along with year wise rank anddifference of rank from 2015 to 2019

* sort the data by biggestto lowest change between 2015 to 2015

In [351]:
spark.sql("""
    select t1.Country, t1.`Happiness Rank` as Rank_2015,t2.`Happiness Rank` Rank_2016, 
    t3.`Happiness.Rank` Rank_2017, t4.`Overall rank` Rank_2018, t5.`Overall rank` Rank_2019, 
    t1.`Happiness Rank`-t5.`Overall rank` Decrement_Factor
    from spark_tbl_15 t1 
    inner join spark_tbl_16 t2 on t1.Country=t2.Country  and t1.`Happiness Rank` < t2.`Happiness Rank`
    inner join spark_tbl_17 t3 on t2.Country=t3.Country and t2.`Happiness Rank` < t3.`Happiness.Rank`
    inner join spark_tbl_18 t4 on t3.Country=t4.`Country or region` and t3.`Happiness.Rank` < t4.`Overall rank`
    inner join spark_tbl_19 t5 on t4.`Country or region`= t5.`Country or region` and t4.`Overall rank` < t5.`Overall rank`
    order by Decrement_Factor
""").show()

+-----------+---------+---------+---------+---------+---------+----------------+
|    Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Decrement_Factor|
+-----------+---------+---------+---------+---------+---------+----------------+
|  Venezuela|       23|       44|       82|      102|      108|             -85|
|     Zambia|       85|      106|      116|      125|      138|             -53|
|   Zimbabwe|      115|      131|      138|      144|      146|             -31|
|      India|      117|      118|      122|      133|      140|             -23|
|    Belarus|       59|       61|       67|       73|       81|             -22|
|   Botswana|      128|      137|      142|      146|      148|             -20|
|     Malawi|      131|      132|      136|      147|      150|             -19|
|    Moldova|       52|       55|       56|       67|       71|             -19|
|     Brazil|       16|       17|       22|       28|       32|             -16|
| Azerbaijan|       80|     

In [370]:
temp_res = pd.merge(pandas_df_15[['Country','Happiness Rank']], pandas_df_16[['Country','Happiness Rank']], on='Country')\
    .rename(columns={'Happiness Rank_x':'Rank_2015', 'Happiness Rank_y':'Rank_2016'})\
    .query("Rank_2015 < Rank_2016")\
    .merge(pandas_df_17[['Country','Happiness.Rank']], on='Country')\
    .rename(columns={'Happiness.Rank':'Rank_2017'})\
    .query("Rank_2016 < Rank_2017")\
    .merge(pandas_df_18[['Country or region','Overall rank']],left_on='Country',right_on='Country or region')\
    .rename(columns={'Overall rank':'Rank_2018'})\
    .drop(columns='Country or region')\
    .query("Rank_2017 < Rank_2018")\
    .merge(pandas_df_19[['Country or region','Overall rank']],left_on='Country',right_on='Country or region')\
    .rename(columns={'Overall rank':'Rank_2019'})\
    .drop(columns='Country or region')\
    .query("Rank_2018 < Rank_2019")

temp_res.assign(Decrement_Factor = temp_res['Rank_2015']-temp_res['Rank_2019'])\
    .sort_values('Decrement_Factor', ascending=True)

    

Unnamed: 0,Country,Rank_2015,Rank_2016,Rank_2017,Rank_2018,Rank_2019,Decrement_Factor
2,Venezuela,23,44,82,102,108,-85
9,Zambia,85,106,116,125,138,-53
11,Zimbabwe,115,131,138,144,146,-31
12,India,117,118,122,133,140,-23
4,Belarus,59,61,67,73,81,-22
15,Botswana,128,137,142,146,148,-20
3,Moldova,52,55,56,67,71,-19
16,Malawi,131,132,136,147,150,-19
1,Brazil,16,17,22,28,32,-16
8,Azerbaijan,80,81,85,87,90,-10


In [396]:
spark_df_15['Country','Happiness Rank']\
    .withColumnRenamed('Happiness Rank', 'Rank_2015')\
    .join(spark_df_16['Country','Happiness Rank'].withColumnRenamed('Happiness Rank','Rank_2016'),on='Country')\
    .filter(F.col('Rank_2015')<F.col('Rank_2016'))\
    .join(spark_df_17['Country','`Happiness.Rank`'].withColumnRenamed('Happiness.Rank','Rank_2017'),on='Country')\
    .filter(F.col('Rank_2016')<F.col('Rank_2017'))\
    .join(spark_df_18['Country or region','Overall rank'].\
          withColumnRenamed('Overall rank','Rank_2018').withColumnRenamed('Country or region','Country')
          ,on='Country')\
    .filter(F.col('Rank_2017')<F.col('Rank_2018'))\
    .join(spark_df_19['Country or region','Overall rank'].\
          withColumnRenamed('Overall rank','Rank_2019').withColumnRenamed('Country or region','Country')
          ,on='Country')\
    .filter(F.col('Rank_2018')<F.col('Rank_2019'))\
    .withColumn('Decrement_Factor',F.col('Rank_2015')-F.col('Rank_2019'))\
    .sort('Decrement_Factor')\
    .show()

+-----------+---------+---------+---------+---------+---------+----------------+
|    Country|Rank_2015|Rank_2016|Rank_2017|Rank_2018|Rank_2019|Decrement_Factor|
+-----------+---------+---------+---------+---------+---------+----------------+
|  Venezuela|       23|       44|       82|      102|      108|             -85|
|     Zambia|       85|      106|      116|      125|      138|             -53|
|   Zimbabwe|      115|      131|      138|      144|      146|             -31|
|      India|      117|      118|      122|      133|      140|             -23|
|    Belarus|       59|       61|       67|       73|       81|             -22|
|   Botswana|      128|      137|      142|      146|      148|             -20|
|     Malawi|      131|      132|      136|      147|      150|             -19|
|    Moldova|       52|       55|       56|       67|       71|             -19|
|     Brazil|       16|       17|       22|       28|       32|             -16|
| Azerbaijan|       80|     