In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    input_paths = [os.path.join(folder, name) for name in names]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth, title='data'):
    """Draw the pairwise correlation matrix of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Non-numeric columns, columns containing NaN and
        constant columns are left out of the plot. The caller's frame is
        not modified.
    graphWidth : float
        Width and height of the (square) figure, in inches.
    title : str, default 'data'
        Name of the data set, inserted into the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When fewer than two
        usable columns remain, a message is printed and nothing is drawn.
    """
    # DataFrame.corr() raises on string columns in pandas >= 2.0 (it no longer
    # skips them silently), so restrict to numeric columns up front.
    df = df.select_dtypes(include=[np.number])
    # `axis` has to be passed by keyword: the positional form was removed in pandas 2.0.
    df = df.dropna(axis='columns')  # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Target the figure created above; a hard-coded fignum=1 could draw into
    # some other figure that happens to be open already.
    corrMat = plt.matshow(corr, fignum=fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()  # matshow puts the x ticks on top by default
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {title}', fontsize=15)
    plt.show()

In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter-plot matrix with kernel-density plots on the diagonal.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Only numeric columns that contain no NaN and have more
        than one distinct value are plotted, and at most the first 10 of
        those. The caller's frame is not modified.
    plotSize : float
        Width and height of the (square) figure, in inches.
    textSize : float
        Currently unused: the correlation-coefficient annotations it used to
        size are disabled. Kept so existing calls remain valid.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When no usable column
        remains, a message is printed and nothing is drawn.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove columns that would lead to df being singular.
    # `axis` has to be passed by keyword: the positional form was removed in pandas 2.0.
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    if df.shape[1] < 1:
        # scatter_matrix cannot build a grid with zero columns.
        print('No scatter plots shown: there are no non-NaN, non-constant numeric columns')
        return
    # Reduce the number of columns for the matrix inversion of the kernel density plots.
    df = df[list(df)[:10]]
    pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    plt.suptitle('Scatter and Density Plot')
    plt.show()


### reading data 

In [None]:
# Load the 2016 file of the world-happiness dataset from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/world-happiness/2016.csv")

### understand the data

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# First five rows of the table.
data.head()

#### The data is already clean

### Look at the data
* Look at some countries

In [None]:
# Show the rows of a handful of hand-picked countries as a quick spot check.
country = ['Denmark','Palestinian Territories','Egypt','Syria','Burundi','Australia']
data[data['Country'].isin(country)]


### We need to answer some questions:
* Which is the happiest region in the world?
* Which are the happiest countries?
* Which are the least happy countries?
* What are the most important contributing factors to a nation’s happiness?

In [None]:
# Distinct region labels present in the data.
data['Region'].unique()

*  #### Which is the happiest region in the world?

In [None]:
# Number of countries in each region, largest region first.
# value_counts() does this in one vectorised pass instead of re-filtering
# the whole frame once per region.
data['Region'].value_counts()

In [None]:
# Average happiness score and number of countries for every region.
# A single groupby replaces the Python loop that filtered the frame twice
# per region; sort=False keeps the regions in order of first appearance.
rows_list = (
    data.groupby('Region', sort=False)['Happiness Score']
        .agg(['mean', 'size'])
        .reset_index()
        .rename(columns={'mean': 'Happiness Score Per Region',
                         'size': 'numCountriesInRegion'})
        .to_dict('records')  # list of row dicts, consumed by pd.DataFrame(rows_list) below
)

In [None]:
# Tabulate the per-region summary, happiest region first.
df = pd.DataFrame(rows_list).sort_values(by='Happiness Score Per Region', ascending=False)
df

In [None]:
# Bar chart of the average happiness score per region (bars follow the row order of df).
plt.figure(figsize=(10,10))
# x and y must be given by keyword: seaborn >= 0.12 rejects them as positional arguments.
ax = sns.barplot(data=df, x='Region', y='Happiness Score Per Region')
ax.set_title('Average happiness score by region')
# Rotate the long region names so they do not overlap; the trailing ';' hides the list repr.
plt.setp(ax.get_xticklabels(), rotation=40, ha='right');

#### *Regions in descending order of average happiness score*

* ### The happiest countries

In [None]:
# Bar chart of the ten countries with the highest happiness score.
plt.figure(figsize=(10,10))
# nlargest selects by score rather than relying on the row order of the CSV.
# x and y must be given by keyword: seaborn >= 0.12 rejects them as positional arguments.
ax = sns.barplot(data=data.nlargest(10, 'Happiness Score'), x='Country', y='Happiness Score')
ax.set_title('Ten happiest countries')
# Rotate the country names so they do not overlap; the trailing ';' hides the list repr.
plt.setp(ax.get_xticklabels(), rotation=40, ha='right');

* ### The least happy countries

In [None]:
# Bar chart of the ten countries with the lowest happiness score.
plt.figure(figsize=(10,10))
# Sort by score explicitly rather than relying on the row order of the CSV;
# the bars run from the happiest of the bottom ten down to the least happy.
least_happy = data.sort_values('Happiness Score', ascending=False).tail(10)
# x and y must be given by keyword: seaborn >= 0.12 rejects them as positional arguments.
ax = sns.barplot(data=least_happy, x='Country', y='Happiness Score')
ax.set_title('Ten least happy countries')
# Rotate the country names so they do not overlap; the trailing ';' hides the list repr.
plt.setp(ax.get_xticklabels(), rotation=40, ha='right');

* #### What are the most important contributing factors to a nation’s happiness?

In [None]:
# Correlation matrix of the table's columns, drawn as a 10 x 10 inch figure.
plotCorrelationMatrix(data, 10)

In [None]:
# Scatter/density matrix of the numeric columns, drawn as a 20 x 20 inch figure (textSize argument 8).
plotScatterMatrix(data,20, 8)

**we can look at some important features**

In [None]:
# Pairwise regression plots of the happiness score and its two confidence bounds.
sns.pairplot(
    data,
    kind="reg",
    vars=['Happiness Score', 'Lower Confidence Interval', 'Upper Confidence Interval'],
)
plt.show()

In [None]:
# Same pairwise regression plots, with points and fits coloured by region.
sns.pairplot(
    data,
    hue='Region',
    kind="reg",
    vars=['Happiness Score', 'Lower Confidence Interval', 'Upper Confidence Interval'],
)
plt.show()

### We could dig deeper, but in this notebook I just wanted to ask some simple questions and answer them