In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate everything mounted under the Kaggle input directory and print each file's full path.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# No offence at all to the original dataset poster's way of handling the 'raw' dataset, but here is my take on how to clean it up. In the end I arrive at the same result as the 'cleaned' version published by the dataset poster.

# I decided to show my own approach here because the dataset poster apparently missed a few key points as to why the columns were named with suffixes .1 and .2 and what they represented, and proceeded with wrangling the data with brute force, while those columns with suffixes DID mean something (see below)

# I thought it'd be helpful for people to understand this fact, hence I made this notebook. Thank you.

In [None]:
# Quick look at the raw CSV as pandas parses it with default settings (first five rows only).
pd.read_csv(
    '/kaggle/input/obesity-among-adults-by-country-19752016/data.csv'
).head(5)

# This is just a peek at what the raw dataset looks like; we'll import it properly in the next step

### We see here that the columns are named as 2016, 2016.1, 2016.2 etc.
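### What does this imply? Is this an error? No. pandas appends .1, .2, ... to repeated column names when it reads a flat header, so these suffixes mean that the header row was actually part of a multi-index column header.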
### When we inspect further, we see that the third row has three unique values, 'Both sexes', 'Male' and 'Female',
### these are exactly the second levels of the multi-index columns, the top level being the years.

### Rows 0 and 1 of the preview above (lines 1 and 2 of the file, counting the header line as line 0) are garbage rows, hence we'll get rid of them.

In [None]:
# Re-read the file with a two-level column header:
#   - skiprows drops file lines 1 and 2 (the garbage rows),
#   - header=[0, 1] turns the first two remaining lines into the column MultiIndex,
#   - index_col=0 makes the first column the row index.
df = pd.read_csv(
    '/kaggle/input/obesity-among-adults-by-country-19752016/data.csv',
    index_col=0,
    skiprows=[1, 2],
    header=[0, 1],
)
df.head(5)

# Note that the zeroth line of the file is the one containing the years, and the first (technically, second) and
# second (technically, third) lines are the garbage rows, which is why skiprows=[1, 2] is used.

# I've made the country column the index, and the 'new' first two rows (after dropping the garbage rows) form my
# multi-indexed columns

#### Now if we look at the column names, we'll see that 'Country' is actually the name of the second level of the column multi index, and not the name of the country column (basically the index)!

In [None]:
# Inspect the names currently attached to the two column levels.
df.columns.names

In [None]:
# Label the two column levels: the outer one holds the years, the inner one the genders.
df.columns = df.columns.set_names(['Year', 'Gender'])
df.columns.names  # confirm the new level names -- good!

#### And now we'll set the index name to be 'Country'

In [None]:
# Inspect the name(s) currently attached to the row index.
df.index.names

In [None]:
# The row index has a single level (the countries), so name it directly.
df.index.name = 'Country'

In [None]:
# With both axes labelled, the dataframe now looks more like it should.
df.head(5)

If we want to sort the years in ascending order, we can do it simply by doing this:

In [None]:
# Optional step: order the columns by the outermost level (the years) of the column MultiIndex.
df = df.sort_index(level=0, axis='columns')
df.head(5)

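# axis=1 means "sort the columns", and level=0 means "sort by the topmost level of the column multi-index" (the years); because sort_remaining defaults to True, the genders within each year get sorted as well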

## Now although this dataframe above is ready for exploratory data analysis, we can give it the form of what the dataset poster has posted

### The dataset poster has years in a single column, hence we'll do it too by using stack

In [None]:
# Move the outermost column level (level 0, the years) down into the row index,
# so each (country, year) pair becomes one row; the remaining columns are the genders.
y = df.stack(level=0)
y

### Now we'll reset the index to turn the index levels into regular columns

In [None]:
# Keep the index levels as ordinary columns (drop=False) and fall back to a plain integer index.
y = y.reset_index(drop=False)
y

### Now we'll melt this 'y' dataframe to have all the genders in a single column:

In [None]:
# Go from wide to long: Country and Year identify each row, the three gender
# columns are folded into a single column, and their values land in 'Obesity levels'.
y = y.melt(
    id_vars=['Country', 'Year'],
    value_vars=['Both sexes', 'Female', 'Male'],
    value_name='Obesity levels',
)
y

### But now we notice that all the rows of each gender are grouped together, which is not the layout the dataset poster had in mind!

### Hence we'll sort this dataframe as follows:

In [None]:
# Order the rows by country first, then by year within each country.
y = y.sort_values(['Country', 'Year'])
y

### We'll just reset the index now, and we're done!

In [None]:
# The sort above left the old row labels shuffled. Replace them with a fresh
# 0..n-1 index and rebind `y`, so the final frame actually keeps the clean index
# instead of only showing it once. drop=True discards the old labels rather than
# adding them as a column, which also makes this cell safe to re-run.
y = y.reset_index(drop=True)
y

## This is what the dataset poster's "cleaned dataframe" looked like, btw:

In [None]:
# Load the dataset poster's own cleaned file for comparison, using its first column as the index.
pd.read_csv(
    '/kaggle/input/obesity-among-adults-by-country-19752016/obesity-cleaned.csv',
    index_col=0,
)