# Data Analysis with Python - freeCodeCamp Course

## Introduction to Pandas (DataFrames)

In [1]:
import numpy as np
import pandas as pd

### Pandas DataFrames

In [2]:
# Build the frame from row-wise records: each tuple is one row and `columns`
# names the tuple positions. No index is given, so rows are labelled 0..6.
df = pd.DataFrame(
    data=[
        (35.467, 1785387, 9984670, 0.913, 'America'),
        (63.951, 2833687, 640679, 0.888, 'Europe'),
        (80.94, 3874437, 357114, 0.916, 'Europe'),
        (60.665, 2167744, 301336, 0.873, 'Europe'),
        (127.061, 4602367, 377930, 0.891, 'Asia'),
        (64.511, 2950039, 242495, 0.907, 'Europe'),
        (318.523, 17348075, 9525067, 0.915, 'America'),
    ],
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

# A bare expression on the last line of a cell is rendered as the cell output.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [3]:
# Replace the default integer index (0..6) with country names.
# The list is positional: its i-th label is attached to the i-th row.
# Note: this assignment modifies `df` in place.
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [5]:
# Column labels of the frame, returned as an Index.
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [6]:
# Row labels of the frame (the country names), also returned as an Index.
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [7]:
# Print a summary of the frame: index range, per-column non-null counts
# and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [8]:
# Total number of cells: rows * columns (7 * 5 = 35).
df.size

35

In [9]:
# Dimensions as a tuple: (number of rows, number of columns).
df.shape

(7, 5)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the non-numeric 'Continent' column is left out of the result.
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [12]:
# Data type of each column, returned as a Series indexed by column name.
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [13]:
# Count how many columns share each dtype.
df.dtypes.value_counts()

float64    2
int64      2
object     1
Name: count, dtype: int64

### Indexing, Selection and Slicing

In [15]:
# Select a single row by its index label with .loc.
# The row comes back as a Series whose index is the column names.
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [16]:
# Select a single row by integer position with .iloc.
# Positions start at 0, so position 2 is the third row ('Germany').
df.iloc[2]

Population        80.94
GDP             3874437
Surface Area     357114
HDI               0.916
Continent        Europe
Name: Germany, dtype: object

In [18]:
# Select a single column by name with square brackets.
# The result is a Series indexed by the row labels (countries).
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [22]:
# Convert the selected column (a Series) into a one-column DataFrame.
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [25]:
# Passing a list of column names selects several columns and returns a DataFrame.
df[['Population','GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [28]:
# Label-based slices with .loc include BOTH endpoints:
# 'Germany' is part of the result.
df.loc['France':'Germany']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [29]:
# The second argument to .loc selects columns; a single label yields a Series.
df.loc['France':'Germany', 'Population']

France     63.951
Germany    80.940
Name: Population, dtype: float64

In [30]:
# A list of column labels as the second argument yields a DataFrame.
df.loc['France':'Germany', ['Population','GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437


In [31]:
# Position-based slices with .iloc exclude the stop position:
# 1:3 returns the rows at positions 1 and 2 only.
df.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [32]:
# Rows at positions 1-2, column at position 0 ('Population') -> Series.
df.iloc[1:3, 0]

France     63.951
Germany    80.940
Name: Population, dtype: float64

In [34]:
# Rows at positions 1-2, columns at positions 0-1 -> DataFrame.
df.iloc[1:3, 0:2]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437


### Conditional selection

In [35]:
# An element-wise comparison produces a boolean Series (a mask),
# with one True/False value per row.
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [36]:
# Pass the boolean mask to .loc to keep only the rows where it is True.
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [37]:
# Mask for the rows, single label for the column -> Series of GDP values
# for the matching rows.
df.loc[df['Population'] > 70, 'GDP']

Germany           3874437
Japan             4602367
United States    17348075
Name: GDP, dtype: int64

In [42]:
# Mask for the rows, list of labels for the columns -> DataFrame.
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


### Dropping rows and columns

In [44]:
# drop() returns a new DataFrame without the given row label;
# `df` itself is left unchanged.
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [45]:
# A list of labels drops several rows at once.
df.drop(['Canada','Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [46]:
# Use the `columns` keyword to drop a column instead of a row.
df.drop(columns = 'Population')

Unnamed: 0,GDP,Surface Area,HDI,Continent
Canada,1785387,9984670,0.913,America
France,2833687,640679,0.888,Europe
Germany,3874437,357114,0.916,Europe
Italy,2167744,301336,0.873,Europe
Japan,4602367,377930,0.891,Asia
United Kingdom,2950039,242495,0.907,Europe
United States,17348075,9525067,0.915,America


In [47]:
# A list of names drops several columns at once.
df.drop(columns = ['Population','GDP'])

Unnamed: 0,Surface Area,HDI,Continent
Canada,9984670,0.913,America
France,640679,0.888,Europe
Germany,357114,0.916,Europe
Italy,301336,0.873,Europe
Japan,377930,0.891,Asia
United Kingdom,242495,0.907,Europe
United States,9525067,0.915,America


### Operations

In [56]:
# The two numeric columns used as operands in the arithmetic examples below.
df[['Population','GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [49]:
# Arithmetic with a scalar is applied to every cell.
# Note that the integer GDP column becomes float in the result.
df[['Population','GDP']] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [65]:
# Per-column adjustments: the dict keys become the Series index, so each
# value is labelled with the column it is meant for.
crisis = pd.Series({'Population': -20, 'GDP': -1000000})
crisis

Population        -20
GDP          -1000000
dtype: int64

In [66]:
# Adding a Series to a DataFrame aligns the Series index with the frame's
# column names, so each column gets its own offset applied to every row.
df[['Population','GDP']] + crisis

Unnamed: 0,Population,GDP
Canada,15.467,785387
France,43.951,1833687
Germany,60.94,2874437
Italy,40.665,1167744
Japan,107.061,3602367
United Kingdom,44.511,1950039
United States,298.523,16348075


### Modifying DataFrames

In [67]:
# Languages for a subset of countries: the dict keys become the Series index
# (country labels) and `name` labels the Series itself.
langs = pd.Series(
    {'France': 'French', 'Italy': 'Italian', 'Japan': 'Japanese'},
    name='Languages',
)
langs

France      French
Italy      Italian
Japan     Japanese
Name: Languages, dtype: object

In [68]:
# Assigning a Series as a new column aligns on the index: countries that are
# absent from `langs` get a missing value. This modifies `df` in place.
df['Languages'] = langs
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [69]:
# Assigning a plain list overwrites the existing column positionally:
# one value per row, in row order. This modifies `df` in place.
df['Languages'] = ['English','French','German','Italian','Japanese','English','English']
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


In [71]:
# rename() accepts a function that is applied to every index label.
# It returns a new DataFrame; the labels in `df` are not changed.
df.rename(index = str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,French
GERMANY,80.94,3874437,357114,0.916,Europe,German
ITALY,60.665,2167744,301336,0.873,Europe,Italian
JAPAN,127.061,4602367,377930,0.891,Asia,Japanese
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


In [74]:
# rename() also accepts dicts mapping old -> new labels, for columns and
# index in the same call; labels missing from the dicts are kept as they are.
df.rename(columns = {'GDP' : 'Gross Domestic Product'}, index = {'United Kingdom' : 'UK', 'United States' : 'USA'})

Unnamed: 0,Population,Gross Domestic Product,Surface Area,HDI,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English
