## Création, lecture et écriture

In [4]:
import pandas as pd

## Creating data

There are two core objects in pandas: the DataFrame and the Series.

## DataFrame

In [5]:
# Build a DataFrame from a mapping of column name -> list of column values.
# No index is given, so the rows are labelled 0 and 1.
a = pd.DataFrame(
    data=dict(Yes=[50, 21], No=[131, 2]),
)
a

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [7]:
# Same constructor, but with explicit row labels supplied through `index`
# instead of the default integer labels.
b = pd.DataFrame(
    data=dict(Apple=[35, 41], Bananas=[21, 34]),
    index=['2017 Sales', '2018 Sales'],
)
b

Unnamed: 0,Apple,Bananas
2017 Sales,35,21
2018 Sales,41,34


In [None]:
# DataFrame values are not limited to numbers: here every cell is a string.
# No index is passed, so the rows get the default integer labels.
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 'Sue': ['Pretty good.', 'Bland.']})

In [6]:
# Same string data, this time with the rows labelled 'Product A' / 'Product B'
# through the `index` argument.
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']},
             index=['Product A', 'Product B'])

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


## Series

In [11]:
# A Series holds a single sequence of values; with no index given,
# the entries are labelled 0, 1, 2, ...
a = pd.Series(data=[1, 2, 5, 9, 5])
a

0    1
1    2
2    5
3    9
4    5
dtype: int64

In [12]:
# Look up the value stored under index label 3.
a[3]

9

In [13]:
# A Series can carry its own row labels (`index`) and an overall `name`.
pd.Series([30, 35, 40], index=['2015 Sales', '2016 Sales', '2017 Sales'], name='Product A')

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

## Reading data files

In [58]:
# Load the CSV file into a DataFrame that the rest of the notebook works on.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where that exact file exists -- consider a path relative to the notebook.
# NOTE(review): the variable is called `winequality`, but the file is covid.csv and
# the columns shown below are per-country case counts; the name is misleading.
winequality = pd.read_csv("C:/Users/BAH/Desktop/Formation/covid.csv")

In [59]:
# (number of rows, number of columns) of the loaded table.
winequality.shape

(209, 16)

In [60]:
# Preview the first five rows.
winequality.head()

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa


In [61]:
# wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv", index_col=0)
# wine_reviews.head()

## Indexation, sélection et affectation

La sélection de valeurs spécifiques dans un DataFrame ou une Series pandas est une étape implicite de presque toutes les opérations sur les données que vous exécuterez.
L'une des premières choses à apprendre lorsque l'on travaille avec des données en Python est donc de savoir sélectionner rapidement et efficacement les points de données qui vous intéressent.

In [78]:
# Show at most 5 rows when a DataFrame or Series is displayed.
# The fully qualified option name is used on purpose: the short form 'max_rows'
# is ambiguous in recent pandas releases (it also matches
# 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 5)

In [79]:
# Display the whole frame; with the row limit set to 5, only the first and
# last rows are printed and the middle is elided.
winequality

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
0,USA,North America,331198130.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
1,Brazil,South America,212710692.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,801.0,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa


In [81]:
# Attribute-style access returns the column as a Series.
winequality.Continent

0      North America
1      South America
           ...      
207           Europe
208           Africa
Name: Continent, Length: 209, dtype: object

In [83]:
# Attribute-style access again, this time on a numeric column.
winequality.Population

0      331198130.0
1      212710692.0
          ...     
207          801.0
208       598682.0
Name: Population, Length: 209, dtype: float64

In [86]:
# Bracket access is required here: 'Country/Region' contains a slash,
# so it cannot be written as an attribute name.
winequality['Country/Region']

0                 USA
1              Brazil
            ...      
207      Vatican City
208    Western Sahara
Name: Country/Region, Length: 209, dtype: object

In [87]:
# Select the column first, then the entry with index label 0, to get one value.
winequality['Country/Region'][0]

'USA'

## Indexing in pandas

### Index-based selection

In [88]:
# iloc selects by integer position: this is the first row, returned as a Series.
winequality.iloc[0]

Country/Region              USA
Continent         North America
                      ...      
Tests/1M pop           190640.0
WHO Region             Americas
Name: 0, Length: 16, dtype: object

In [92]:
# iloc is row-first, column-second: all rows of the column at position 0.
winequality.iloc[:, 0]


0                 USA
1              Brazil
            ...      
207      Vatican City
208    Western Sahara
Name: Country/Region, Length: 209, dtype: object

In [94]:
# Rows at positions 0-3 of the column at position 1; the end of an iloc
# slice is exclusive.
winequality.iloc[:4, 1]


0    North America
1    South America
2             Asia
3           Europe
Name: Continent, dtype: object

In [98]:
# Rows at positions 2 and 3 of the first column.
winequality.iloc[2:4, 0]


2     India
3    Russia
Name: Country/Region, dtype: object

In [191]:
# Rows given as an explicit list of positions, columns as the slice 0-2.
winequality.iloc[[0,1, 2,3], 0:3]

Unnamed: 0,Country/Region,Continent,Population
0,USA,North America,331198100.0
1,Brazil,South America,212710700.0
2,India,Asia,1381345000.0
3,Russia,Europe,145940900.0


In [109]:
# Negative positions count from the end: the last five rows.
winequality.iloc[-5:]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
204,Montserrat,North America,4992.0,13,,1.0,,10.0,,2.0,,2604.0,200.0,61.0,12220.0,
205,Caribbean Netherlands,North America,26247.0,13,,,,7.0,,6.0,,495.0,,424.0,16154.0,
206,Falkland Islands,South America,3489.0,13,,,,13.0,,0.0,,3726.0,,1816.0,520493.0,
207,Vatican City,Europe,801.0,12,,,,12.0,,0.0,,14981.0,,,,Europe
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa


### Label-based selection

In [193]:
# loc selects by label: row label 0, column named 'Country/Region'.
winequality.loc[0, 'Country/Region']

'USA'

In [194]:
# Unlike iloc, a loc slice includes its end label, so :3 returns rows 0 through 3.
winequality.loc[:3, ['Country/Region', 'Continent', 'Population']]

Unnamed: 0,Country/Region,Continent,Population
0,USA,North America,331198100.0
1,Brazil,South America,212710700.0
2,India,Asia,1381345000.0
3,Russia,Europe,145940900.0


## Manipulating the index

In [117]:
# Returns a new frame that uses the ActiveCases column as its index.
# The result is not assigned, so `winequality` itself keeps its original index.
winequality.set_index("ActiveCases")

Unnamed: 0_level_0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
ActiveCases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2292707.0,USA,North America,331198130.0,5032179,,162804.0,,2576668.0,,18296.0,15194.0,492.0,63139605.0,190640.0,Americas
771258.0,Brazil,South America,212710692.0,2917562,,98644.0,,2047660.0,,8318.0,13716.0,464.0,13206188.0,62085.0,Americas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.0,Vatican City,Europe,801.0,12,,,,12.0,,,14981.0,,,,Europe
1.0,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,,17.0,2.0,,,Africa


## Conditional selection

In [195]:
# Comparing a column with a value yields a boolean Series, one entry per row.
winequality.Continent == 'Africa'

0      False
1      False
       ...  
207    False
208     True
Name: Continent, Length: 209, dtype: bool

In [196]:
# Passing the boolean Series to loc keeps only the rows where it is True.
winequality.loc[winequality.Continent == 'Africa']

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
4,South Africa,Africa,59381566.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa,205,everyone
26,Egypt,Africa,102516525.0,95006,,4951.0,,48898.0,,41157.0,41.0,927.0,48.0,135000.0,1317.0,EasternMediterranean,183,everyone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,Seychelles,Africa,98408.0,126,,,,124.0,,2.0,,1280.0,,,,Africa,25,everyone
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa,1,everyone


In [201]:
# & combines conditions (both must hold). Each condition needs its own
# parentheses because & binds more tightly than == and >=.
winequality.loc[(winequality.Continent == 'Africa') & (winequality.TotalCases >= 5000)]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
4,South Africa,Africa,59381566.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa,205,everyone
26,Egypt,Africa,102516525.0,95006,,4951.0,,48898.0,,41157.0,41.0,927.0,48.0,135000.0,1317.0,EasternMediterranean,183,everyone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Mauritania,Africa,4660728.0,6444,,157.0,,5291.0,,996.0,3.0,1383.0,34.0,57387.0,12313.0,Africa,117,everyone
97,Djibouti,Africa,989387.0,5330,,59.0,,5057.0,,214.0,,5387.0,60.0,59909.0,60552.0,EasternMediterranean,112,everyone


In [203]:
# | keeps the rows that satisfy at least one of the two conditions.
winequality.loc[(winequality.Continent == 'Africa') | (winequality.TotalCases >= 5000)]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
0,USA,North America,331198130.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas,209,everyone
1,Brazil,South America,212710692.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas,208,everyone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,Seychelles,Africa,98408.0,126,,,,124.0,,2.0,,1280.0,,,,Africa,25,everyone
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa,1,everyone


In [202]:
# isin tests each value for membership in the given list.
winequality.loc[winequality.Continent.isin(['Europe', 'Africa'])]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
3,Russia,Europe,145940924.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe,206,everyone
4,South Africa,Africa,59381566.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa,205,everyone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,801.0,12,,,,12.0,,0.0,,14981.0,,,,Europe,2,everyone
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa,1,everyone


In [135]:
# notnull is True where a value is present: keep rows whose NewCases is not missing.
winequality.loc[winequality.NewCases.notnull()]

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region
5,Mexico,North America,129066160.0,462690,6590.0,50517.0,819.0,308848.0,4140.0,103325.0,3987.0,3585.0,391.0,1056915.0,8189.0,Americas
28,Bolivia,South America,11688459.0,86423,1282.0,3465.0,80.0,27373.0,936.0,55585.0,71.0,7394.0,296.0,183583.0,15706.0,Americas
72,S. Korea,Asia,51273732.0,14519,20.0,303.0,1.0,13543.0,42.0,673.0,18.0,283.0,6.0,1613652.0,31471.0,WesternPacific
146,Jamaica,North America,2962478.0,958,30.0,12.0,,745.0,,201.0,,323.0,4.0,41840.0,14123.0,Americas


## Assigning data

In [137]:
# Assigning a single value to a column sets that value on every row.
# This adds the column to `winequality` in place.
winequality['critic'] = 'everyone'
winequality['critic']

0      everyone
1      everyone
         ...   
207    everyone
208    everyone
Name: critic, Length: 209, dtype: object

In [141]:
# Assigning an iterable gives one value per row: here a countdown from
# len(winequality) down to 1. This also modifies `winequality` in place.
winequality['index_backwards'] = range(len(winequality), 0, -1)
winequality['index_backwards']

0      209
1      208
      ... 
207      2
208      1
Name: index_backwards, Length: 209, dtype: int64

## Summary Functions and Maps

### Summary functions

In [144]:
# Summary statistics of the column; `count` covers only the non-missing values.
winequality.NewCases.describe()

count       4.0
mean     1980.5
          ...  
75%      2609.0
max      6590.0
Name: NewCases, Length: 8, dtype: float64

In [145]:
# Same summary for ActiveCases.
winequality.ActiveCases.describe()

count    2.050000e+02
mean     2.766433e+04
             ...     
75%      7.124000e+03
max      2.292707e+06
Name: ActiveCases, Length: 8, dtype: float64

In [146]:
# Mean of the column; missing values are left out of the calculation.
winequality.NewCases.mean()

1980.5

In [147]:
# Array of the distinct values in the column (NaN appears as one of them).
winequality.ActiveCases.unique()

array([2.292707e+06, 7.712580e+05, 6.063870e+05, 1.809310e+05,
       1.412640e+05, 1.033250e+05, 1.246480e+05, 1.661400e+04,
       1.534160e+05,          nan, 2.467800e+04, 3.408200e+04,
       1.977000e+04, 1.025210e+05, 1.269400e+04, 1.092100e+04,
       1.240920e+05, 9.758000e+03, 8.286100e+04, 3.441700e+04,
       5.047300e+04, 3.758700e+04, 6.489000e+03, 3.083000e+03,
       2.601300e+04, 4.115700e+04, 1.334200e+04, 5.558500e+04,
       9.311000e+03, 2.555600e+04, 3.246500e+04, 3.475100e+04,
       2.418600e+04, 4.363800e+04, 7.966000e+03, 4.167000e+03,
       5.752000e+03, 2.633700e+04, 6.497000e+03, 1.015000e+04,
       1.247800e+04, 1.209900e+04, 1.188400e+04, 3.755900e+04,
       2.788000e+03, 1.236000e+04, 7.491000e+03, 3.059000e+03,
       7.113000e+03, 2.523000e+03, 9.115000e+03, 3.493000e+03,
       8.642000e+03, 8.849000e+03, 1.266400e+04, 7.124000e+03,
       1.240000e+03, 1.356800e+04, 9.958000e+03, 6.296000e+03,
       1.381000e+03, 1.383200e+04, 1.150800e+04, 8.6940

In [152]:
# How many rows carry each distinct Continent value, most frequent first.
winequality.Continent.value_counts()

Africa               57
Asia                 48
                     ..
South America        14
Australia/Oceania     6
Name: Continent, Length: 6, dtype: int64

### Maps

In [154]:
# map calls the function once per value and returns a new Series of the results;
# `winequality` itself is not changed. Rows whose NewCases is missing stay NaN.
winequality_NewCases_mean = winequality.NewCases.mean()
winequality.NewCases.map(lambda p: p - winequality_NewCases_mean)

0     NaN
1     NaN
       ..
207   NaN
208   NaN
Name: NewCases, Length: 209, dtype: float64

In [156]:
def remean_points(row):
    """Return `row` with its NewCases value re-centred on the column mean.

    Meant to be passed to ``DataFrame.apply(..., axis='columns')``, which
    hands each row to the function as a Series.

    Reads the module-level ``winequality_NewCases_mean`` and subtracts it
    from ``row.NewCases``. The assignment is made on the row object that is
    passed in, and that same object is returned.
    """
    row.NewCases = row.NewCases - winequality_NewCases_mean
    return row

# Pass the function defined above. The cell previously referenced
# `remean_newcases`, a name not defined in the visible cells, which raises
# NameError on a fresh kernel.
winequality.apply(remean_points, axis='columns')

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
0,USA,North America,331198130.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas,209,everyone
1,Brazil,South America,212710692.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas,208,everyone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Vatican City,Europe,801.0,12,,,,12.0,,0.0,,14981.0,,,,Europe,2,everyone
208,Western Sahara,Africa,598682.0,10,,1.0,,8.0,,1.0,,17.0,2.0,,,Africa,1,everyone


In [157]:
winequality.head(1)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,index_backwards,critic
0,USA,North America,331198130.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas,209,everyone


In [159]:
# Subtracting a scalar from a Series applies the operation to every element,
# so no map/apply call is needed for this kind of transformation.
winequality_ActiveCases_mean = winequality.ActiveCases.mean()
winequality.ActiveCases - winequality_ActiveCases_mean

0      2.265043e+06
1      7.435937e+05
           ...     
207   -2.766433e+04
208   -2.766333e+04
Name: ActiveCases, Length: 209, dtype: float64

In [171]:
# winequality.Continent + " - " + winequality.NewCases

## Regroupement et tri

In [173]:
# Limit displayed output to 5 rows, using the fully qualified option name.
pd.set_option("display.max_rows", 5)

In [174]:
# Group the rows by Continent and count the Continent entries in each group.
winequality.groupby('Continent').Continent.count()

Continent
Africa           57
Asia             48
                 ..
North America    35
South America    14
Name: Continent, Length: 6, dtype: int64

In [185]:
# Smallest NewCases value per continent; the result is NaN for a group
# that has no NewCases values at all.
winequality.groupby('Continent').NewCases.min()

Continent
Africa              NaN
Asia               20.0
                  ...  
North America      30.0
South America    1282.0
Name: NewCases, Length: 6, dtype: float64

In [188]:
# apply calls the function once per group with that group's sub-frame;
# here it returns the first ActiveCases value found in each group.
winequality.groupby('Continent').apply(lambda df: df.ActiveCases.iloc[0])

Continent
Africa            141264.0
Asia              606387.0
                   ...    
North America    2292707.0
South America     771258.0
Length: 6, dtype: float64

In [190]:
# winequality.groupby(['Population', 'Continent']).apply(lambda df: df.loc[df.NewCases.idxmax()])