In [11]:
import pandas as pd
import numpy as np

### unique values

In [12]:
df = pd.DataFrame({'market': ['A', 'B', 'A', 'B', 'A', 'B'], 
                   'fruits': ['apple', 'apple', 'orange', 'orange', 'guava', 'guava'],
                   'price': [100, 120, 80, 75, 55, 70]})

df

Unnamed: 0,market,fruits,price
0,A,apple,100
1,B,apple,120
2,A,orange,80
3,B,orange,75
4,A,guava,55
5,B,guava,70


In [13]:
df['market'].nunique()  # number of unique values

2

In [14]:
df['fruits'].nunique()

3

In [15]:
df['market'].unique()   # array of unique values

array(['A', 'B'], dtype=object)

In [16]:
df['fruits'].unique()

array(['apple', 'orange', 'guava'], dtype=object)

### iterrows() and items()

`iterrows()`: This method iterates over the rows of a DataFrame, returning an iterator that yields pairs of (index, row) for each row. The index is the row label, and the row is a Series object containing the data for that row.

In [17]:
for index, row in df.iterrows():
    print(index, row['market'], row['fruits'], row['price'])

0 A apple 100
1 B apple 120
2 A orange 80
3 B orange 75
4 A guava 55
5 B guava 70


`items()`: This method iterates over the columns of a DataFrame, returning an iterator that yields pairs of (column name, column data) for each column. The column name is a string, and the column data is a Series object containing the data for that column.

`iteritems()`: This was depricated in pandas 2.0

In [18]:
for column_name, column_data in df.items():
    print(column_name, column_data.values)

market ['A' 'B' 'A' 'B' 'A' 'B']
fruits ['apple' 'apple' 'orange' 'orange' 'guava' 'guava']
price [100 120  80  75  55  70]


### value_counts()

The `value_counts()` method operates on a Series. It returns a Series containing the count of unique values in the original Series. By default it sorts the result in descending order and show the most frequent values first.

In [19]:
data = pd.DataFrame({'name': ['A', 'B', 'C', 'D', 'A', 'C', 'C', 'D', 'C', 'A'],
                     'values': [1, 3, 5, 2, 5, 4, 1, 6, 7, 8]})

data

Unnamed: 0,name,values
0,A,1
1,B,3
2,C,5
3,D,2
4,A,5
5,C,4
6,C,1
7,D,6
8,C,7
9,A,8


In [20]:
data['name'].value_counts()

name
C    4
A    3
D    2
B    1
Name: count, dtype: int64

In [21]:
data['name'].value_counts(normalize=True)  # normalize by the sum of the series values

name
C    0.4
A    0.3
D    0.2
B    0.1
Name: proportion, dtype: float64

### Operations on DataFrame

#### Copying the DataFrame

In [22]:
covid_data = pd.read_csv("../Class-07-03-05-2025-python_for_data_science_part-2/worldometer_coronavirus_summary_data.csv")

covid_data.head()

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,1124.0,4420,190.0,951337.0,23455.0,40560636
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,6.0,5865,152.0,230861.0,5093.0,45325517
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495
4,Angola,Africa,99194,1900.0,97149.0,145.0,,2853,55.0,1499795.0,43136.0,34769277


In [23]:
covid_data_copy = covid_data.copy()

#### Arithmatic operations

Suppose we want to create a column with percentage of total world population 

In [24]:
total_population = sum(covid_data['population'])

total_population

7899878348

In [25]:
population_pct = (100 * covid_data['population'] / total_population)

population_pct

0      0.513434
1      0.036354
2      0.573750
3      0.000981
4      0.440124
         ...   
221    0.000138
222    0.007907
223    0.393032
224    0.244844
225    0.193242
Name: population, Length: 226, dtype: float64

In [26]:
sum(population_pct)

100.0

In [27]:
covid_data_copy['population_pct'] = population_pct

covid_data_copy

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,1124.0,4420,190.0,951337.0,23455.0,40560636,0.513434
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945,0.036354
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,6.0,5865,152.0,230861.0,5093.0,45325517,0.573750
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495,0.000981
4,Angola,Africa,99194,1900.0,97149.0,145.0,,2853,55.0,1499795.0,43136.0,34769277,0.440124
...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,454,7.0,438.0,9.0,,41755,644.0,20508.0,1886140.0,10873,0.000138
222,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681,0.007907
223,Yemen,Asia,11819,2149.0,9009.0,661.0,23.0,381,69.0,265253.0,8543.0,31049015,0.393032
224,Zambia,Africa,320591,3983.0,315997.0,611.0,,16575,206.0,3452554.0,178497.0,19342381,0.244844


In [28]:
covid_data_copy[covid_data_copy['country'] == 'China']

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct
44,China,Asia,221804,5209.0,210454.0,6141.0,383.0,154,4.0,160000000.0,111163.0,1439323776,18.219569


In [31]:
covid_data_copy[covid_data_copy['country'].isin(['China','India', 'USA'])]

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct
44,China,Asia,221804,5209.0,210454.0,6141.0,383.0,154,4.0,160000000.0,111163.0,1439323776,18.219569
94,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,843836900.0,600479.0,1405273033,17.78854
216,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1016883000.0,3038939.0,334617623,4.235731


In [34]:
covid_data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country                         226 non-null    object 
 1   continent                       226 non-null    object 
 2   total_confirmed                 226 non-null    int64  
 3   total_deaths                    218 non-null    float64
 4   total_recovered                 204 non-null    float64
 5   active_cases                    204 non-null    float64
 6   serious_or_critical             145 non-null    float64
 7   total_cases_per_1m_population   226 non-null    int64  
 8   total_deaths_per_1m_population  218 non-null    float64
 9   total_tests                     212 non-null    float64
 10  total_tests_per_1m_population   212 non-null    float64
 11  population                      226 non-null    int64  
 12  population_pct                  226 

In [37]:
covid_data_copy['total_confirmed_per_1m_population'] = np.round(1000000*covid_data_copy['total_confirmed']/covid_data_copy['population'], 2)

In [38]:
covid_data_copy[['country','total_confirmed','population','total_confirmed_per_1m_population']]

Unnamed: 0,country,total_confirmed,population,total_confirmed_per_1m_population
0,Afghanistan,179267,40560636,4419.73
1,Albania,275574,2871945,95953.79
2,Algeria,265816,45325517,5864.60
3,Andorra,42156,77495,543983.48
4,Angola,99194,34769277,2852.92
...,...,...,...,...
221,Wallis And Futuna Islands,454,10873,41754.81
222,Western Sahara,10,624681,16.01
223,Yemen,11819,31049015,380.66
224,Zambia,320591,19342381,16574.54


####  Sorting

Ascending order sorting

In [39]:
covid_data_copy.sort_values(by='total_confirmed')

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
168,Saint Helena,Africa,2,,2.0,0.0,,327,,,,6111,0.000077,327.28
132,Micronesia,Australia/Oceania,7,,1.0,6.0,,60,,,,117269,0.001484,59.69
142,Nauru,Australia/Oceania,8,,5.0,3.0,,731,,,,10951,0.000139,730.53
150,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48
222,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681,0.007907,16.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,1.223324e+08,1451450.0,84282880,1.066888,305877.37
72,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,2.714902e+08,4142201.0,65542502,0.829665,444914.39
26,Brazil,South America,30682094,664920.0,29718402.0,298772.0,8318.0,142460,3087.0,6.377617e+07,296119.0,215373503,2.726289,142459.93
94,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57


Descending order sorting

In [40]:
covid_data_copy.sort_values(by='total_confirmed',ascending=False)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
216,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1.016883e+09,3038939.0,334617623,4.235731,251658.81
94,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57
26,Brazil,South America,30682094,664920.0,29718402.0,298772.0,8318.0,142460,3087.0,6.377617e+07,296119.0,215373503,2.726289,142459.93
72,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,2.714902e+08,4142201.0,65542502,0.829665,444914.39
78,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,1.223324e+08,1451450.0,84282880,1.066888,305877.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681,0.007907,16.01
150,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48
142,Nauru,Australia/Oceania,8,,5.0,3.0,,731,,,,10951,0.000139,730.53
132,Micronesia,Australia/Oceania,7,,1.0,6.0,,60,,,,117269,0.001484,59.69


In [41]:
covid_data_copy.sort_values(by='total_confirmed_per_1m_population',ascending=False)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
68,Faeroe Islands,Europe,34658,28.0,7693.0,26937.0,5.0,704302,569.0,778000.0,15810116.0,49209,0.000623,704302.06
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495,0.000981,543983.48
93,Iceland,Europe,186545,120.0,,,,540134,347.0,2264004.0,6555338.0,345368,0.004372,540134.00
80,Gibraltar,Europe,18129,102.0,16579.0,1448.0,,538400,3029.0,534283.0,15867278.0,33672,0.000426,538399.86
57,Denmark,Europe,2976667,6287.0,2960386.0,9994.0,12.0,510561,1078.0,127345763.0,21842472.0,5830190,0.073801,510560.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Marshall Islands,Australia/Oceania,17,,14.0,3.0,,284,,,,59934,0.000759,283.65
44,China,Asia,221804,5209.0,210454.0,6141.0,383.0,154,4.0,160000000.0,111163.0,1439323776,18.219569,154.10
43,China Macao Sar,Asia,82,,82.0,0.0,,123,,5375.0,8073.0,665819,0.008428,123.16
132,Micronesia,Australia/Oceania,7,,1.0,6.0,,60,,,,117269,0.001484,59.69


In [42]:
covid_data_copy.sort_values(by='total_confirmed',ascending=False).reset_index()

Unnamed: 0,index,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
0,216,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1.016883e+09,3038939.0,334617623,4.235731,251658.81
1,94,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57
2,26,Brazil,South America,30682094,664920.0,29718402.0,298772.0,8318.0,142460,3087.0,6.377617e+07,296119.0,215373503,2.726289,142459.93
3,72,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,2.714902e+08,4142201.0,65542502,0.829665,444914.39
4,78,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,1.223324e+08,1451450.0,84282880,1.066888,305877.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,222,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681,0.007907,16.01
222,150,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48
223,142,Nauru,Australia/Oceania,8,,5.0,3.0,,731,,,,10951,0.000139,730.53
224,132,Micronesia,Australia/Oceania,7,,1.0,6.0,,60,,,,117269,0.001484,59.69


In [43]:
covid_data_copy.sort_values(by='total_confirmed',ascending=False).reset_index(drop=True)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
0,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1.016883e+09,3038939.0,334617623,4.235731,251658.81
1,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57
2,Brazil,South America,30682094,664920.0,29718402.0,298772.0,8318.0,142460,3087.0,6.377617e+07,296119.0,215373503,2.726289,142459.93
3,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,2.714902e+08,4142201.0,65542502,0.829665,444914.39
4,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,1.223324e+08,1451450.0,84282880,1.066888,305877.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681,0.007907,16.01
222,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48
223,Nauru,Australia/Oceania,8,,5.0,3.0,,731,,,,10951,0.000139,730.53
224,Micronesia,Australia/Oceania,7,,1.0,6.0,,60,,,,117269,0.001484,59.69


In [45]:
covid_data_copy.sort_values(by=['population','total_confirmed'])

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
90,Holy See,Europe,29,,29.0,0.0,,36025,,,,805,0.000010,36024.84
150,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48
69,Falkland Islands Malvinas,South America,1126,,,,,306896,,8.632000e+03,2352685.0,3669,0.000046,306895.61
137,Montserrat,North America,747,2.0,390.0,355.0,,149460,400.0,1.123800e+04,2248499.0,4998,0.000063,149459.78
172,Saint Pierre And Miquelon,North America,2739,1.0,2449.0,289.0,1.0,477095,174.0,2.368700e+04,4125936.0,5741,0.000073,477094.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,Pakistan,Asia,1529249,30376.0,1494141.0,4732.0,109.0,6681,133.0,2.835045e+07,123867.0,228878790,2.897244,6681.48
95,Indonesia,Asia,6050776,156458.0,5889534.0,4784.0,2771.0,21694,561.0,9.724539e+07,348662.0,278910317,3.530565,21694.34
216,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1.016883e+09,3038939.0,334617623,4.235731,251658.81
94,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57


In [46]:
# Sort the dataframe using population percentage in descending order and reset index with drop=True

new_df = covid_data_copy.sort_values(by="population_pct", ascending=False).reset_index(drop=True)

new_df

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
0,China,Asia,221804,5209.0,210454.0,6141.0,383.0,154,4.0,1.600000e+08,111163.0,1439323776,18.219569,154.10
1,India,Asia,43121599,524214.0,42579693.0,17692.0,698.0,30686,373.0,8.438369e+08,600479.0,1405273033,17.788540,30685.57
2,USA,North America,84209473,1026646.0,81244260.0,1938567.0,1941.0,251659,3068.0,1.016883e+09,3038939.0,334617623,4.235731,251658.81
3,Indonesia,Asia,6050776,156458.0,5889534.0,4784.0,2771.0,21694,561.0,9.724539e+07,348662.0,278910317,3.530565,21694.34
4,Pakistan,Asia,1529249,30376.0,1494141.0,4732.0,109.0,6681,133.0,2.835045e+07,123867.0,228878790,2.897244,6681.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Saint Pierre And Miquelon,North America,2739,1.0,2449.0,289.0,1.0,477095,174.0,2.368700e+04,4125936.0,5741,0.000073,477094.58
222,Montserrat,North America,747,2.0,390.0,355.0,,149460,400.0,1.123800e+04,2248499.0,4998,0.000063,149459.78
223,Falkland Islands Malvinas,South America,1126,,,,,306896,,8.632000e+03,2352685.0,3669,0.000046,306895.61
224,Niue,Australia/Oceania,9,,9.0,0.0,,5464,,,,1647,0.000021,5464.48


In [47]:
new_df.tail()['country'] # Least populated 5 countries

221    Saint Pierre And Miquelon
222                   Montserrat
223    Falkland Islands Malvinas
224                         Niue
225                     Holy See
Name: country, dtype: object

#### Filter

In [48]:
covid_data[['country','continent','population']]

Unnamed: 0,country,continent,population
0,Afghanistan,Asia,40560636
1,Albania,Europe,2871945
2,Algeria,Africa,45325517
3,Andorra,Europe,77495
4,Angola,Africa,34769277
...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,10873
222,Western Sahara,Africa,624681
223,Yemen,Asia,31049015
224,Zambia,Africa,19342381


In [None]:
covid_data_copy.filter(items=['country','continent','population'])

Unnamed: 0,country,continent,population
0,Afghanistan,Asia,40560636
1,Albania,Europe,2871945
2,Algeria,Africa,45325517
3,Andorra,Europe,77495
4,Angola,Africa,34769277
...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,10873
222,Western Sahara,Africa,624681
223,Yemen,Asia,31049015
224,Zambia,Africa,19342381


In [None]:
covid_data_copy.filter(like='population',axis=1)   # By default axis=1

Unnamed: 0,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests_per_1m_population,population,population_pct,total_confirmed_per_1m_population
0,4420,190.0,23455.0,40560636,0.513434,4419.73
1,95954,1218.0,632857.0,2871945,0.036354,95953.79
2,5865,152.0,5093.0,45325517,0.573750,5864.60
3,543983,1974.0,3223924.0,77495,0.000981,543983.48
4,2853,55.0,43136.0,34769277,0.440124,2852.92
...,...,...,...,...,...,...
221,41755,644.0,1886140.0,10873,0.000138,41754.81
222,16,2.0,,624681,0.007907,16.01
223,381,69.0,8543.0,31049015,0.393032,380.66
224,16575,206.0,178497.0,19342381,0.244844,16574.54


#### Query

SQL like queries 

In [55]:
covid_data.query("continent == 'Europe'")

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495
11,Austria,Europe,4212492,18303.0,4135885.0,58304.0,58.0,462804,2011.0,185034905.0,20328801.0,9102106
17,Belarus,Europe,982867,6978.0,,,,104078,739.0,13220483.0,1399951.0,9443535
18,Belgium,Europe,4116397,31613.0,3941350.0,143434.0,117.0,352324,2706.0,33846023.0,2896893.0,11683561
24,Bosnia And Herzegovina,Europe,377511,15778.0,,,,116414,4866.0,1767199.0,544957.0,3242823
29,Bulgaria,Europe,1161504,37045.0,1014302.0,110157.0,54.0,169536,5407.0,9924259.0,1448573.0,6851060
40,Channel Islands,Europe,75275,173.0,74574.0,528.0,,425648,978.0,1252808.0,7084095.0,176848
51,Croatia,Europe,1131450,15930.0,1111405.0,4115.0,15.0,278811,3925.0,4838151.0,1192212.0,4058130
55,Czech Republic,Europe,3915847,40245.0,3871654.0,3948.0,14.0,364401,3745.0,55323981.0,5148342.0,10745981


In [56]:
covid_data.query("continent == 'Europe' and total_confirmed > 5000000")

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
72,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,271490188.0,4142201.0,65542502
78,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,122332384.0,1451450.0,84282880
101,Italy,Europe,17057873,165244.0,15894511.0,998118.0,347.0,282901,2741.0,217853667.0,3613054.0,60296265
144,Netherlands,Europe,8067116,22292.0,7989151.0,55673.0,55.0,468869,1296.0,21107399.0,1226784.0,17205480
160,Poland,Europe,6003436,116207.0,5335112.0,552117.0,981.0,158950,3077.0,36224215.0,959088.0,37769420
165,Russia,Europe,18260293,377670.0,17647179.0,235444.0,2300.0,125027,2586.0,273400000.0,1871949.0,146050996
191,Spain,Europe,12127122,105444.0,11548089.0,473589.0,339.0,259190,2254.0,471036328.0,10067352.0,46788503
212,UK,Europe,22159805,176708.0,21677896.0,305201.0,253.0,323264,2578.0,519264096.0,7574950.0,68550166
213,Ukraine,Europe,5006460,108449.0,,,177.0,115771,2508.0,19521252.0,451415.0,43244553


In [57]:
covid_data.query("continent == 'Europe' and total_confirmed > 5000000").sort_values("total_confirmed",ascending=False).reset_index(drop=True)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,France,Europe,29160802,147257.0,28156674.0,856871.0,1329.0,444914,2247.0,271490188.0,4142201.0,65542502
1,Germany,Europe,25780226,137919.0,23956700.0,1685607.0,1279.0,305877,1636.0,122332384.0,1451450.0,84282880
2,UK,Europe,22159805,176708.0,21677896.0,305201.0,253.0,323264,2578.0,519264096.0,7574950.0,68550166
3,Russia,Europe,18260293,377670.0,17647179.0,235444.0,2300.0,125027,2586.0,273400000.0,1871949.0,146050996
4,Italy,Europe,17057873,165244.0,15894511.0,998118.0,347.0,282901,2741.0,217853667.0,3613054.0,60296265
5,Spain,Europe,12127122,105444.0,11548089.0,473589.0,339.0,259190,2254.0,471036328.0,10067352.0,46788503
6,Netherlands,Europe,8067116,22292.0,7989151.0,55673.0,55.0,468869,1296.0,21107399.0,1226784.0,17205480
7,Poland,Europe,6003436,116207.0,5335112.0,552117.0,981.0,158950,3077.0,36224215.0,959088.0,37769420
8,Ukraine,Europe,5006460,108449.0,,,177.0,115771,2508.0,19521252.0,451415.0,43244553


**Quick challenge:**

select all the columns in the dataframe which doesn't contain `total` in its name

In [58]:
s = covid_data_copy.columns

s.str.contains('total')

array([False, False,  True,  True,  True, False, False,  True,  True,
        True,  True, False, False,  True])

In [59]:
~s.str.contains('total')

array([ True,  True, False, False, False,  True,  True, False, False,
       False, False,  True,  True, False])

In [60]:
s[~s.str.contains('total')]

Index(['country', 'continent', 'active_cases', 'serious_or_critical',
       'population', 'population_pct'],
      dtype='object')

In [61]:
covid_data_copy.filter(items=s[~s.str.contains('total')])

Unnamed: 0,country,continent,active_cases,serious_or_critical,population,population_pct
0,Afghanistan,Asia,9375.0,1124.0,40560636,0.513434
1,Albania,Europe,251.0,2.0,2871945,0.036354
2,Algeria,Africa,80570.0,6.0,45325517,0.573750
3,Andorra,Europe,982.0,14.0,77495,0.000981
4,Angola,Africa,145.0,,34769277,0.440124
...,...,...,...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,9.0,,10873,0.000138
222,Western Sahara,Africa,0.0,,624681,0.007907
223,Yemen,Asia,661.0,23.0,31049015,0.393032
224,Zambia,Africa,611.0,,19342381,0.244844


#### Apply and Map

`apply():`
This function can work on both DataFrames and Series. When used on a DataFrame, it applies a function along an axis (rows or columns).

`map():`
While map() is primarily designed for Series, it can be used on a DataFrame in a limited way. When applied to a DataFrame, it operates element-wise


In [62]:
def square(x):
  return x**2

In [63]:
df = pd.DataFrame(np.array([[1,10,15],[1,5,1],[2,19,0],[8,5,4]]),columns=list('abc'))

df

Unnamed: 0,a,b,c
0,1,10,15
1,1,5,1
2,2,19,0
3,8,5,4


In [64]:
df.map(square)

Unnamed: 0,a,b,c
0,1,100,225
1,1,25,1
2,4,361,0
3,64,25,16


In [65]:
df.map(lambda x: x**2)

Unnamed: 0,a,b,c
0,1,100,225
1,1,25,1
2,4,361,0
3,64,25,16


In [66]:
df.apply(square)  # Applies the function square to the every element of the dataframe

Unnamed: 0,a,b,c
0,1,100,225
1,1,25,1
2,4,361,0
3,64,25,16


In [67]:
df.apply(lambda x: x**2)

Unnamed: 0,a,b,c
0,1,100,225
1,1,25,1
2,4,361,0
3,64,25,16


In [68]:
df.map(lambda x: np.sin(x)+np.cos(x))

Unnamed: 0,a,b,c
0,1.381773,-1.383093,-0.1094
1,1.381773,-0.675262,1.381773
2,0.493151,1.138582,1.0
3,0.843858,-0.675262,-1.410446


In [69]:
df.apply(lambda x: np.sin(x)+np.cos(x))

Unnamed: 0,a,b,c
0,1.381773,-1.383093,-0.1094
1,1.381773,-0.675262,1.381773
2,0.493151,1.138582,1.0
3,0.843858,-0.675262,-1.410446


In [70]:
df

Unnamed: 0,a,b,c
0,1,10,15
1,1,5,1
2,2,19,0
3,8,5,4


In [71]:
df.apply(max) # by default the axis is 0 -> column wise

a     8
b    19
c    15
dtype: int64

In [72]:
df.apply(min) # by default the axis is 0 -> column wise

a    1
b    5
c    0
dtype: int64

In [73]:
df.map(max)

TypeError: 'int' object is not iterable

In [74]:
df.apply(lambda f: f.max()-f.min())   # by default the axis is 0 -> column wise

a     7
b    14
c    15
dtype: int64

In [75]:
df

Unnamed: 0,a,b,c
0,1,10,15
1,1,5,1
2,2,19,0
3,8,5,4


In [76]:
df.apply(lambda f: f.max()-f.min(),axis=1)

0    14
1     4
2    19
3     4
dtype: int64

#### Groupby

A groupby operation involves some combination of the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups.

In [77]:
covid_data_select = covid_data[['continent','total_confirmed','total_deaths','total_recovered']]

In [78]:
covid_data_select

Unnamed: 0,continent,total_confirmed,total_deaths,total_recovered
0,Asia,179267,7690.0,162202.0
1,Europe,275574,3497.0,271826.0
2,Africa,265816,6875.0,178371.0
3,Europe,42156,153.0,41021.0
4,Africa,99194,1900.0,97149.0
...,...,...,...,...
221,Australia/Oceania,454,7.0,438.0
222,Africa,10,1.0,9.0
223,Asia,11819,2149.0,9009.0
224,Africa,320591,3983.0,315997.0


In [79]:
covid_data_select.groupby(['continent'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022E845F9B50>

In [80]:
covid_data_select.groupby(['continent']).sum() # This .sum() is called the aggregate function.

Unnamed: 0_level_0,total_confirmed,total_deaths,total_recovered
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,12042400,254319.0,10137200.0
Asia,149999659,1427939.0,126145273.0
Australia/Oceania,7942867,11413.0,7403813.0
Europe,194330079,1830655.0,170861871.0
North America,99625662,1467234.0,94818163.0
South America,57136485,1296523.0,51031313.0


In [81]:
def population_bracket(x):
    if x > 10000000:
        return "more than 10m"
    elif x > 5000000:
        return "between 5 to 10m"
    elif x > 1000000:
        return "between 1 to 5m"
    else:
        return "less than 1m"

In [82]:
population_bracket(1345727819)

'more than 10m'

In [83]:
covid_data.columns

Index(['country', 'continent', 'total_confirmed', 'total_deaths',
       'total_recovered', 'active_cases', 'serious_or_critical',
       'total_cases_per_1m_population', 'total_deaths_per_1m_population',
       'total_tests', 'total_tests_per_1m_population', 'population'],
      dtype='object')

In [86]:
covid_data_select = covid_data[['population', 'total_confirmed', 'total_deaths']]

covid_data_select['population_bkt'] = covid_data_select['population'].apply(population_bracket)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_data_select['population_bkt'] = covid_data_select['population'].apply(population_bracket)


In [87]:
covid_data_select

Unnamed: 0,population,total_confirmed,total_deaths,population_bkt
0,40560636,179267,7690.0,more than 10m
1,2871945,275574,3497.0,between 1 to 5m
2,45325517,265816,6875.0,more than 10m
3,77495,42156,153.0,less than 1m
4,34769277,99194,1900.0,more than 10m
...,...,...,...,...
221,10873,454,7.0,less than 1m
222,624681,10,1.0,less than 1m
223,31049015,11819,2149.0,more than 10m
224,19342381,320591,3983.0,more than 10m


In [89]:
covid_data_select.groupby(['population_bkt']).sum()

Unnamed: 0_level_0,population,total_confirmed,total_deaths
population_bkt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
between 1 to 5m,91056050,13502658,148078.0
between 5 to 10m,219366560,35125138,265588.0
less than 1m,16758300,3151738,18336.0
more than 10m,7572697438,469297618,5856081.0


#### Merge

`merge()`: Combine two Series or DataFrame objects with SQL-style joining. Default is `inner join`.

In [90]:
employee = pd.DataFrame({'EmpId': [1,2,3,4,7], 'EmpName': ['Sachin', 'Sourav', 'Zaheer', 'Harbhajan', 'Rishika'], 
                         'location': ['Mumbai', 'Kolkata', 'Nagpur', 'Amritsar','Bangalore']})

employee

Unnamed: 0,EmpId,EmpName,location
0,1,Sachin,Mumbai
1,2,Sourav,Kolkata
2,3,Zaheer,Nagpur
3,4,Harbhajan,Amritsar
4,7,Rishika,Bangalore


In [91]:
manager = pd.DataFrame({'EmpId': [1,2,3,4,5,6], 'Manager_Name': ['Kapil', 'Ravi', 'Srinath', 'Prasanna','Padmini','Babu']})

manager

Unnamed: 0,EmpId,Manager_Name
0,1,Kapil
1,2,Ravi
2,3,Srinath
3,4,Prasanna
4,5,Padmini
5,6,Babu


![](https://miro.medium.com/max/1200/1*9eH1_7VbTZPZd9jBiGIyNA.png)

In [92]:
pd.merge(employee, manager, on='EmpId')   # By default how = 'inner' -> inner join

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna


In [93]:
pd.merge(employee, manager, on='EmpId', how='inner')

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna


In [94]:
employee.merge(manager,on='EmpId')

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna


In [95]:
pd.merge(employee, manager, on='EmpId', how='left')

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna
4,7,Rishika,Bangalore,


In [96]:
pd.merge(employee, manager, on='EmpId', how='right')

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna
4,5,,,Padmini
5,6,,,Babu


In [97]:
pd.merge(employee, manager, on='EmpId', how='outer')

Unnamed: 0,EmpId,EmpName,location,Manager_Name
0,1,Sachin,Mumbai,Kapil
1,2,Sourav,Kolkata,Ravi
2,3,Zaheer,Nagpur,Srinath
3,4,Harbhajan,Amritsar,Prasanna
4,5,,,Padmini
5,6,,,Babu
6,7,Rishika,Bangalore,


In [98]:
pd.merge(employee, manager, how='cross')

Unnamed: 0,EmpId_x,EmpName,location,EmpId_y,Manager_Name
0,1,Sachin,Mumbai,1,Kapil
1,1,Sachin,Mumbai,2,Ravi
2,1,Sachin,Mumbai,3,Srinath
3,1,Sachin,Mumbai,4,Prasanna
4,1,Sachin,Mumbai,5,Padmini
5,1,Sachin,Mumbai,6,Babu
6,2,Sourav,Kolkata,1,Kapil
7,2,Sourav,Kolkata,2,Ravi
8,2,Sourav,Kolkata,3,Srinath
9,2,Sourav,Kolkata,4,Prasanna


In [99]:
manager = pd.DataFrame({'emp_id': [1,2,3,4,5,6], 'Manager_Name': ['Kapil', 'Ravi', 'Srinath', 'Prasanna','Padmini','Babu']})

manager

Unnamed: 0,emp_id,Manager_Name
0,1,Kapil
1,2,Ravi
2,3,Srinath
3,4,Prasanna
4,5,Padmini
5,6,Babu


In [101]:
employee

Unnamed: 0,EmpId,EmpName,location
0,1,Sachin,Mumbai
1,2,Sourav,Kolkata
2,3,Zaheer,Nagpur
3,4,Harbhajan,Amritsar
4,7,Rishika,Bangalore


In [100]:
pd.merge(employee, manager)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [102]:
pd.merge(employee, manager, left_on='EmpId', right_on='emp_id')

Unnamed: 0,EmpId,EmpName,location,emp_id,Manager_Name
0,1,Sachin,Mumbai,1,Kapil
1,2,Sourav,Kolkata,2,Ravi
2,3,Zaheer,Nagpur,3,Srinath
3,4,Harbhajan,Amritsar,4,Prasanna


In [103]:
students = pd.DataFrame({'Name': ['Michael', 'Jonty', 'Michael', 'Roger', 'Lionel'], 
                         'Surname': ['Jordan', 'Rhodes', 'Jackson', 'Federer', 'Messi'],
                         'Ages': [15, 17, 12, 9, 10],
                         })

coaches = pd.DataFrame({'Name': ['Michael', 'Rafael', 'Lionel', 'Michael'], 
                         'Surname': ['Jordan', 'Nadal', 'Messi','Jackson'],
                         'Coach Name': ['Rajkumar Rao', 'Pankaj Tripathi', 'Viv Richards', 'Amir Khan'],
                         'Coach Salary': [17000, 25000, 20000, 16000]
                         })

In [104]:
students

Unnamed: 0,Name,Surname,Ages
0,Michael,Jordan,15
1,Jonty,Rhodes,17
2,Michael,Jackson,12
3,Roger,Federer,9
4,Lionel,Messi,10


In [105]:
coaches

Unnamed: 0,Name,Surname,Coach Name,Coach Salary
0,Michael,Jordan,Rajkumar Rao,17000
1,Rafael,Nadal,Pankaj Tripathi,25000
2,Lionel,Messi,Viv Richards,20000
3,Michael,Jackson,Amir Khan,16000


In [107]:
pd.merge(students, coaches, on=['Name','Surname'])   # joining on multiple keys

Unnamed: 0,Name,Surname,Ages,Coach Name,Coach Salary
0,Michael,Jordan,15,Rajkumar Rao,17000
1,Michael,Jackson,12,Amir Khan,16000
2,Lionel,Messi,10,Viv Richards,20000


#### Join

`join()`: primarily joins the DataFrames based on their indexes. By default, it performs a left join, keeping all rows from the left DataFrame and matching rows from the right DataFrame based on index values.

In [108]:
dfA = pd.DataFrame({'keyA': ['a', 'b', 'c', 'd', 'e', 'f'], 'values_A': [10, 20, 30, 40 ,50, 60]})

dfB = pd.DataFrame({'keyB': ['a', 'b', 'c'], 'values_B': [111, 222, 333]})

In [109]:
dfA

Unnamed: 0,keyA,values_A
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50
5,f,60


In [110]:
dfB

Unnamed: 0,keyB,values_B
0,a,111
1,b,222
2,c,333


In [111]:
dfA.join(dfB)

Unnamed: 0,keyA,values_A,keyB,values_B
0,a,10,a,111.0
1,b,20,b,222.0
2,c,30,c,333.0
3,d,40,,
4,e,50,,
5,f,60,,


In [112]:
dfA.join(dfB, how='right')

Unnamed: 0,keyA,values_A,keyB,values_B
0,a,10,a,111
1,b,20,b,222
2,c,30,c,333


In [113]:
dfA.join(dfB, how='outer')

Unnamed: 0,keyA,values_A,keyB,values_B
0,a,10,a,111.0
1,b,20,b,222.0
2,c,30,c,333.0
3,d,40,,
4,e,50,,
5,f,60,,


In [114]:
dfA.join(dfB, how='inner')

Unnamed: 0,keyA,values_A,keyB,values_B
0,a,10,a,111
1,b,20,b,222
2,c,30,c,333


In [115]:
dfA = pd.DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], 'values_A': [10, 20, 30, 40 ,50, 60]})

dfB = pd.DataFrame({'key': ['a', 'b', 'c'], 'values_B': [111, 222, 333]})

In [116]:
dfA

Unnamed: 0,key,values_A
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50
5,f,60


In [117]:
dfB

Unnamed: 0,key,values_B
0,a,111
1,b,222
2,c,333


In [118]:
dfA.join(dfB, on='key')

ValueError: You are trying to merge on object and int64 columns for key 'key'. If you wish to proceed you should use pd.concat

In [119]:
dfA.join(dfB, lsuffix='_A', rsuffix='_B')

Unnamed: 0,key_A,values_A,key_B,values_B
0,a,10,a,111.0
1,b,20,b,222.0
2,c,30,c,333.0
3,d,40,,
4,e,50,,
5,f,60,,


In [120]:
dfA.set_index('key')

Unnamed: 0_level_0,values_A
key,Unnamed: 1_level_1
a,10
b,20
c,30
d,40
e,50
f,60


In [121]:
dfA.set_index('key').join(dfB.set_index('key'))

Unnamed: 0_level_0,values_A,values_B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,10,111.0
b,20,222.0
c,30,333.0
d,40,
e,50,
f,60,


#### Concat

With concatenation, your datasets are just stitched together along an axis — either the row axis or column axis.

Concatenation along rows           |  Concatenation along columns
:-------------------------:|:-------------------------:
![](https://files.realpython.com/media/concat_axis0.2ec65b5f72bc.png)  |  ![](https://files.realpython.com/media/concat_col.a8eec2b4e84f.png)

In [123]:
df1 = pd.DataFrame(np.array([[1, 6, 8],[5, 4, 2],[3, 9, 4],[7, 2, 0]]), columns=['a', 'b', 'c'])

df2 = pd.DataFrame(np.array([[3, 4, 8, 2],[0, 1, 4, 5],[7, 5, 2, 1]]), columns=['b','c','d','e'])

In [124]:
df1

Unnamed: 0,a,b,c
0,1,6,8
1,5,4,2
2,3,9,4
3,7,2,0


In [125]:
df2

Unnamed: 0,b,c,d,e
0,3,4,8,2
1,0,1,4,5
2,7,5,2,1


In [127]:
df = pd.concat([df1,df2],axis=0)  # Concatenation along rows 

In [128]:
df

Unnamed: 0,a,b,c,d,e
0,1.0,6,8,,
1,5.0,4,2,,
2,3.0,9,4,,
3,7.0,2,0,,
0,,3,4,8.0,2.0
1,,0,1,4.0,5.0
2,,7,5,2.0,1.0


In [130]:
df.iloc[1,:]

a    5.0
b    4.0
c    2.0
d    NaN
e    NaN
Name: 1, dtype: float64

In [131]:
pd.concat([df1,df2],axis=1)  # Concatenation along columns

Unnamed: 0,a,b,c,b.1,c.1,d,e
0,1,6,8,3.0,4.0,8.0,2.0
1,5,4,2,0.0,1.0,4.0,5.0
2,3,9,4,7.0,5.0,2.0,1.0
3,7,2,0,,,,


In [132]:
pd.concat([df1,df2],axis=1)['b']  # Creates problems

Unnamed: 0,b,b.1
0,6,3.0
1,4,0.0
2,9,7.0
3,2,


In [None]:
pd.concat([df1,df2])  # By default the axis is 0

### Missing value and Duplicate value treatment

In [133]:
covid_data

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,1124.0,4420,190.0,951337.0,23455.0,40560636
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,6.0,5865,152.0,230861.0,5093.0,45325517
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495
4,Angola,Africa,99194,1900.0,97149.0,145.0,,2853,55.0,1499795.0,43136.0,34769277
...,...,...,...,...,...,...,...,...,...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,454,7.0,438.0,9.0,,41755,644.0,20508.0,1886140.0,10873
222,Western Sahara,Africa,10,1.0,9.0,0.0,,16,2.0,,,624681
223,Yemen,Asia,11819,2149.0,9009.0,661.0,23.0,381,69.0,265253.0,8543.0,31049015
224,Zambia,Africa,320591,3983.0,315997.0,611.0,,16575,206.0,3452554.0,178497.0,19342381


In [134]:
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country                         226 non-null    object 
 1   continent                       226 non-null    object 
 2   total_confirmed                 226 non-null    int64  
 3   total_deaths                    218 non-null    float64
 4   total_recovered                 204 non-null    float64
 5   active_cases                    204 non-null    float64
 6   serious_or_critical             145 non-null    float64
 7   total_cases_per_1m_population   226 non-null    int64  
 8   total_deaths_per_1m_population  218 non-null    float64
 9   total_tests                     212 non-null    float64
 10  total_tests_per_1m_population   212 non-null    float64
 11  population                      226 non-null    int64  
dtypes: float64(7), int64(3), object(2)
m

In [135]:
covid_data.isnull().sum()  # How many null values are there in each column

country                            0
continent                          0
total_confirmed                    0
total_deaths                       8
total_recovered                   22
active_cases                      22
serious_or_critical               81
total_cases_per_1m_population      0
total_deaths_per_1m_population     8
total_tests                       14
total_tests_per_1m_population     14
population                         0
dtype: int64

In [136]:
covid_data.dropna()  # Removes rows containig missing data (NaN)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,1124.0,4420,190.0,951337.0,23455.0,40560636
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,6.0,5865,152.0,230861.0,5093.0,45325517
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495
5,Anguilla,North America,2984,9.0,2916.0,59.0,4.0,195646,590.0,51382.0,3368870.0,15252
...,...,...,...,...,...,...,...,...,...,...,...,...
217,Uzbekistan,Asia,238802,1637.0,236974.0,191.0,23.0,6947,48.0,1377915.0,40088.0,34372515
219,Venezuela,South America,522921,5711.0,516170.0,1040.0,230.0,18487,202.0,3359014.0,118752.0,28285909
220,Viet Nam,Asia,10696630,43065.0,9355040.0,1298525.0,340.0,108080,435.0,85811485.0,867048.0,98969721
223,Yemen,Asia,11819,2149.0,9009.0,661.0,23.0,381,69.0,265253.0,8543.0,31049015


In [137]:
covid_data.dropna(axis=1) # Removes columns containing missing data

Unnamed: 0,country,continent,total_confirmed,total_cases_per_1m_population,population
0,Afghanistan,Asia,179267,4420,40560636
1,Albania,Europe,275574,95954,2871945
2,Algeria,Africa,265816,5865,45325517
3,Andorra,Europe,42156,543983,77495
4,Angola,Africa,99194,2853,34769277
...,...,...,...,...,...
221,Wallis And Futuna Islands,Australia/Oceania,454,41755,10873
222,Western Sahara,Africa,10,16,624681
223,Yemen,Asia,11819,381,31049015
224,Zambia,Africa,320591,16575,19342381


In [138]:
covid_data_filled = covid_data.fillna(value=0)   # fills the missing values with 0

covid_data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   country                         226 non-null    object 
 1   continent                       226 non-null    object 
 2   total_confirmed                 226 non-null    int64  
 3   total_deaths                    226 non-null    float64
 4   total_recovered                 226 non-null    float64
 5   active_cases                    226 non-null    float64
 6   serious_or_critical             226 non-null    float64
 7   total_cases_per_1m_population   226 non-null    int64  
 8   total_deaths_per_1m_population  226 non-null    float64
 9   total_tests                     226 non-null    float64
 10  total_tests_per_1m_population   226 non-null    float64
 11  population                      226 non-null    int64  
dtypes: float64(7), int64(3), object(2)
m

In [139]:
covid_data_filled.isnull().sum()

country                           0
continent                         0
total_confirmed                   0
total_deaths                      0
total_recovered                   0
active_cases                      0
serious_or_critical               0
total_cases_per_1m_population     0
total_deaths_per_1m_population    0
total_tests                       0
total_tests_per_1m_population     0
population                        0
dtype: int64

In [140]:
employee = pd.DataFrame({'Name': ['Susan', 'Maria', 'Olivia', 'Dipika', 'Rashmika'], 
                         'Role': ['Engineer','Manager',np.nan,'Scientist',np.nan],
                         'Salary': [40000, np.nan, 25000, np.nan, 20000]})

employee

Unnamed: 0,Name,Role,Salary
0,Susan,Engineer,40000.0
1,Maria,Manager,
2,Olivia,,25000.0
3,Dipika,Scientist,
4,Rashmika,,20000.0


In [141]:
employee_filled = employee.fillna({'Role':'Engineer', 'Salary': 15000})

employee_filled

Unnamed: 0,Name,Role,Salary
0,Susan,Engineer,40000.0
1,Maria,Manager,15000.0
2,Olivia,Engineer,25000.0
3,Dipika,Scientist,15000.0
4,Rashmika,Engineer,20000.0


In [143]:
df_dup = pd.concat([df1, df1]).reset_index(drop=True)

df_dup

Unnamed: 0,a,b,c
0,1,6,8
1,5,4,2
2,3,9,4
3,7,2,0
4,1,6,8
5,5,4,2
6,3,9,4
7,7,2,0


In [144]:
df_dup.drop_duplicates()   #removes duplicates from the dataset

Unnamed: 0,a,b,c
0,1,6,8
1,5,4,2
2,3,9,4
3,7,2,0


In [145]:
students

Unnamed: 0,Name,Surname,Ages
0,Michael,Jordan,15
1,Jonty,Rhodes,17
2,Michael,Jackson,12
3,Roger,Federer,9
4,Lionel,Messi,10


In [147]:
students.drop_duplicates()

Unnamed: 0,Name,Surname,Ages
0,Michael,Jordan,15
1,Jonty,Rhodes,17
2,Michael,Jackson,12
3,Roger,Federer,9
4,Lionel,Messi,10


In [149]:
students.drop_duplicates(['Name'])

Unnamed: 0,Name,Surname,Ages
0,Michael,Jordan,15
1,Jonty,Rhodes,17
3,Roger,Federer,9
4,Lionel,Messi,10


### inplace operations 

The `inplace` parameter in pandas allows certain operations to modify a DataFrame directly, without creating a new copy. Some common pandas operations that support the inplace parameter are:
- dropna(): Removes rows or columns with missing values.
- fillna(): Fills missing values with a specified value.
- replace(): Replaces values in a DataFrame.
- drop(): Removes rows or columns by label.
- drop_duplicates(): Removes duplicate rows.
- set_index(): Sets the DataFrame index.
- reset_index(): Resets the DataFrame index.
- sort_values(): Sorts the DataFrame by column values.
- sort_index(): Sorts the DataFrame by index.
- rename(): Renames columns or index labels.
- update(): Modifies in place using non-NA values from another DataFrame.

In [150]:
employee

Unnamed: 0,Name,Role,Salary
0,Susan,Engineer,40000.0
1,Maria,Manager,
2,Olivia,,25000.0
3,Dipika,Scientist,
4,Rashmika,,20000.0


In [153]:
employee.fillna({'Role':'Engineer', 'Salary': 15000}, inplace=True)

In [154]:
employee

Unnamed: 0,Name,Role,Salary
0,Susan,Engineer,40000.0
1,Maria,Manager,15000.0
2,Olivia,Engineer,25000.0
3,Dipika,Scientist,15000.0
4,Rashmika,Engineer,20000.0


### Concatenation and Stacking of NumPy Array

![](https://www.w3resource.com/w3r_images/numpy-manipulation-stack-function-image-1.png)

In [155]:
x1 = np.array([[1,2],[3,4],[5,6]])
x2 = np.array([[7,8,-1],[0,-1,1]])
x3 = np.array([[-1,1],[-2,2],[-3,3]])

x1.shape, x2.shape, x3.shape

((3, 2), (2, 3), (3, 2))

In [156]:
x1

array([[1, 2],
       [3, 4],
       [5, 6]])

In [157]:
x3

array([[-1,  1],
       [-2,  2],
       [-3,  3]])

In [159]:
v1 = np.stack((x1,x3), axis=0)

v1

array([[[ 1,  2],
        [ 3,  4],
        [ 5,  6]],

       [[-1,  1],
        [-2,  2],
        [-3,  3]]])

In [160]:
v1.shape

(2, 3, 2)

In [161]:
v2 = np.stack((x1,x3), axis=1)

v2

array([[[ 1,  2],
        [-1,  1]],

       [[ 3,  4],
        [-2,  2]],

       [[ 5,  6],
        [-3,  3]]])

In [162]:
v2.shape

(3, 2, 2)

![](https://www.w3resource.com/w3r_images/numpy-manipulation-hstack-function-image-a.png)

In [163]:
x1

array([[1, 2],
       [3, 4],
       [5, 6]])

In [164]:
x3

array([[-1,  1],
       [-2,  2],
       [-3,  3]])

In [165]:
np.hstack((x1,x3))

array([[ 1,  2, -1,  1],
       [ 3,  4, -2,  2],
       [ 5,  6, -3,  3]])

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ6jO_LjYNvZEFQHGWmTRkCRsIFT1KS5pSOCw&s)

In [166]:
np.vstack((x1,x3))

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [-1,  1],
       [-2,  2],
       [-3,  3]])