# Biblioteka Pandas

In [2]:
import pandas as pd # ładowanie biblioteki Pandas

In [3]:
# Column-oriented input: every keyword becomes a column name and its list holds the column values.
dict_city = dict(
    City=["Warszawa", "Łódź", "Poznań"],
    Population=[12678079, 5398064, 1625631],
)

df = pd.DataFrame(data=dict_city)  # build the data frame from the dictionary
df

Unnamed: 0,City,Population
0,Warszawa,12678079
1,Łódź,5398064
2,Poznań,1625631


In [4]:
# NOTE: with the default arguments the row index is written too, as an extra unnamed first column;
# it shows up as "Unnamed: 0" when this file is read back later in the notebook.
df.to_csv("cities.csv") # save the data frame to disk in CSV format

In [5]:
# Row-oriented input: each inner list becomes one ROW of the resulting frame.
lists_city = [
    ["Warszawa", "Łódź", "Poznań"],
    [12678079, 5398064, 1625631],
]

pd.DataFrame(data=lists_city)  # build a data frame from a list of lists

Unnamed: 0,0,1,2
0,Warszawa,Łódź,Poznań
1,12678079,5398064,1625631


In [6]:
pd.DataFrame(lists_city).transpose()  # transpose: rows become columns and columns become rows

Unnamed: 0,0,1
0,Warszawa,12678079
1,Łódź,5398064
2,Poznań,1625631


In [7]:
#df = pd.read_csv("IHME-GBD_2019_DATA-ff08d9bc-1/IHME-GBD_2019_DATA-ff08d9bc-1.csv", encoding = "utf-8") 
df = pd.read_csv("cities.csv", encoding = "utf-8") 

# create a data frame from a file;
# the quoted string is the path to the file
# (the commented-out line above reads the full deaths dataset instead of the small cities file)
# the dataset can be downloaded from https://drive.google.com/file/d/16fM_JEC5GAkYEDaDgzROrBm8loi9RZRZ/view 
# it holds data on deaths from various causes in 204 countries for the years 2000-2019, taken from
# http://ghdx.healthdata.org/gbd-results-tool 

In [8]:
df.head() # first 5 rows of the data frame by default (this frame has only 3 rows, so all are shown)

Unnamed: 0.1,Unnamed: 0,City,Population
0,0,Warszawa,12678079
1,1,Łódź,5398064
2,2,Poznań,1625631


In [9]:
df.head(2) # first 2 rows of the data frame (the argument sets how many rows are returned)

Unnamed: 0.1,Unnamed: 0,City,Population
0,0,Warszawa,12678079
1,1,Łódź,5398064


In [10]:
df.tail(2) # last 2 rows of the data frame (the argument sets how many rows are returned)

Unnamed: 0.1,Unnamed: 0,City,Population
1,1,Łódź,5398064
2,2,Poznań,1625631


In [12]:
df.info() # summary of the data frame: index range, columns, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3 non-null      int64 
 1   City        3 non-null      object
 2   Population  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes


In [13]:
df.shape # tuple (number of rows, number of columns) of the data frame

(3, 3)

In [14]:
df.describe() 
# summary statistics for the numeric columns (count of non-null values, mean, standard deviation, minimum, quartiles, maximum)

Unnamed: 0.1,Unnamed: 0,Population
count,3.0,3.0
mean,1.0,6567258.0
std,1.0,5618221.0
min,0.0,1625631.0
25%,0.5,3511848.0
50%,1.0,5398064.0
75%,1.5,9038072.0
max,2.0,12678080.0


In [15]:
df.describe(include = 'all') # the statistics now cover not only numeric columns but also text (object) columns
# (unique - number of distinct values, top - the most frequent value, freq - how often the most frequent value occurs)

Unnamed: 0.1,Unnamed: 0,City,Population
count,3.0,3,3.0
unique,,3,
top,,Łódź,
freq,,1,
mean,1.0,,6567258.0
std,1.0,,5618221.0
min,0.0,,1625631.0
25%,0.5,,3511848.0
50%,1.0,,5398064.0
75%,1.5,,9038072.0


In [16]:
df.dropna(inplace=True) # drop, in place, the rows that contain missing values (NA)

In [18]:
#df["measure"] # selecting a single column, method 1: bracket notation with the column name
df["City"]

0    Warszawa
1        Łódź
2      Poznań
Name: City, dtype: object

In [20]:
#df.measure # selecting a single column, method 2: attribute access
df.City

0    Warszawa
1        Łódź
2      Poznań
Name: City, dtype: object

In [22]:
#df[["location","sex","year"]] # selecting several columns at once by passing a list of column names
df[["City", "Population"]]

Unnamed: 0,City,Population
0,Warszawa,12678079
1,Łódź,5398064
2,Poznań,1625631


In [23]:
#df.loc[:, "location":"val"] # select all rows and the columns from "location" through "val"
df.loc[:, "City":"Population"]

Unnamed: 0,City,Population
0,Warszawa,12678079
1,Łódź,5398064
2,Poznań,1625631


In [36]:
#df.loc[100:110, "location":"val"] 
# select rows 100 through 110 (the right edge, i.e. label 110, IS included by the loc method), 
# columns from "location" through "val"
df.loc[1:2, "City"]

1      Łódź
2    Poznań
Name: City, dtype: object

In [30]:
#df.iloc[100:110, 0:3] 
# select rows 100 through 109 (the right edge, i.e. position 110, is NOT included by the iloc method), 
# columns at positions 0 through 2 (position 3 is excluded as well)
df.iloc[1:2, 1:3]

Unnamed: 0,City,Population
1,Łódź,5398064


In [31]:
#df[df["sex"] == "Both"] # filter by a condition: keep only the rows whose "sex" column equals "Both"
df[df["City"] == "Łódź"]

Unnamed: 0.1,Unnamed: 0,City,Population
1,1,Łódź,5398064


In [33]:
# The filters from here on use the columns "sex", "cause" and "year", which the small
# cities frame loaded earlier does not have, so (re)load the deaths dataset into `df` first.
# The path is the one from the commented-out read_csv example earlier in this notebook.
# NOTE(review): the file has to be downloaded beforehand - confirm the local path.
df = pd.read_csv("IHME-GBD_2019_DATA-ff08d9bc-1/IHME-GBD_2019_DATA-ff08d9bc-1.csv", encoding = "utf-8")

# several conditions at once: each comparison is wrapped in parentheses and combined with & (AND)
df[(df["sex"] == "Both") & (df["cause"] == "Cardiovascular diseases") & (df["year"] == 2019)]

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
197037,Deaths,Sri Lanka,Both,All Ages,Cardiovascular diseases,Rate,2019,213.123244,271.193595,160.527408
197268,Deaths,Armenia,Both,All Ages,Cardiovascular diseases,Rate,2019,447.304269,515.804063,382.174004
197298,Deaths,Viet Nam,Both,All Ages,Cardiovascular diseases,Rate,2019,249.199874,289.447813,209.903244
197433,Deaths,Democratic People's Republic of Korea,Both,All Ages,Cardiovascular diseases,Rate,2019,391.058759,452.857961,337.013789
197508,Deaths,Kazakhstan,Both,All Ages,Cardiovascular diseases,Rate,2019,355.541765,399.325605,314.636207
...,...,...,...,...,...,...,...,...,...,...
235491,Deaths,Egypt,Both,All Ages,Cardiovascular diseases,Rate,2019,266.350999,332.353780,206.500784
235629,Deaths,Congo,Both,All Ages,Cardiovascular diseases,Rate,2019,134.762018,168.489114,108.440606
235686,Deaths,Morocco,Both,All Ages,Cardiovascular diseases,Rate,2019,325.526242,372.812226,263.919650
235950,Deaths,Tunisia,Both,All Ages,Cardiovascular diseases,Rate,2019,302.524818,381.458389,232.286982


In [34]:
# both sexes, cardiovascular diseases, year 2019, and additionally a value above 600
df[
    df["sex"].eq("Both")
    & df["cause"].eq("Cardiovascular diseases")
    & df["year"].eq(2019)
    & df["val"].gt(600)
]

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
199944,Deaths,North Macedonia,Both,All Ages,Cardiovascular diseases,Rate,2019,651.388575,775.113232,542.852474
201828,Deaths,Latvia,Both,All Ages,Cardiovascular diseases,Rate,2019,823.834965,961.198221,705.494296
203892,Deaths,Russian Federation,Both,All Ages,Cardiovascular diseases,Rate,2019,684.943411,768.021861,599.861565
205248,Deaths,Hungary,Both,All Ages,Cardiovascular diseases,Rate,2019,638.984813,746.066543,540.141611
207636,Deaths,Ukraine,Both,All Ages,Cardiovascular diseases,Rate,2019,1020.326176,1166.207717,890.09147
220377,Deaths,Georgia,Both,All Ages,Cardiovascular diseases,Rate,2019,810.680929,919.32345,696.025755
223428,Deaths,Romania,Both,All Ages,Cardiovascular diseases,Rate,2019,781.963318,912.699304,666.684367
223692,Deaths,Estonia,Both,All Ages,Cardiovascular diseases,Rate,2019,624.897478,759.074576,503.262686
224058,Deaths,Republic of Moldova,Both,All Ages,Cardiovascular diseases,Rate,2019,635.480079,709.063913,565.538636
224319,Deaths,Bulgaria,Both,All Ages,Cardiovascular diseases,Rate,2019,1140.933052,1349.38722,960.194209


In [35]:
# keep the filtered rows (both sexes, cardiovascular diseases, 2019, val above 600) under a name
cardio = df.loc[
    (df["sex"] == "Both")
    & (df["cause"] == "Cardiovascular diseases")
    & (df["year"] == 2019)
    & (df["val"] > 600)
]

In [36]:
# sort the frame by the "val" column; ascending=True would sort in increasing order,
# ascending=False sorts in decreasing order (a new frame is returned, NaN values go last)
cardio.sort_values("val", ascending=False)

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
224319,Deaths,Bulgaria,Both,All Ages,Cardiovascular diseases,Rate,2019,1140.933052,1349.38722,960.194209
207636,Deaths,Ukraine,Both,All Ages,Cardiovascular diseases,Rate,2019,1020.326176,1166.207717,890.09147
201828,Deaths,Latvia,Both,All Ages,Cardiovascular diseases,Rate,2019,823.834965,961.198221,705.494296
220377,Deaths,Georgia,Both,All Ages,Cardiovascular diseases,Rate,2019,810.680929,919.32345,696.025755
232800,Deaths,Belarus,Both,All Ages,Cardiovascular diseases,Rate,2019,792.975384,977.913501,654.436226
223428,Deaths,Romania,Both,All Ages,Cardiovascular diseases,Rate,2019,781.963318,912.699304,666.684367
231075,Deaths,Lithuania,Both,All Ages,Cardiovascular diseases,Rate,2019,762.327387,895.66766,635.595129
230433,Deaths,Serbia,Both,All Ages,Cardiovascular diseases,Rate,2019,729.051353,853.947585,617.279051
203892,Deaths,Russian Federation,Both,All Ages,Cardiovascular diseases,Rate,2019,684.943411,768.021861,599.861565
199944,Deaths,North Macedonia,Both,All Ages,Cardiovascular diseases,Rate,2019,651.388575,775.113232,542.852474


In [37]:
df[(df["location"] == "Poland") | (df["location"] == "Polska")] 
# selects the rows whose location is "Poland" or "Polska" (| means OR: at least one condition holds)

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
2361,Deaths,Poland,Male,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,0.629560,0.642673,0.616331
2362,Deaths,Poland,Female,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,0.218912,0.224100,0.213422
2363,Deaths,Poland,Both,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,0.417918,0.424907,0.411271
2364,Deaths,Poland,Male,All Ages,Respiratory infections and tuberculosis,Rate,2000,26.516740,27.285258,25.403969
2365,Deaths,Poland,Female,All Ages,Respiratory infections and tuberculosis,Rate,2000,21.331357,22.411111,19.341247
...,...,...,...,...,...,...,...,...,...,...
256423,Deaths,Poland,Female,All Ages,Mental disorders,Rate,2006,0.015323,0.016415,0.014378
256424,Deaths,Poland,Both,All Ages,Mental disorders,Rate,2006,0.008000,0.008565,0.007513
256425,Deaths,Poland,Male,All Ages,Self-harm and interpersonal violence,Rate,2006,41.068049,41.719063,40.450699
256426,Deaths,Poland,Female,All Ages,Self-harm and interpersonal violence,Rate,2006,6.969394,7.145081,6.778728


In [38]:
df[df["cause"].str.contains("HIV")] # rows whose "cause" text mentions HIV

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
18,Deaths,Solomon Islands,Male,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,21.364097,45.320288,7.606704
19,Deaths,Solomon Islands,Female,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,20.605195,43.047910,7.344753
20,Deaths,Solomon Islands,Both,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,20.997271,44.039874,7.615446
138,Deaths,Indonesia,Male,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,2.581957,4.727558,1.341722
139,Deaths,Indonesia,Female,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2000,1.877628,3.665094,0.916937
...,...,...,...,...,...,...,...,...,...,...
256924,Deaths,Tuvalu,Female,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2014,6.180238,22.473695,1.555340
256925,Deaths,Tuvalu,Both,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2014,7.767307,27.490140,2.205626
256986,Deaths,Democratic Republic of the Congo,Male,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2004,69.621952,94.502440,49.970749
256987,Deaths,Democratic Republic of the Congo,Female,All Ages,HIV/AIDS and sexually transmitted infections,Rate,2004,119.327766,153.633755,90.012653


In [39]:
df[df["cause"].str.contains("HIV") == False] # rows whose "cause" text does NOT mention HIV (mask compared with False)

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
0,Deaths,Samoa,Male,All Ages,Chronic respiratory diseases,Rate,2000,64.470214,81.808307,53.476793
1,Deaths,Samoa,Female,All Ages,Chronic respiratory diseases,Rate,2000,55.234399,77.883497,39.978647
2,Deaths,Samoa,Both,All Ages,Chronic respiratory diseases,Rate,2000,60.039961,76.013089,49.941986
3,Deaths,Samoa,Male,All Ages,Skin and subcutaneous diseases,Rate,2000,2.246741,3.467454,1.438979
4,Deaths,Samoa,Female,All Ages,Skin and subcutaneous diseases,Rate,2000,1.368385,1.945448,0.866099
...,...,...,...,...,...,...,...,...,...,...
257035,Deaths,Comoros,Female,All Ages,Neglected tropical diseases and malaria,Rate,2004,53.727176,92.205694,24.130322
257036,Deaths,Comoros,Both,All Ages,Neglected tropical diseases and malaria,Rate,2004,55.473969,96.573472,24.780106
257037,Deaths,Comoros,Male,All Ages,Chronic respiratory diseases,Rate,2004,26.083547,32.912324,18.654426
257038,Deaths,Comoros,Female,All Ages,Chronic respiratory diseases,Rate,2004,19.658170,25.456633,14.278054


In [40]:
df[~df["cause"].str.contains("HIV")] # rows whose "cause" text does NOT mention HIV (mask negated with ~)

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
0,Deaths,Samoa,Male,All Ages,Chronic respiratory diseases,Rate,2000,64.470214,81.808307,53.476793
1,Deaths,Samoa,Female,All Ages,Chronic respiratory diseases,Rate,2000,55.234399,77.883497,39.978647
2,Deaths,Samoa,Both,All Ages,Chronic respiratory diseases,Rate,2000,60.039961,76.013089,49.941986
3,Deaths,Samoa,Male,All Ages,Skin and subcutaneous diseases,Rate,2000,2.246741,3.467454,1.438979
4,Deaths,Samoa,Female,All Ages,Skin and subcutaneous diseases,Rate,2000,1.368385,1.945448,0.866099
...,...,...,...,...,...,...,...,...,...,...
257035,Deaths,Comoros,Female,All Ages,Neglected tropical diseases and malaria,Rate,2004,53.727176,92.205694,24.130322
257036,Deaths,Comoros,Both,All Ages,Neglected tropical diseases and malaria,Rate,2004,55.473969,96.573472,24.780106
257037,Deaths,Comoros,Male,All Ages,Chronic respiratory diseases,Rate,2004,26.083547,32.912324,18.654426
257038,Deaths,Comoros,Female,All Ages,Chronic respiratory diseases,Rate,2004,19.658170,25.456633,14.278054


In [42]:
df.drop(columns="measure", inplace=True)  # remove the "measure" column from df in place

In [41]:
# remove the metric, age, upper and lower columns from df in place, then display the frame
df.drop(columns=["metric", "age", "upper", "lower"], inplace=True)
df

Unnamed: 0,measure,location,sex,cause,year,val
0,Deaths,Samoa,Male,Chronic respiratory diseases,2000,64.470214
1,Deaths,Samoa,Female,Chronic respiratory diseases,2000,55.234399
2,Deaths,Samoa,Both,Chronic respiratory diseases,2000,60.039961
3,Deaths,Samoa,Male,Skin and subcutaneous diseases,2000,2.246741
4,Deaths,Samoa,Female,Skin and subcutaneous diseases,2000,1.368385
...,...,...,...,...,...,...
257035,Deaths,Comoros,Female,Neglected tropical diseases and malaria,2004,53.727176
257036,Deaths,Comoros,Both,Neglected tropical diseases and malaria,2004,55.473969
257037,Deaths,Comoros,Male,Chronic respiratory diseases,2004,26.083547
257038,Deaths,Comoros,Female,Chronic respiratory diseases,2004,19.658170


In [43]:
# add a new column holding the values of the "val" column rounded to one decimal place
df["val_round"] = df["val"].round(1)
df.head()

Unnamed: 0,location,sex,cause,year,val,val_round
0,Samoa,Male,Chronic respiratory diseases,2000,64.470214,64.5
1,Samoa,Female,Chronic respiratory diseases,2000,55.234399,55.2
2,Samoa,Both,Chronic respiratory diseases,2000,60.039961,60.0
3,Samoa,Male,Skin and subcutaneous diseases,2000,2.246741,2.2
4,Samoa,Female,Skin and subcutaneous diseases,2000,1.368385,1.4


In [44]:
# rename the column "val" to "value" in place, then display the frame
df.rename({"val": "value"}, axis="columns", inplace=True)
df

Unnamed: 0,location,sex,cause,year,value,val_round
0,Samoa,Male,Chronic respiratory diseases,2000,64.470214,64.5
1,Samoa,Female,Chronic respiratory diseases,2000,55.234399,55.2
2,Samoa,Both,Chronic respiratory diseases,2000,60.039961,60.0
3,Samoa,Male,Skin and subcutaneous diseases,2000,2.246741,2.2
4,Samoa,Female,Skin and subcutaneous diseases,2000,1.368385,1.4
...,...,...,...,...,...,...
257035,Comoros,Female,Neglected tropical diseases and malaria,2004,53.727176,53.7
257036,Comoros,Both,Neglected tropical diseases and malaria,2004,55.473969,55.5
257037,Comoros,Male,Chronic respiratory diseases,2004,26.083547,26.1
257038,Comoros,Female,Chronic respiratory diseases,2004,19.658170,19.7


In [46]:
df.to_csv("deaths_2000-2019.csv") # save the data frame to a CSV file on disk

In [47]:
# Save a 10-row sample of the frame. The previous `.loc[:10]` slices by label and includes
# its right edge, so it selected every row up to AND including label 10 - one row more than
# the file name "deaths_10.csv" suggests. head(10) takes exactly the first ten rows by position.
df.head(10).to_csv("deaths_10.csv")