### First Steps with Pandas Series

In [1]:
import pandas as pd  # import Library

In [2]:
titanic = pd.read_csv("titanic.csv")   # read csv file

In [3]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [5]:
titanic["age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [6]:
type(titanic["age"])

pandas.core.series.Series

In [7]:
# Bracket access and attribute access select the same column; equals() confirms
# that both return identical Series.
titanic["age"].equals(titanic.age)

True

In [8]:
age = titanic["age"]

In [9]:
age.head(2)

0    22.0
1    38.0
Name: age, dtype: float64

In [10]:
age.tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, dtype: float64

In [11]:
age.dtype

dtype('float64')

In [12]:
age.shape

(891,)

In [13]:
len(age)

891

In [14]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [15]:
# age.info()    # NOTE: Series.info() exists only in pandas >= 1.4; on older versions convert the Series to a DataFrame first (see next cell)

In [16]:
age.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     714 non-null    float64
dtypes: float64(1)
memory usage: 7.1 KB


### Analyzing Numerical Series

In [17]:
age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [18]:
age.describe()  # summary statistics for the age column only; count covers non-null values

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [19]:
age.count()

714

In [20]:
age.size

891

In [21]:
len(age)

891

In [22]:
age.sum(skipna = True)

21205.17

In [23]:
# Python's built-in sum() does not skip missing values, so the NaN entries in
# age make the whole total NaN; compare with age.sum() in the previous cell.
sum(age)

nan

In [24]:
age.mean()

29.69911764705882

In [25]:
age.median()

28.0

In [26]:
age.std()

14.526497332334044

In [27]:
age.min()

0.42

In [28]:
age.max()

80.0

In [29]:
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [30]:
len(age.unique())

89

In [31]:
age.nunique(dropna = False)  # with dropna=False, NaN counts as one distinct value, matching len(age.unique())

89

In [32]:
age.value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [33]:
age.value_counts(sort = True)

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [34]:
age.value_counts(sort = False)

22.00    27
38.00    11
26.00    18
35.00    18
54.00     8
         ..
0.67      1
30.50     2
0.42      1
34.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [35]:
age.value_counts(dropna = True)

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [36]:
age.value_counts(dropna = False)

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
        ... 
36.50      1
55.50      1
0.92       1
23.50      1
74.00      1
Name: age, Length: 89, dtype: int64

In [37]:
age.value_counts(ascending = False)  # default behaviour

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [38]:
age.value_counts(ascending = True)

74.0     1
14.5     1
70.5     1
12.0     1
36.5     1
        ..
30.0    25
19.0    25
18.0    26
22.0    27
24.0    30
Name: age, Length: 88, dtype: int64

In [39]:
age.value_counts(sort = True, dropna = True, ascending = False, normalize = False)

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64

In [40]:
age.value_counts(sort = True, dropna = True, ascending = False, normalize = True)

24.00    0.042017
22.00    0.037815
18.00    0.036415
19.00    0.035014
28.00    0.035014
           ...   
36.50    0.001401
55.50    0.001401
0.92     0.001401
23.50    0.001401
74.00    0.001401
Name: age, Length: 88, dtype: float64

In [41]:
age.value_counts(sort = True, dropna = False, ascending = False, normalize = True)

NaN      0.198653
24.00    0.033670
22.00    0.030303
18.00    0.029181
28.00    0.028058
           ...   
36.50    0.001122
55.50    0.001122
0.92     0.001122
23.50    0.001122
74.00    0.001122
Name: age, Length: 89, dtype: float64

In [42]:
age.count()

714

In [43]:
# 30 is the count of the most frequent age (24.0) from value_counts(); dividing
# by the non-null count reproduces the normalize=True, dropna=True share.
30/age.count()

0.04201680672268908

In [44]:
age.value_counts(sort = True, dropna = False, ascending = False, normalize = True)

NaN      0.198653
24.00    0.033670
22.00    0.030303
18.00    0.029181
28.00    0.028058
           ...   
36.50    0.001122
55.50    0.001122
0.92     0.001122
23.50    0.001122
74.00    0.001122
Name: age, Length: 89, dtype: float64

In [45]:
# Same 30 rows (age 24.0), but divided by all rows including NaN; this
# reproduces the normalize=True, dropna=False share.
30/age.size

0.03367003367003367

In [46]:
age.value_counts(sort = True, dropna = True, ascending= False, normalize = False, bins = 5)

(16.336, 32.252]    346
(32.252, 48.168]    188
(0.339, 16.336]     100
(48.168, 64.084]     69
(64.084, 80.0]       11
Name: age, dtype: int64

In [47]:
# NOTE(review): in the output recorded below the shares add up to about 0.80
# (714/891) rather than 1.0, so with bins= the normalization appears to divide
# by all rows, NaN included, even though dropna=True is passed. Confirm against
# the installed pandas version before relying on these fractions.
age.value_counts(sort = True, dropna = True, ascending= False, normalize = True, bins = 10)

(16.336, 24.294]    0.198653
(24.294, 32.252]    0.189675
(32.252, 40.21]     0.132435
(40.21, 48.168]     0.078563
(0.339, 8.378]      0.060606
(8.378, 16.336]     0.051627
(48.168, 56.126]    0.050505
(56.126, 64.084]    0.026936
(64.084, 72.042]    0.010101
(72.042, 80.0]      0.002245
Name: age, dtype: float64

### Analyzing non-numerical Series

In [48]:
import pandas as pd

In [49]:
summer = pd.read_csv("summer.csv")

In [50]:
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [51]:
summer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31165 entries, 0 to 31164
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        31165 non-null  int64 
 1   City        31165 non-null  object
 2   Sport       31165 non-null  object
 3   Discipline  31165 non-null  object
 4   Athlete     31165 non-null  object
 5   Country     31161 non-null  object
 6   Gender      31165 non-null  object
 7   Event       31165 non-null  object
 8   Medal       31165 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


In [52]:
athlete = summer["Athlete"]

In [53]:
athlete.head()

0         HAJOS, Alfred
1      HERSCHMANN, Otto
2     DRIVAS, Dimitrios
3    MALOKINIS, Ioannis
4    CHASAPIS, Spiridon
Name: Athlete, dtype: object

In [54]:
athlete.tail(5)

31160           JANIKOWSKI, Damian
31161    REZAEI, Ghasem Gholamreza
31162               TOTROV, Rustam
31163            ALEKSANYAN, Artur
31164               LIDBERG, Jimmy
Name: Athlete, dtype: object

In [55]:
type(athlete)

pandas.core.series.Series

In [56]:
athlete.dtype

dtype('O')

In [57]:
athlete.shape

(31165,)

In [58]:
athlete.describe()

count               31165
unique              22762
top       PHELPS, Michael
freq                   22
Name: Athlete, dtype: object

In [59]:
athlete.size

31165

In [60]:
athlete.count()

31165

In [61]:
athlete.min()  # on a string Series, min() returns the value that sorts first alphabetically

'AABYE, Edgar'

In [62]:
athlete.unique()

array(['HAJOS, Alfred', 'HERSCHMANN, Otto', 'DRIVAS, Dimitrios', ...,
       'TOTROV, Rustam', 'ALEKSANYAN, Artur', 'LIDBERG, Jimmy'],
      dtype=object)

In [63]:
len(athlete.unique())

22762

In [64]:
athlete.nunique(dropna= False)

22762

In [65]:
athlete.nunique(dropna = True)

22762

In [66]:
athlete.value_counts()

PHELPS, Michael          22
LATYNINA, Larisa         18
ANDRIANOV, Nikolay       15
ONO, Takashi             13
MANGIAROTTI, Edoardo     13
                         ..
ZAKA, Uddin               1
ZAFAR, Hayat              1
MUHAMMAD, Rashid          1
MANNA, Muhammad Afzal     1
LIDBERG, Jimmy            1
Name: Athlete, Length: 22762, dtype: int64

In [67]:
athlete.value_counts(sort = True, ascending=True)

UDVARDI, Istvan        1
LIM, Jin-Suk           1
LEE, Sang-Hyo          1
KOH, Suk-Chang         1
KIM, Jae-Hwan          1
                      ..
ONO, Takashi          13
SHAKHLIN, Boris       13
ANDRIANOV, Nikolay    15
LATYNINA, Larisa      18
PHELPS, Michael       22
Name: Athlete, Length: 22762, dtype: int64

In [68]:
athlete.value_counts(sort = True, ascending=False, normalize = True).head()

PHELPS, Michael         0.000706
LATYNINA, Larisa        0.000578
ANDRIANOV, Nikolay      0.000481
ONO, Takashi            0.000417
MANGIAROTTI, Edoardo    0.000417
Name: Athlete, dtype: float64

### Sorting and Introduction to the `inplace` Parameter

In [69]:
import pandas as pd

In [70]:
# Sample data: integer labels in deliberately unsorted order, with one missing
# value (None) so that sorting and NaN handling are visible later on.
dic = {
    1: 10,
    3: 25,
    2: 6,
    4: 36,
    5: 2,
    6: 0,
    7: None,
}
dic

{1: 10, 3: 25, 2: 6, 4: 36, 5: 2, 6: 0, 7: None}

In [71]:
sales = pd.Series(dic)
sales

1    10.0
3    25.0
2     6.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [72]:
sales.sort_index()

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [73]:
# inplace=True re-orders `sales` itself and returns None, which is why this
# cell produces no output; the next cell shows the mutated Series.
sales.sort_index(ascending = True, inplace= True)

In [74]:
sales

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [75]:
sales.sort_values(inplace=False)  # inplace=False (the default) returns a sorted copy; `sales` itself stays unchanged

6     0.0
5     2.0
2     6.0
1    10.0
3    25.0
4    36.0
7     NaN
dtype: float64

In [76]:
sales

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [77]:
sales.sort_values(ascending=False, na_position="last", inplace= True)

In [78]:
sales

4    36.0
3    25.0
1    10.0
2     6.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [79]:
# Sample data keyed by weekday abbreviation (string labels instead of integers).
dic = {
    "Mon": 10,
    "Tue": 25,
    "Wed": 6,
    "Thu": 36,
    "Fri": 2,
}
dic

{'Mon': 10, 'Tue': 25, 'Wed': 6, 'Thu': 36, 'Fri': 2}

In [80]:
sales = pd.Series(dic)

In [81]:
sales

Mon    10
Tue    25
Wed     6
Thu    36
Fri     2
dtype: int64

In [82]:
sales.sort_index(ascending=False)  # string labels sort alphabetically (Wed, Tue, Thu, ...), not in weekday order

Wed     6
Tue    25
Thu    36
Mon    10
Fri     2
dtype: int64