## Pandas Series

### First Steps

In [1]:
import pandas as pd
import numpy as np

In [2]:
titanic = pd.read_csv("titanic.csv")

In [3]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [41]:
pd.options.display.max_rows = 20  # maximum number of rows pandas prints before truncating the display

In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [5]:
titanic["age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [6]:
type(titanic["age"])

pandas.core.series.Series

In [7]:
titanic["age"].equals(titanic.age)

True

In [8]:
age = titanic["age"] # age is now a pandas Series (a single column of the DataFrame)

In [9]:
age.head(n= 2) # head() also works on a pandas Series

0    22.0
1    38.0
Name: age, dtype: float64

In [10]:
age.dtype

dtype('float64')

In [11]:
age.info() # raises AttributeError on this pandas version; Series.info() only exists in pandas >= 1.4

AttributeError: 'Series' object has no attribute 'info'

In [None]:
age.shape

In [None]:
len(age)

In [12]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [13]:
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

### Analysing Numerical Series

In [14]:
age.dtype

dtype('float64')

In [15]:
age.count() #gives all non-null values

714

In [16]:
age.size #no. of elements (include missing values)

891

In [17]:
len(age)

891

In [18]:
age.sum() #excludes null values

21205.17

In [19]:
sum(age) # Python's built-in sum() does not skip NaN, so a single missing value makes the result nan (use age.sum() instead)

nan

In [20]:
age.mean()  #mean of all values in age

29.69911764705882

In [21]:
age.min()

0.42

In [22]:
age.max()

80.0

In [23]:
age.median()

28.0

In [24]:
age.unique()  # distinct values in order of first appearance, NaN included, returned as a NumPy array

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [25]:
len(age.unique())  #no. of unique values

89

In [26]:
age.nunique()  #no. of unique values excluding null values

88

In [27]:
age.nunique(dropna = False) #including nan

89

## Analysing Non-Numerical Series

In [28]:
import pandas as pd

In [30]:
summer = pd.read_csv("summer.csv")

In [31]:
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [32]:
summer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31165 entries, 0 to 31164
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        31165 non-null  int64 
 1   City        31165 non-null  object
 2   Sport       31165 non-null  object
 3   Discipline  31165 non-null  object
 4   Athlete     31165 non-null  object
 5   Country     31161 non-null  object
 6   Gender      31165 non-null  object
 7   Event       31165 non-null  object
 8   Medal       31165 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


In [33]:
athlete = summer['Athlete']

In [34]:
athlete.head()

0         HAJOS, Alfred
1      HERSCHMANN, Otto
2     DRIVAS, Dimitrios
3    MALOKINIS, Ioannis
4    CHASAPIS, Spiridon
Name: Athlete, dtype: object

In [35]:
athlete.tail()

31160           JANIKOWSKI, Damian
31161    REZAEI, Ghasem Gholamreza
31162               TOTROV, Rustam
31163            ALEKSANYAN, Artur
31164               LIDBERG, Jimmy
Name: Athlete, dtype: object

In [37]:
athlete.size

31165

In [38]:
type(athlete)

pandas.core.series.Series

In [39]:
athlete.dtype

dtype('O')

In [42]:
athlete.shape

(31165,)

In [44]:
athlete.describe()

count               31165
unique              22762
top       PHELPS, Michael
freq                   22
Name: Athlete, dtype: object

In [52]:
athlete.size

31165

In [55]:
athlete.count()  # excludes null values

31165

In [56]:
athlete.min()  # alphabetically first element

'AABYE, Edgar'

In [57]:
athlete.unique()

array(['HAJOS, Alfred', 'HERSCHMANN, Otto', 'DRIVAS, Dimitrios', ...,
       'TOTROV, Rustam', 'ALEKSANYAN, Artur', 'LIDBERG, Jimmy'],
      dtype=object)

In [58]:
len(athlete.unique())

22762

In [60]:
athlete.nunique(dropna = False) # if dropna is true, it excludes null values

22762

In [62]:
athlete.value_counts() # frequency of elements in descending order

PHELPS, Michael          22
LATYNINA, Larisa         18
ANDRIANOV, Nikolay       15
ONO, Takashi             13
MANGIAROTTI, Edoardo     13
                         ..
ZAKA, Uddin               1
ZAFAR, Hayat              1
MUHAMMAD, Rashid          1
MANNA, Muhammad Afzal     1
LIDBERG, Jimmy            1
Name: Athlete, Length: 22762, dtype: int64

In [64]:
athlete.value_counts( sort = True, ascending = True)  # same as above but in ascending order

UDVARDI, Istvan        1
LIM, Jin-Suk           1
LEE, Sang-Hyo          1
KOH, Suk-Chang         1
KIM, Jae-Hwan          1
                      ..
ONO, Takashi          13
SHAKHLIN, Boris       13
ANDRIANOV, Nikolay    15
LATYNINA, Larisa      18
PHELPS, Michael       22
Name: Athlete, Length: 22762, dtype: int64

In [66]:
athlete.value_counts( sort = True, normalize = True).head()  #also normalizes the values, so Phelps has only won 0.0706% of total medals

PHELPS, Michael         0.000706
LATYNINA, Larisa        0.000578
ANDRIANOV, Nikolay      0.000481
ONO, Takashi            0.000417
MANGIAROTTI, Edoardo    0.000417
Name: Athlete, dtype: float64

# Creating pandas series

In [1]:
import pandas as pd

### From a DataFrame

In [3]:
summer = pd.read_csv("summer.csv")

In [4]:
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [5]:
summer["Athlete"]

0                    HAJOS, Alfred
1                 HERSCHMANN, Otto
2                DRIVAS, Dimitrios
3               MALOKINIS, Ioannis
4               CHASAPIS, Spiridon
                   ...            
31160           JANIKOWSKI, Damian
31161    REZAEI, Ghasem Gholamreza
31162               TOTROV, Rustam
31163            ALEKSANYAN, Artur
31164               LIDBERG, Jimmy
Name: Athlete, Length: 31165, dtype: object

In [6]:
summer.Athlete

0                    HAJOS, Alfred
1                 HERSCHMANN, Otto
2                DRIVAS, Dimitrios
3               MALOKINIS, Ioannis
4               CHASAPIS, Spiridon
                   ...            
31160           JANIKOWSKI, Damian
31161    REZAEI, Ghasem Gholamreza
31162               TOTROV, Rustam
31163            ALEKSANYAN, Artur
31164               LIDBERG, Jimmy
Name: Athlete, Length: 31165, dtype: object

In [9]:
summer.iloc[:, 4]

0                    HAJOS, Alfred
1                 HERSCHMANN, Otto
2                DRIVAS, Dimitrios
3               MALOKINIS, Ioannis
4               CHASAPIS, Spiridon
                   ...            
31160           JANIKOWSKI, Damian
31161    REZAEI, Ghasem Gholamreza
31162               TOTROV, Rustam
31163            ALEKSANYAN, Artur
31164               LIDBERG, Jimmy
Name: Athlete, Length: 31165, dtype: object

## While Importing our CSV

In [10]:
# read_csv always returns a DataFrame; .squeeze("columns") collapses the single
# selected column into a Series. (The old read_csv(squeeze=True) keyword was
# deprecated in pandas 1.4 and removed in 2.0.)
pd.read_csv("summer.csv", usecols = ["Athlete"]).squeeze("columns").head()

0         HAJOS, Alfred
1      HERSCHMANN, Otto
2     DRIVAS, Dimitrios
3    MALOKINIS, Ioannis
4    CHASAPIS, Spiridon
Name: Athlete, dtype: object

### Creating from scratch with pd.Series()

In [12]:
pd.Series([10,24,45,67,78])

0    10
1    24
2    45
3    67
4    78
dtype: int64

In [13]:
pd.Series([10,24,45,67,78], index = ["mon", 'tue', 'wed', 'thur', 'fri'])  # with custom indices

mon     10
tue     24
wed     45
thur    67
fri     78
dtype: int64

In [14]:
pd.Series([10,24,45,67,78], index = ["mon", 'tue', 'wed', 'thur', 'fri'], name = "sales") 

mon     10
tue     24
wed     45
thur    67
fri     78
Name: sales, dtype: int64

## Creating pandas Series, part 2

### from numpy array

In [15]:
import pandas as pd
import numpy as np

In [16]:
sales = np.array([10,24,45,67,78])
sales

array([10, 24, 45, 67, 78])

In [17]:
pd.Series(sales)

0    10
1    24
2    45
3    67
4    78
dtype: int32

### from list and tuples

In [18]:
sales = [10,24,45,67,78]

In [20]:
pd.Series(sales)   # sales can be a tuple as well

0    10
1    24
2    45
3    67
4    78
dtype: int64

### from dictionary


In [21]:
dic = {"Mon":10, "tue": 3}
dic

{'Mon': 10, 'tue': 3}

In [23]:
sales = pd.Series(dic)
sales

Mon    10
tue     3
dtype: int64

In [26]:
pd.Series(dic, index = ["fri", "tue"])  # the given index wins: labels that match a dict key keep its value, labels with no matching key become NaN, and dict keys missing from the index ("Mon") are dropped

fri    NaN
tue    3.0
dtype: float64

In [27]:
pd.Series(dic, index = [1,2])  # all values are NaN

1   NaN
2   NaN
dtype: float64

## Indexing and slicing

In [28]:
import pandas as pd

In [29]:
titanic = pd.read_csv("titanic.csv")

In [30]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [31]:
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
886,0,2,male,27.0,0,0,13.0,S,
887,1,1,female,19.0,0,0,30.0,S,B
888,0,3,female,,1,2,23.45,S,
889,1,1,male,26.0,0,0,30.0,C,C
890,0,3,male,32.0,0,0,7.75,Q,


In [32]:
age = titanic.age

In [33]:
age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [34]:
age.tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, dtype: float64

In [35]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [36]:
age[0]

22.0

In [37]:
age[2]

26.0

In [39]:
age.iloc[-1] # using position based indexing

32.0

In [43]:
age.iloc[-1] == (age[890])  #are both the same? YES

True

In [47]:
age.loc[:3]  # label-based slice: the end label 3 is included, so this returns 4 rows

0    22.0
1    38.0
2    26.0
3    35.0
Name: age, dtype: float64

In [48]:
age.iloc[:3]  # position-based slice: the end position 3 is excluded, so this returns 3 rows

0    22.0
1    38.0
2    26.0
Name: age, dtype: float64

In [49]:
summer = pd.read_csv("summer.csv", index_col = "Athlete")

In [50]:
summer.head()

Unnamed: 0_level_0,Year,City,Sport,Discipline,Country,Gender,Event,Medal
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"HAJOS, Alfred",1896,Athens,Aquatics,Swimming,HUN,Men,100M Freestyle,Gold
"HERSCHMANN, Otto",1896,Athens,Aquatics,Swimming,AUT,Men,100M Freestyle,Silver
"DRIVAS, Dimitrios",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Bronze
"MALOKINIS, Ioannis",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Gold
"CHASAPIS, Spiridon",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Silver


In [52]:
event = summer.Event

In [53]:
event.head()

Athlete
HAJOS, Alfred                     100M Freestyle
HERSCHMANN, Otto                  100M Freestyle
DRIVAS, Dimitrios     100M Freestyle For Sailors
MALOKINIS, Ioannis    100M Freestyle For Sailors
CHASAPIS, Spiridon    100M Freestyle For Sailors
Name: Event, dtype: object

In [55]:
event.index

Index(['HAJOS, Alfred', 'HERSCHMANN, Otto', 'DRIVAS, Dimitrios',
       'MALOKINIS, Ioannis', 'CHASAPIS, Spiridon', 'CHOROPHAS, Efstathios',
       'HAJOS, Alfred', 'ANDREOU, Joannis', 'CHOROPHAS, Efstathios',
       'NEUMANN, Paul',
       ...
       'AHMADOV, Emin', 'KAZAKEVIC, Aleksandr', 'KHUGAEV, Alan',
       'EBRAHIM, Karam Mohamed Gaber', 'GAJIYEV, Danyal', 'JANIKOWSKI, Damian',
       'REZAEI, Ghasem Gholamreza', 'TOTROV, Rustam', 'ALEKSANYAN, Artur',
       'LIDBERG, Jimmy'],
      dtype='object', name='Athlete', length=31165)

In [56]:
# The index holds athlete names, so 0 is a position, not a label. Use .iloc for
# positional access; integer keys in [] on a non-integer index rely on a
# positional fallback that is deprecated in pandas 2.x.
event.iloc[0]

'100M Freestyle'

In [57]:
# Last element by position. -1 is not a label in the athlete-name index, so use
# .iloc rather than the deprecated positional fallback of [].
event.iloc[-1]

'Wg 96 KG'

In [58]:
event[:3]

Athlete
HAJOS, Alfred                    100M Freestyle
HERSCHMANN, Otto                 100M Freestyle
DRIVAS, Dimitrios    100M Freestyle For Sailors
Name: Event, dtype: object

In [59]:
event["DRIVAS, Dimitrios"]

'100M Freestyle For Sailors'

In [60]:
event[:"DRIVAS, Dimitrios"]

Athlete
HAJOS, Alfred                    100M Freestyle
HERSCHMANN, Otto                 100M Freestyle
DRIVAS, Dimitrios    100M Freestyle For Sailors
Name: Event, dtype: object

In [68]:
event['PHELPS, Michael']

Athlete
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael            200M Freestyle
PHELPS, Michael    200M Individual Medley
PHELPS, Michael    400M Individual Medley
PHELPS, Michael    4X100M Freestyle Relay
PHELPS, Michael       4X100M Medley Relay
PHELPS, Michael    4X200M Freestyle Relay
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael            200M Freestyle
PHELPS, Michael    200M Individual Medley
PHELPS, Michael    400M Individual Medley
PHELPS, Michael    4X100M Freestyle Relay
PHELPS, Michael       4X100M Medley Relay
PHELPS, Michael    4X200M Freestyle Relay
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael               200M Medley
PHELPS, Michael          4X100M Freestyle
PHELPS, Michael             4X100M Medley
PHELPS, Michael          4X200M Freestyle
Name: Event, dtype: object

In [70]:
event[["PHELPS, Michael", "LEWIS, Carl"]]

Athlete
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael            200M Freestyle
PHELPS, Michael    200M Individual Medley
PHELPS, Michael    400M Individual Medley
PHELPS, Michael    4X100M Freestyle Relay
PHELPS, Michael       4X100M Medley Relay
PHELPS, Michael    4X200M Freestyle Relay
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael            200M Freestyle
PHELPS, Michael    200M Individual Medley
PHELPS, Michael    400M Individual Medley
PHELPS, Michael    4X100M Freestyle Relay
PHELPS, Michael       4X100M Medley Relay
PHELPS, Michael    4X200M Freestyle Relay
PHELPS, Michael            100M Butterfly
PHELPS, Michael            200M Butterfly
PHELPS, Michael               200M Medley
PHELPS, Michael          4X100M Freestyle
PHELPS, Michael             4X100M Medley
PHELPS, Michael          4X200M Freestyle
LEWIS, Carl                          100M
LEWIS, Carl               

## Sorting and the inplace parameter

In [71]:
import pandas as pd

In [72]:
dic = {1:10, 3:25, 2:6, 4:36, 5:2, 6:0, 7:None}
dic

{1: 10, 3: 25, 2: 6, 4: 36, 5: 2, 6: 0, 7: None}

In [73]:
sales = pd.Series(dic)
sales

1    10.0
3    25.0
2     6.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [74]:
sales.sort_index()

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [79]:
sales.sort_index(ascending = True)   # ascending is True by default

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [77]:
sales

1    10.0
3    25.0
2     6.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [80]:
sales.sort_index(inplace = False)    # inplace=False (the default) returns a sorted copy and leaves `sales` unchanged; inplace=True would modify `sales` itself and return None

1    10.0
2     6.0
3    25.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [83]:
sales.sort_values(inplace = False)  #sorting by values

6     0.0
5     2.0
2     6.0
1    10.0
3    25.0
4    36.0
7     NaN
dtype: float64

In [86]:
sales.sort_values(ascending = False, na_position = 'last', inplace = True)

In [87]:
sales

4    36.0
3    25.0
1    10.0
2     6.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [89]:
dic = {'Mon':10, 'Tue':25, 'Wed':6, 'Thu':36, 'Fri':2}
dic

{'Mon': 10, 'Tue': 25, 'Wed': 6, 'Thu': 36, 'Fri': 2}

In [90]:
sales = pd.Series(dic)
sales

Mon    10
Tue    25
Wed     6
Thu    36
Fri     2
dtype: int64

In [92]:
sales.sort_index(ascending = False) # sorts alphabetically

Wed     6
Tue    25
Thu    36
Mon    10
Fri     2
dtype: int64

## nlargest() and nsmallest()

In [93]:
import pandas as pd

In [94]:
titanic = pd.read_csv('titanic.csv')

In [95]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [96]:
age = titanic.age
age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

#### If we want to extract the ages of the oldest and youngest passengers, we can do:

In [98]:
age.nlargest(n = 3)   # n defaults to 5; here only the 3 largest values are requested

630    80.0
851    74.0
96     71.0
Name: age, dtype: float64

In [99]:
age.nsmallest()  # 5 smallest values by default

803    0.42
755    0.67
469    0.75
644    0.75
78     0.83
Name: age, dtype: float64