# Pandas Series

## First steps with pandas

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('/kaggle/input/titanic-csv/titanic.csv')

In [None]:
titanic

In [None]:
titanic.info()

In [None]:
titanic['age']

In [None]:
type(titanic['age'])

- A pandas Series is a one-dimensional labeled array.
- It is labeled with the same index that the DataFrame has.

In [None]:
titanic.age

In [None]:
titanic['age'].equals(titanic.age)

- There exist 2 alternatives to access a column.

In [None]:
age = titanic['age']

- A pandas Series and a pandas DataFrame share most of their methods and attributes.
- Only a few methods are available exclusively for a DataFrame, and vice versa.

In [None]:
age.head()

In [None]:
age.head(2)

In [None]:
age.tail()

In [None]:
age.dtype

In [None]:
age.shape

In [None]:
len(age)

In [None]:
age.index

In [None]:
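#age.info()
# This raises AttributeError in pandas versions before 1.4 (Series.info() was added in pandas 1.4)

- In pandas versions before 1.4 the info() method is only available for a DataFrame; newer versions also provide Series.info().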

In [None]:
#Converting a pandas series to pandas dataframe.
age.to_frame()

In [None]:
type(age.to_frame())

In [None]:
age.to_frame().info()

# Analyzing numerical series

In [None]:
age

In [None]:
age.describe()

- For most of the summary statistics in describe method we have a special method.

In [None]:
age.count()

In [None]:
age.size

In [None]:
len(age)

- The real power of pandas is that it can handle real world data.
- Real world data is typically messy and unclean, e.g. many real world data contains missing values.
- Pandas is very helpful when it comes to missing values.

In [None]:
age.sum()

In [None]:
sum(age)

- The built in python function sum() does not know how to handle the missing values.

In [None]:
# The default value of skipna is True
#i.e. by default pandas skips the missing values.
age.sum(skipna = True)

In [None]:
# if we include the missing values pandas does not know how to calculate the sum.
age.sum(skipna = False)

- We can find skipna parameter in many pandas methods and it is typically set to True.

In [None]:
age.mean()

- This means the average age of a person on board the Titanic was about 29 years.

In [None]:
#Q2 or 50th Percentile
age.median()

In [None]:
age.std()

In [None]:
age.min()

In [None]:
age.max()

In [None]:
# unique values are displayed in their order of appearance
age.unique()

In [None]:
len(age.unique())

In [None]:
#to get the number of unique values
#excludes na by default
age.nunique()

In [None]:
age.nunique(dropna = False)

In [None]:
#returns absolute frequencies of all the values,
age.value_counts()

In [None]:
age.value_counts(sort = True)

In [None]:
age.value_counts(sort = False)

In [None]:
#by default we ignore missing values
age.value_counts(dropna = True)

In [None]:
age.value_counts(dropna = False) 

In [None]:
titanic.info()

In [None]:
891- 714

In [None]:
# by default (ascending = False) the values are sorted from most frequent to least frequent
age.value_counts(ascending = False)

In [None]:
age.value_counts(ascending = True)

In [None]:
age.value_counts(ascending = True)

- By default pandas displays the absolute frequencies, but we have the option to display the relative frequencies as well.

In [None]:
#with default parameter values
age.value_counts(sort = True, dropna = True, ascending = False, normalize = False)

In [None]:
# to get relative frequency make normalize = True
age.value_counts(sort = True, dropna = True, ascending = False, normalize = True)

 - This means 4.2% of all passengers with a known age are 24 years old and 3.7% of them are 22 years old.

- How does pandas calculate the relative frequency?

In [None]:
# while calculating the relative frequency pandas ignore the missing values.
# since dropna = True by default
30 / age.count()

In [None]:
# if we change dropna = False then relative frequency changes
age.value_counts(sort = True, dropna = False, ascending = False, normalize = True)

In [None]:
30 / age.size

In [None]:
# to organize in equal width bins
age.value_counts(sort = True, dropna = True, ascending = False, normalize = False, bins = 5)

In [None]:
# here we are getting absolute frequencies
age.value_counts(sort = True, dropna = True, ascending = False, normalize = False, bins = 10)

In [None]:
# here we are getting relative frequencies
age.value_counts(sort = True, dropna = True, ascending = False, normalize = True, bins = 10)

In [None]:
# we can chain the pandas methods to get the desired results.
age.value_counts(sort = True, dropna = False, ascending = False, normalize = True).sum()

# Analyzing Non-numerical series

In [None]:
summer = pd.read_csv('/kaggle/input/summer-olympics-dataset-18962012/summer.csv')

In [None]:
summer.head()

In [None]:
summer.info()

In [None]:
athlete = summer['Athlete']

In [None]:
athlete.head()

In [None]:
athlete.tail(5)

In [None]:
type(athlete)

In [None]:
athlete.dtype

- datatype is object.

In [None]:
athlete.shape

- We have a 1d Array.

In [None]:
athlete.describe()

- The describe() method gives a different summary when applied to a non-numerical Series than when applied to a numerical Series.

In [None]:
athlete.size

In [None]:
athlete.count()

- If there are missing values, the count() method does not take the NaN values into account and therefore returns a lower value than the size attribute.

In [None]:
summer.info()

In [None]:
summer['Country'].count()

In [None]:
summer['Country'].size

- The difference between count() and size can be noticed in the above example.

In [None]:
athlete

In [None]:
# for a non-numerical series min() returns the alphabetically smallest value.
athlete.min()

In [None]:
athlete.unique()

In [None]:
len(athlete.unique())

In [None]:
athlete.nunique()

In [None]:
# we do not have any missing value in the athlete series therefore the result is same. 
athlete.nunique(dropna = False)

In [None]:
summer['Country'].nunique()

In [None]:
summer['Country'].nunique(dropna = False)

- We can see that all NaN values are counted as a single unique value.

In [None]:
athlete.value_counts()

- Michael Phelps won 22 medals in his olympic career till 2012 London Olympics.

In [None]:
athlete.value_counts(sort = True, ascending = True)

In [None]:
#relative frequencies.
athlete.value_counts(sort = True, ascending = False, normalize = True)

# Creating Pandas Series

## From dataframe

In [None]:
summer = pd.read_csv('/kaggle/input/summer-olympics-dataset-18962012/summer.csv')

In [None]:
summer.head()

- If we simply select 1 column or 1 row of dataframe then we automatically have a pandas series.

In [None]:
summer['Athlete']

In [None]:
summer.iloc[0]

In [None]:
type(summer.iloc[0])

- If we select only one row then also we get a pandas series.

- If we know from the beginning that we will be working with a single column and not the entire dataset, we can select just that column when importing the CSV.

## Importing from CSV

In [None]:
# The `squeeze` keyword of read_csv was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the one-column DataFrame afterwards returns the same Series on every version.
pd.read_csv('/kaggle/input/summer-olympics-dataset-18962012/summer.csv', usecols = ['Athlete']).squeeze('columns').head()

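- `squeeze = True` parameter: if the parsed data contains only one column, a Series is returned. This parameter was deprecated in pandas 1.4 and removed in pandas 2.0; on newer versions call `.squeeze('columns')` on the result of `read_csv` to get the same Series.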

 ## Creating from Scratch with pd.Series()

In [None]:
pd.Series([10, 20, 30, 40, 50, 25, 4, 2, 29, 87, 69, 88, 47])

- by default we get range-index.

In [None]:
pd.Series([1, 2, 3, 4, 5, 6, 7], index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])

In [None]:
pd.Series([1, 2, 3, 4, 5, 6, 7], index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'], name = 'Sales')

## Creating pandas series from numpy array

In [None]:
import numpy as np

In [None]:
sales = np.array([10, 23, 56, 12, 4, 5, 16])
sales

In [None]:
pd.Series(sales)

## from List

In [None]:
sales = [10, 23, 56, 12, 4, 5, 16]

In [None]:
pd.Series(sales)

## from tuple

In [None]:
sales = (10, 23, 56, 12, 4, 5, 16)

In [None]:
pd.Series(sales)

- We can change the index from range-index to index of our choice.

In [None]:
pd.Series(sales, index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'], name = 'Sales')

## From Dictionary

In [None]:
dic = {'Mon': 10, 'Tue': 20, 'Thurs': 5, 'Fri': 39, 'Sat':69, 'Sun': 33}
dic

In [None]:
sales = pd.Series(dic)

In [None]:
sales

- The keys become the index and the dictionary values become the values of the Series.

In [None]:
pd.Series(dic, index = ['Wed', 'Mon', 'Jul', 'Aug', 'Thurs', 'Fri'] )

- Pandas compares our new index to the keys of dictionary.
- The new index dominates over the existing keys, if a key is not present then NaN becomes the value at that index.

In [None]:
pd.Series(dic, index = [1, 2, 3, 4, 5, 6])

- If we pass a completely different index, then no key matches it and NaN becomes the value at every index label.

# Indexing and Slicing

- It works in the same way as indexing and slicing pandas dataframe.
- The only difference is that pandas Series have only one dimension(1 column or 1 Row) therefore we cannot slice for rows and columns as we did with pandas dataframe.

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('/kaggle/input/titanic-csv/titanic.csv')

In [None]:
titanic.head()

In [None]:
titanic.tail()

In [None]:
age = titanic.age

In [None]:
age.head()

In [None]:
age.tail()

In [None]:
age.index

In [None]:
age[0]

In [None]:
age[2]

- With a RangeIndex we cannot tell whether we are doing position-based or label-based indexing, because the index labels are identical to the positions.

- We are not sure whether age[2] is being calculated using position based indexing or label based indexing.

In [None]:
#Error: KeyError: -1
#age[-1]

- If position-based indexing had been used, age[-1] would return the last row; instead we get an error.
- Therefore we explicitly request position-based indexing with iloc[].

In [None]:
#Earlier pandas did not know how to handle age[-1] but now it does with iloc[]
age.iloc[-1]

- By using iloc[] it becomes unambiguous that we are using position-based indexing and how pandas is going to access the value.

In [None]:
age[890]

In [None]:
age[[3, 4]]

In [None]:
# here it is behaving like position based indexing since 3 is excluded
age[:3]

- NOTE: with loc[left_boundary:right_boundary], right_boundary is included (label-based indexing),
- whereas with iloc[left_boundary:right_boundary], right_boundary is excluded (position-based indexing).

- Therefore, if we have a RangeIndex and we slice using just the `[]` square brackets, pandas does not know whether we intend to use label-based or position-based indexing.
- The behaviour is not consistent.
- The best practice is to use the iloc[] operator or the loc[] operator explicitly.

In [None]:
#Position based indexing
age.iloc[:3]

In [None]:
#Label based indexing
age.loc[:3]

In [None]:
summer = pd.read_csv('/kaggle/input/summer-olympics-dataset-18962012/summer.csv', index_col = 'Athlete')

In [None]:
summer.head()

In [None]:
event = summer.Event

In [None]:
event.head()

In [None]:
event.tail()

In [None]:
event.index

In [None]:
event[0]

In [None]:
event[1]

In [None]:
event[2]

In [None]:
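# Here pandas falls back to position-based indexing because the index holds strings, not integers.
# NOTE: this positional fallback of [] is deprecated in recent pandas versions; prefer event.iloc[-1].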
event[-1]

In [None]:
# we can use iloc[] but the functionality does not change
event.iloc[-1]

In [None]:
event[: 3]

In [None]:
event['DRIVAS, Dimitrios']

In [None]:
event[: 'DRIVAS, Dimitrios']

In [None]:
event['PHELPS, Michael']

In [None]:
event.loc['PHELPS, Michael'].equals(event['PHELPS, Michael'])

In [None]:
#Error: KeyError: "Cannot get right slice bound for non-unique label: 'PHELPS, Michael'"
#event[:'PHELPS, Michael']

In [None]:
event[['PHELPS, Michael', 'LEWIS, Carl']]

In [None]:
event.loc[['PHELPS, Michael', 'LEWIS, Carl']]

In [None]:
#Error: KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported.
#event[['PHELPS, Michael', 'DUCK, Donald']]

# Sorting and introduction to the inplace-parameter

In [None]:
import pandas as pd

In [None]:
dic = {1:10, 3:25, 2:6, 4:36, 5:2, 6:0, 7:None}
dic

In [None]:
sales = pd.Series(dic)
sales

In [None]:
sales.sort_index()

In [None]:
sales.sort_index(ascending = True)

In [None]:
sales.sort_index(ascending = False)

In [None]:
sales

In [None]:
sales.sort_index(ascending = True, inplace = True)

In [None]:
sales

In [None]:
sales.sort_values(inplace = False)

In [None]:
sales.sort_values(ascending = False, na_position = 'last', inplace = False)

In [None]:
sales.sort_values(ascending = False, na_position = 'first', inplace = False)

In [None]:
sales

In [None]:
sales.sort_values(ascending = False, na_position = 'last', inplace = True)

In [None]:
sales

In [None]:
dic = {'Mon': 10, 'Tues': 25, 'Wed': 6, 'Thurs': 36, 'Fri': 2}
dic

In [None]:
sales = pd.Series(dic)
sales

In [None]:
sales.sort_index()

- Our index is sorted in an alphabetical manner.

In [None]:
sales.sort_index(ascending = False)

# nlargest() and nsmallest()

In [None]:
import pandas as pd

In [None]:
# Same dataset path as the other sections of this notebook (stray doubled slash removed).
titanic = pd.read_csv('/kaggle/input/titanic-csv/titanic.csv')

In [None]:
titanic.head()

In [None]:
age = titanic.age

In [None]:
age.head()

In [None]:
age.size

In [None]:
age.count()

In [None]:
age.sort_values(ascending = False)

- Extracting the 3 oldest passengers.

In [None]:
age.sort_values(ascending = False).head(3)

In [None]:
age.nlargest()

- By default we get 5 largest values in our pandas series.

In [None]:
age.nlargest(n = 3)

In [None]:
age.sort_values(ascending = True)

In [None]:
age.nsmallest()

In [None]:
age.nsmallest(n = 3)

In [None]:
age.sort_values(ascending = True).iloc[:3]

In [None]:
age.nsmallest(n = 3, keep = 'all')

# idxmin() idxmax()

- What if we just want to extract the index of the highest and smallest values.

In [None]:
age.nlargest(n = 3).index[0]

In [None]:
age.nsmallest(n = 1).index

In [None]:
age.nsmallest(n = 1).index[0]

- There is a better and easier way to do this.

In [None]:
titanic.head()

In [None]:
titanic.age.idxmax()

In [None]:
titanic.age.idxmin()

In [None]:
#Oldest passenger
titanic.loc[630]

In [None]:
#Youngest passenger
titanic.loc[803]

In [None]:
titanic.loc[titanic.age.idxmin()]

In [None]:
dic = {'Mon': 10, 'Tues': 25, 'Wed': 6, 'Thu': 36, 'Fri': 2, 'Sat': 0, 'Sun': None}
dic

In [None]:
sales = pd.Series(dic)
sales

In [None]:
sales.sort_values(ascending = True).index[0]

In [None]:
sales.idxmin()

In [None]:
sales.sort_values(ascending = False).index[0]

In [None]:
sales.idxmax()

# Manipulating Series

In [None]:
import pandas as pd

In [None]:
sales = pd.Series([10, 25, 6, 36, 2, 0, None, 5], index = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun', 'Mon'])

In [None]:
sales

In [None]:
sales['Sun'] = 0

In [None]:
sales

In [None]:
sales.iloc[3]  = 30

In [None]:
sales

In [None]:
#Element wise operation
(sales/1.1)

In [None]:
sales_EUR = (sales/1.1).round(2)

In [None]:
sales_EUR

In [None]:
sales = (sales/1.1).round(2)

In [None]:
sales

In [None]:
#It overwrites for both index Mon
sales['Mon'] = 0

In [None]:
sales

- If the index has duplicates, updating a value through label-based indexing overwrites every entry that carries that label.

In [None]:
titanic = pd.read_csv('/kaggle/input/titanic-csv/titanic.csv')

In [None]:
titanic.head()

In [None]:
age = titanic['age']

In [None]:
age.head()

In [None]:
age.tail()

In [None]:
age.iloc[1] = 30

In [None]:
age.head()

In [None]:
titanic.head()

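- The value is changed in the Series as well as in the original DataFrame, because the Series selected from the DataFrame shares its data. (With Copy-on-Write enabled, which is the default from pandas 3.0, only the Series changes and the DataFrame is left untouched.)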