## Pandas Crash Course (Part 2)

### First Steps with Pandas Series

In [1]:
import pandas as pd

In [2]:
# Read titanic.csv (path is relative to the notebook's working directory)
# into a DataFrame that the rest of this section works on.
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic

In [None]:
titanic.info()

In [3]:
titanic["age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [4]:
type(titanic["age"])

pandas.core.series.Series

In [5]:
# Bracket access and attribute access return the same column; equals() also
# treats NaN values at the same positions as equal, so this evaluates to True.
titanic["age"].equals(titanic.age)

True

In [6]:
age = titanic["age"]

In [7]:
age.head(2)

0    22.0
1    38.0
Name: age, dtype: float64

In [8]:
age.tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, dtype: float64

In [9]:
age.dtype

dtype('float64')

In [10]:
age.shape

(891,)

In [11]:
len(age)

891

In [12]:
age.index

RangeIndex(start=0, stop=891, step=1)

In [13]:
# NOTE(review): left disabled — presumably because Series.info() is missing in
# older pandas releases (it appears to have been added in 1.4; confirm). The
# next cell obtains the same summary via to_frame().info().
#age.info()

In [14]:
# Wrap the Series in a one-column DataFrame so DataFrame.info() can report
# the non-null count, dtype and memory usage of the column.
age.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     714 non-null    float64
dtypes: float64(1)
memory usage: 7.1 KB


### Analyzing Numerical Series

In [15]:
age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [16]:
# Summary statistics of the numeric Series; missing values are excluded, so
# "count" is the number of non-null entries rather than the Series length.
age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [17]:
age.count()

714

In [18]:
age.size

891

In [19]:
len(age)

891

In [None]:
# With skipna=False missing values are not ignored, so a single NaN in the
# Series makes the whole sum NaN.
age.sum(skipna=False)

In [None]:
# Python's built-in sum() simply iterates over the values and does not skip
# NaN, so the result is nan as long as the Series contains missing values.
sum(age)

In [None]:
age.mean()

In [None]:
age.median()

In [None]:
age.std()

In [None]:
age.min()

In [None]:
age.max()

In [20]:
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [None]:
len(age.unique())

In [21]:
# Number of distinct values, with NaN counted as a value of its own
# (dropna=True, the default, would leave it out).
age.nunique(dropna=False)

89

In [None]:
age.value_counts()

In [None]:
age.value_counts(sort = True)

In [None]:
age.value_counts(sort = False)

In [None]:
age.value_counts(dropna = True)

In [None]:
age.value_counts(dropna = False)

In [None]:
age.value_counts(ascending = False)

In [None]:
age.value_counts(ascending = True)

In [None]:
# Every option written out at its default setting: order by count, ignore
# NaN, largest count first, absolute counts.
age.value_counts(
    sort=True,
    dropna=True,
    ascending=False,
    normalize=False,
)

In [None]:
# normalize=True returns relative frequencies instead of counts; with
# dropna=True the shares refer to the non-missing values only.
age.value_counts(
    sort=True,
    dropna=True,
    ascending=False,
    normalize=True,
)

In [None]:
# Share of the most frequent age among the non-missing ages. The count is
# taken from value_counts() (sorted, largest first) instead of being typed in
# by hand, so the check stays valid if the data changes. It should equal the
# first value of value_counts(normalize=True).
age.value_counts().iloc[0] / age.count()

In [None]:
# dropna=False keeps NaN as its own category, so the relative frequencies
# now refer to all rows, missing ones included.
age.value_counts(
    sort=True,
    dropna=False,
    ascending=False,
    normalize=True,
)

In [None]:
# Share of the most frequent (non-missing) age among ALL rows: the
# denominator is age.size, which includes missing values, rather than
# age.count(). The count comes from value_counts() instead of a hand-typed
# number; it should equal that age's entry in
# value_counts(dropna=False, normalize=True).
age.value_counts().iloc[0] / age.size

In [None]:
# bins=5 groups the ages into five intervals and counts the values per
# interval instead of counting each distinct age.
age.value_counts(
    sort=True,
    dropna=True,
    ascending=False,
    normalize=False,
    bins=5,
)

In [None]:
# Ten intervals this time, reported as relative frequencies rather than
# absolute counts.
age.value_counts(
    sort=True,
    dropna=True,
    ascending=False,
    normalize=True,
    bins=10,
)

### Analyzing Non-Numerical Series

In [None]:
import pandas as pd

In [22]:
summer = pd.read_csv("summer.csv")

In [23]:
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [None]:
summer.info()

In [None]:
athlete = summer["Athlete"]

In [None]:
athlete.head()

In [None]:
athlete.tail(5)

In [None]:
type(athlete)

In [None]:
athlete.dtype

In [None]:
athlete.shape

In [None]:
athlete.describe()

In [None]:
athlete.size

In [None]:
athlete.count()

In [None]:
athlete.min()

In [None]:
athlete.unique()

In [None]:
len(athlete.unique())

In [None]:
# Number of distinct entries, counting a missing value (if any) as its own
# category.
athlete.nunique(dropna=False)

In [None]:
athlete.value_counts()

In [None]:
athlete.value_counts(sort = True, ascending=True)

In [None]:
# Relative frequency of each entry, most frequent first; head() keeps only
# the first five rows of that ranking.
athlete_shares = athlete.value_counts(
    sort=True,
    ascending=False,
    normalize=True,
)
athlete_shares.head()

### Sorting and Introduction to the `inplace` Parameter

In [None]:
import pandas as pd

In [24]:
# Integer labels in unsorted order mapped to values; label 7 maps to None.
dic = {
    1: 10,
    3: 25,
    2: 6,
    4: 36,
    5: 2,
    6: 0,
    7: None,
}
dic

{1: 10, 3: 25, 2: 6, 4: 36, 5: 2, 6: 0, 7: None}

In [25]:
# Dict keys become the index (in insertion order) and the values the data.
# The None entry is stored as NaN, which is why the dtype is float64.
sales = pd.Series(dic)
sales

1    10.0
3    25.0
2     6.0
4    36.0
5     2.0
6     0.0
7     NaN
dtype: float64

In [None]:
sales.sort_index()

In [None]:
# inplace=True reorders `sales` itself by index label and returns None, so
# this cell displays nothing; the change is visible when `sales` is shown.
sales.sort_index(ascending=True, inplace=True)

In [None]:
sales

In [None]:
# inplace=False (the default) returns a new Series sorted by value in
# ascending order; `sales` itself is left unchanged.
sales.sort_values(inplace=False)

In [None]:
sales

In [None]:
# Sort by value, largest first, with NaN placed at the end. Because of
# inplace=True, `sales` is modified directly and the call returns None.
sales.sort_values(
    ascending=False,
    na_position="last",
    inplace=True,
)

In [None]:
sales

In [None]:
# Weekday labels (Mon to Fri, in calendar order) mapped to integer values.
dic = dict(Mon=10, Tue=25, Wed=6, Thu=36, Fri=2)
dic

In [None]:
sales = pd.Series(dic)

In [None]:
sales

In [None]:
# The index labels are strings here, so they are ordered alphabetically
# (descending), not by weekday. Returns a new Series; `sales` is unchanged.
sales.sort_index(ascending=False)