# Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A pandas Series is similar to a one-dimensional NumPy array; the difference
# is that its index can hold arbitrary labels (strings here), not only
# numeric positions.
my_series = pd.Series([1, 2, 3, 4], index=list('abcd'))
print(my_series)

a    1
b    2
c    3
d    4
dtype: int64


In [6]:
# A Series can also be built from a dictionary: the keys become the index
# labels and the values become the data, in insertion order.
my_dict = dict(x=1, y=2, z=3, a=4)
my_series2 = pd.Series(data=my_dict)
print(my_series2)

x    1
y    2
z    3
a    4
dtype: int64


In [7]:
# Look up a single value by its index label.
my_series.loc['a']

1

In [10]:
# Positional (0-based) access goes through .iloc. A bare integer inside
# my_series[...] on a Series with non-integer labels relies on a positional
# fallback that is deprecated in recent pandas (the integer is going to be
# treated as a label), so .iloc is the explicit, future-proof spelling.
my_series.iloc[0]

1

In [11]:
# Positions 1 up to (but not including) 4, i.e. labels 'b' through 'd'.
my_series.iloc[1:4]

b    2
c    3
d    4
dtype: int64

In [12]:
# Adding two Series whose labels all match: values are summed label by label.
my_series.add(my_series)

a    2
b    4
c    6
d    8
dtype: int64

In [13]:
# Adding Series with different labels: the result covers the union of both
# indexes. Only 'a' exists in both Series, so every other label becomes NaN
# and the result is float64.
my_series.add(my_series2)

a    5.0
b    NaN
c    NaN
d    NaN
x    NaN
y    NaN
z    NaN
dtype: float64

# Data Frames
A DataFrame is a 2D table with labeled columns, each of which can hold a different type of data — similar to a table in Excel or an SQL database.

For working with tabular data in Python.

A DataFrame can be created from a dict, a 2D NumPy array, or Series using the pd.DataFrame() function.

In [16]:
# Build a DataFrame from a dict whose values have different types: a list,
# a NumPy array, a tuple, a labelled Series and two scalars. The scalars are
# repeated on every row, and the Series index supplies the row labels.
my_dict = {
    "name": ["Joe", "Bob", "Frans"],
    "age": np.array([10, 15, 20]),
    "weight": (75, 123, 239),
    "height": pd.Series([4.5, 5, 6.1], index=["Joe", "Bob", "Frans"]),
    "siblings": 1,
    "gender": "M",
}

df = pd.DataFrame(data=my_dict)
df

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,1,M
Frans,Frans,20,239,6.1,1,M


In [26]:
# Same data as before, except "height" is a plain list. No value carries row
# labels, so the DataFrame gets the default integer index (0, 1, 2).
my_dict2 = {
    "name": ["Joe", "Bob", "Frans"],
    "age": np.array([10, 15, 20]),
    "weight": (75, 123, 239),
    "height": [4.5, 5, 6.1],
    "siblings": 1,
    "gender": "M",
}

df2 = pd.DataFrame(data=my_dict2)
df2

Unnamed: 0,name,age,weight,height,siblings,gender
0,Joe,10,75,4.5,1,M
1,Bob,15,123,5.0,1,M
2,Frans,20,239,6.1,1,M


In [27]:
# Custom row labels: pass them through the `index` argument. The labels are
# taken from my_dict2 itself (the dict that supplies the data) so this cell
# does not silently depend on the unrelated, reassigned my_dict variable.
df2 = pd.DataFrame(my_dict2, index=my_dict2['name'])
df2

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,1,M
Frans,Frans,20,239,6.1,1,M


In [21]:
# Select one column as a Series; attribute access is equivalent to df2['weight'].
df2.weight

Joe       75
Bob      123
Frans    239
Name: weight, dtype: int64

In [28]:
# Remove the 'age' column from the table.
df2 = df2.drop(columns=['age'])
df2

Unnamed: 0,name,weight,height,siblings,gender
Joe,Joe,75,4.5,1,M
Bob,Bob,123,5.0,1,M
Frans,Frans,239,6.1,1,M


In [34]:
# Add a column. The list must provide exactly one value per row
# (3 values here) so that it matches the table.
df2 = df2.assign(age=[10, 15, 20])
df2

Unnamed: 0,name,weight,height,siblings,gender,age
Joe,Joe,75,4.5,1,M,10
Bob,Bob,123,5.0,1,M,15
Frans,Frans,239,6.1,1,M,20


In [35]:
# Assigning a single scalar fills every row of the new column with it.
df2 = df2.assign(Married=False)
df2

Unnamed: 0,name,weight,height,siblings,gender,age,Married
Joe,Joe,75,4.5,1,M,10,False
Bob,Bob,123,5.0,1,M,15,False
Frans,Frans,239,6.1,1,M,20,False


In [37]:
# A Series used as a new column is aligned on the row labels: only 'Frans'
# receives a value, the other rows are left missing (NaN).
df2 = df2.assign(college=pd.Series(data=['UCF'], index=['Frans']))
df2

Unnamed: 0,name,weight,height,siblings,gender,age,Married,college
Joe,Joe,75,4.5,1,M,10,False,
Bob,Bob,123,5.0,1,M,15,False,
Frans,Frans,239,6.1,1,M,20,False,UCF


Selecting rows and columns by label with df.loc[row, column]

In [38]:
# One row label and all columns: returns the row labelled 'Joe' as a Series.
df2.loc['Joe', :]

name          Joe
weight         75
height        4.5
siblings        1
gender          M
age            10
Married     False
college       NaN
Name: Joe, dtype: object

In [39]:
# Row label plus column label: returns the single value in that cell.
df2.loc['Joe', 'age']

10

In [43]:
# Label slices include BOTH endpoints: rows 'Joe' through 'Frans' and
# columns 'age' through 'college' are all part of the result.
df2.loc['Joe':'Frans', 'age':'college']

Unnamed: 0,age,Married,college
Joe,10,False,
Bob,15,False,
Frans,20,False,UCF


In [44]:
# .iloc selects by 0-based integer position instead of by label.
# Row at position 0, all columns:
df2.iloc[0, :]

name          Joe
weight         75
height        4.5
siblings        1
gender          M
age            10
Married     False
college       NaN
Name: Joe, dtype: object

In [46]:
# Single value at row position 0, column position 5 (both 0-based);
# column 5 is 'age' in the current column order.
df2.iloc[0, 5]

10

In [48]:
# Positional slices exclude the end position: rows 0-2 and columns 5-7.
df2.iloc[:3, 5:8]

Unnamed: 0,age,Married,college
Joe,10,False,
Bob,15,False,
Frans,20,False,UCF


# Important Logical Indexing

In [49]:
# A list of booleans (one per row) keeps exactly the rows flagged True.
boolean_index = [False, True, True]
df2.loc[boolean_index]

Unnamed: 0,name,weight,height,siblings,gender,age,Married,college
Bob,Bob,123,5.0,1,M,15,False,
Frans,Frans,239,6.1,1,M,20,False,UCF


In [50]:
# Comparing a column with a value yields a boolean Series that can be used as
# a row filter. The one-step form df2[df2['age'] > 12] gives the same rows.
boolean_index = df2.age > 12
df2.loc[boolean_index]

Unnamed: 0,name,weight,height,siblings,gender,age,Married,college
Bob,Bob,123,5.0,1,M,15,False,
Frans,Frans,239,6.1,1,M,20,False,UCF


# Exploring DFs

In [20]:
# Load the dataset from 'train.csv' (a path relative to the current working
# directory) and confirm that read_csv returned a DataFrame.
df_train = pd.read_csv('train.csv')
type(df_train)

pandas.core.frame.DataFrame

In [12]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df_train.shape

(891, 12)

In [21]:
# Display the first 10 rows.
df_train.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
# Column-wise mean of the numeric columns only. np.mean(df_train, axis=0)
# delegates to DataFrame.mean without numeric_only, so it also tries to
# average the text columns (Name, Sex, Ticket, ...) and raises a TypeError
# on pandas >= 2.0. Restricting to numeric columns reproduces the intended
# per-column means.
df_train.mean(numeric_only=True)

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

In [16]:
# Structure of the DataFrame: index range, column names, non-null counts,
# dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [23]:
# Use the passenger names as row labels. drop=False keeps the 'Name' column
# in the table as well, so it still appears alongside the new index.
df_train = df_train.set_index('Name', drop=False)

In [24]:
# First 10 rows, now labelled by passenger name.
df_train.head(n=10)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"Braund, Mr. Owen Harris",1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
"Moran, Mr. James",6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [25]:
# The 'Name' column now duplicates the index, so remove it.
df_train = df_train.drop(columns=['Name'])

In [26]:
# First 10 rows after removing the redundant 'Name' column.
df_train.head(n=10)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.075,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,female,14.0,1,0,237736,30.0708,,C


In [27]:
# Show the first 10 row labels of the index.
print(df_train.index[:10])

Index(['Braund, Mr. Owen Harris',
       'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'Heikkinen, Miss. Laina',
       'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
       'Allen, Mr. William Henry', 'Moran, Mr. James',
       'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
       'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
       'Nasser, Mrs. Nicholas (Adele Achem)'],
      dtype='object', name='Name')
