In [166]:
import pandas as pd

get the titanic dataset

In [167]:
# Location of the raw Titanic passenger CSV (hosted on GitHub).
url = 'https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/refs/heads/master/titanic.csv'

# Load the CSV over HTTP into a DataFrame; this cell needs network access.
df = pd.read_csv(filepath_or_buffer=url)

display the titanic dataset

In [168]:
# Display the frame; the rendering below is truncated to the first and last rows.
df

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,0
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,0
1305,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,0
1306,3,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,0
1307,3,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,0


get the column names of titanic dataset

# UNDERSTANDING THE DATASET

In [169]:
# List the column labels of the loaded frame.
df.columns

Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
       'cabin', 'embarked', 'survived'],
      dtype='object')

get the information of each column

In [170]:
# Per-column summary: non-null count and dtype (shows which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   name      1309 non-null   object 
 2   sex       1309 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   ticket    1309 non-null   object 
 7   fare      1308 non-null   float64
 8   cabin     295 non-null    object 
 9   embarked  1307 non-null   object 
 10  survived  1309 non-null   int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


Observation: the columns age, fare, cabin and embarked have missing values.
All columns have a valid dtype (data type).

# DATA CLEANING

In [171]:
# Replacement values for the columns that contain missing entries,
# computed before any of the columns is modified:
#   age, fare -> column mean
#   embarked  -> most frequent value (mode); the constant 'unknown' is an alternative
age_mean = df['age'].mean()
fare_mean = df['fare'].mean()
embarked_mode = df['embarked'].mode().iloc[0]

In [172]:
# fillna() fills the missing values (None, NaN).
# Passing a {column: value} mapping fills each listed column with its own
# replacement value and leaves the other columns untouched.
df = df.fillna({
    'age': age_mean,
    'fare': fare_mean,
    'embarked': embarked_mode,
})

In [173]:
# Remove the cabin column: it is mostly empty (295 non-null of 1309 rows),
# so it is removed from the dataset rather than filled.
#-------------------------------------------------------------------
# The column is dropped by name instead of with dropna(axis=1), because
# dropna(axis=1) removes *every* column that still holds a missing value and
# would silently discard age/fare/embarked if the fill step had not run first.
# errors='ignore' keeps this cell safe to re-run once cabin is already gone.
#
# For reference, dropna() detects missing values and removes:
#   axis=0 -> the rows which are having missing values
#   axis=1 -> the columns which are having missing values

df = df.drop(columns=['cabin'], errors='ignore')

In [174]:
# Convert the fare column to an integer dtype.
#-----------------------------------------
# astype() converts dtypes: Series.astype(dtype) casts a single column, while
# DataFrame.astype({'col': dtype, 'col': dtype, ...}) casts several at once.
# Note: the float -> int cast drops the fractional part (211.3375 becomes 211).

df = df.assign(fare=df['fare'].astype(int))

# DATA TRANSFORMATION

In [175]:
# Preview the first two rows after the cleaning steps.
df.head(2)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151,S,1


In [176]:
# Split names of the form "Last, Title. First ..." into three Series:
#   last_name, courtesy_title, first_name
# (df['name'].str.split(',', n=1) alone returns lists; expand=True turns the
#  pieces into columns.)

temp = df['name'].str.split(',', n=1, expand=True)  # 0: last name, 1: " Title. First ..."
last_name = temp[0].str.strip()
t_fname = temp[1]

# Split the remainder once on the first '.' and reuse the result for both parts.
title_and_first = t_fname.str.split('.', n=1, expand=True)

# The text following ',' and '.' starts with a space, so strip each piece;
# otherwise the values are stored as ' Miss', ' Elisabeth Walton', ... and
# equality filters / groupbys on them would not match the expected labels.
courtesy_title = title_and_first[0].str.strip()
first_name = title_and_first[1].str.strip()

In [177]:
# Attach the three name parts as new columns (appended in this order).
df = df.assign(
    courtesy_title=courtesy_title,
    lname=last_name,
    fname=first_name,
)

In [178]:
# Preview the first two rows to check the new courtesy_title, lname and fname columns.
df.head(2)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,survived,courtesy_title,lname,fname
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211,S,1,Miss,Allen,Elisabeth Walton
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151,S,1,Master,Allison,Hudson Trevor


In [179]:
# Remove the original name column.
# drop() removes rows or columns by label:
#   index=[indices of rows]     -> removes rows
#   columns=[names of columns]  -> removes columns
df = df.drop(labels='name', axis='columns')

In [180]:
# Preview the first two rows to confirm the name column is gone.
df.head(2)

Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,embarked,survived,courtesy_title,lname,fname
0,1,female,29.0,0,0,24160,211,S,1,Miss,Allen,Elisabeth Walton
1,1,male,0.9167,1,2,113781,151,S,1,Master,Allison,Hudson Trevor


In [181]:
# family_count is the element-wise sum of the sibsp and parch columns.
df['family_count'] = df['sibsp'].add(df['parch'])

In [182]:
# Remove the sibsp and parch columns, then preview the first two rows.
df = df.drop(['sibsp', 'parch'], axis=1)
df.head(n=2)

Unnamed: 0,pclass,sex,age,ticket,fare,embarked,survived,courtesy_title,lname,fname,family_count
0,1,female,29.0,24160,211,S,1,Miss,Allen,Elisabeth Walton,0
1,1,male,0.9167,113781,151,S,1,Master,Allison,Hudson Trevor,3


In [183]:
# Rename the sex column to gender.
# rename() changes row or column labels through an {'old_name': 'new_name', ...} mapping:
#   index={...}    -> rows
#   columns={...}  -> columns
df = df.rename({'sex': 'gender'}, axis='columns')

In [184]:
# Confirm the column labels after the rename (sex -> gender).
df.columns

Index(['pclass', 'gender', 'age', 'ticket', 'fare', 'embarked', 'survived',
       'courtesy_title', 'lname', 'fname', 'family_count'],
      dtype='object')

In [185]:
# Distinct values present in the embarked column.
df['embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [186]:
# Expand the one-letter embarked codes into full names.
# replace() swaps old values for new values; the mapping is passed as a dict.
df['embarked'] = df['embarked'].replace(
    to_replace={
        'S': 'Southampton',
        'C': 'Cherbourg',
        'Q': 'Queenstown',
    }
)

In [187]:
# Preview the first two rows to check the expanded embarked values.
df.head(2)

Unnamed: 0,pclass,gender,age,ticket,fare,embarked,survived,courtesy_title,lname,fname,family_count
0,1,female,29.0,24160,211,Southampton,1,Miss,Allen,Elisabeth Walton,0
1,1,male,0.9167,113781,151,Southampton,1,Master,Allison,Hudson Trevor,3


In [None]:
# Derive an age_category column by binning the age column.
# With pd.cut's default right-closed intervals the bins are:
#   (0, 2] baby, (2, 16] child, (16, 25] teen, (25, 49] adult, (49, 100] senior
df['age_category'] = pd.cut(
    x=df['age'],
    bins=[0, 2, 16, 25, 49, 100],
    labels=['baby', 'child', 'teen', 'adult', 'senior'],
)

In [189]:
# Confirm that age_category was added to the column labels.
df.columns

Index(['pclass', 'gender', 'age', 'ticket', 'fare', 'embarked', 'survived',
       'courtesy_title', 'lname', 'fname', 'family_count', 'age_category'],
      dtype='object')

In [190]:
# Reorder the columns: name parts first, survived last.
new_order = [
    'courtesy_title', 'fname', 'lname',
    'gender', 'age', 'age_category',
    'ticket', 'pclass', 'embarked',
    'family_count', 'fare', 'survived',
]

df = df.loc[:, new_order]

In [191]:
# Preview the first two rows in the new column order.
df.head(2)

Unnamed: 0,courtesy_title,fname,lname,gender,age,age_category,ticket,pclass,embarked,family_count,fare,survived
0,Miss,Elisabeth Walton,Allen,female,29.0,adult,24160,1,Southampton,0,211,1
1,Master,Hudson Trevor,Allison,male,0.9167,baby,113781,1,Southampton,3,151,1


# ANALYSIS

# get the statistical description for all numerical columns

# get top 5 records (passengers data)

# get the last 5 records

# get the total_no_records

# get the total columns

# get the first row

# get the second row

# get the fname and age of all passengers

# get the 5th and 6th passenger fname ,age and survived details

# get the passenger details of pclass 1

# get the passenger fname and age of pclass 1

# get the male passenger details of pclass 1

# get the fname of the female passengers who survived in pclass 1

# Get the details of passengers whose embarked is Southampton or Cherbourg

# Get the no_of passengers
# Get the no_of male passengers

# Get the highest fare
# Get the lowest fare among female passengers


# Get the no_of_survivals
# Get the survival rate

# Get the aggregates for fare column

# Get the no_of_passengers in each gender

# Get the total fare in each pclass

# Get the pclass wise survival rate and count

# Get the pclass,gender wise survival rate

# Get the no. of passengers in each pclass

# Find the pclass in which highest no_of_passengers are there

# Find the pclass in which lowest no_of_passengers are there

# Find the no. of families travelling

# Find the no. of families travelling who survived

# find the correlation between the numerical columns

# What is correlation b/w fare and pclass

# What is the correlation b/w fare and survival

# What is correlation b/w age and survival

# Get the pclass wise male and female count and survival rate

# Get the age_category wise no. of passengers and survival rate

# Get the age_category which has the highest survival rate

# Get the gender and age_category which have the highest survival rate

# Get the pclass, age_category, gender in which survival rate is highest

# Get the solo travelers full name

# solo travelers survival rate v/s survival rate of passengers with family

# Find the gender and pclass wise survival rate of solo travelers