# Titanic Study

The first thing we need to do is install `paspailleur` from its Git repository:

In [1]:
!pip install --quiet git+https://github.com/smartFCA/paspailleur.git

## Before the start: Download the data

Next, we load the dataset:

In [20]:
import pandas as pd

# Titanic passenger table hosted in the datasciencedojo/datasets repository.
# The first CSV column (PassengerId) is used as the row index.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv'
df_full = pd.read_csv(TITANIC_CSV_URL, index_col=0)

# Quick look at what was loaded: dimensions first, then the column labels
for summary in (df_full.shape, df_full.columns):
    print(summary)

df_full.head()

(891, 11)
Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Now we do some modifications to make the results look better and reorganize the table with only the needed columns:


In [21]:
def _relabel(column, labels):
    """Replace the raw codes found in ``column`` by human-readable labels.

    Parameters
    ----------
    column : pandas.Series
        Column whose values should be translated.
    labels : dict
        Mapping ``raw code -> label``.

    Returns
    -------
    pandas.Series
        A new column. Values without an entry in ``labels`` (NaN, or labels
        written by a previous run of this cell) are passed through unchanged,
        so executing the cell twice gives the same result as executing it once.
    """
    return column.map(lambda value: labels.get(value, value))


EMBARKED_PORTS = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
# One mapping serves both the 0/1 integers of `Survived` and the False/True
# booleans of the `notna()` masks: in Python False == 0 and True == 1.
YES_NO = {0: 'No', 1: 'Yes'}

# NOTE: `df_full` is updated in place (as before), because later cells read it.
# Spell out the port of embarkation instead of its one-letter code
df_full['Embarked'] = _relabel(df_full['Embarked'], EMBARKED_PORTS)
# Show survival as Yes/No instead of 1/0
df_full['Survived'] = _relabel(df_full['Survived'], YES_NO)
# New Yes/No columns telling whether the age / the cabin of a passenger is recorded
df_full['Known Age'] = _relabel(df_full['Age'].notna(), YES_NO)
df_full['Known Cabin'] = _relabel(df_full['Cabin'].notna(), YES_NO)
# Give the abbreviated columns (Pclass, SibSp, Parch) self-explanatory names
df_full = df_full.rename(columns={'Pclass': 'Passenger Class', 'SibSp': '# Siblings and Spouses', 'Parch': '# Parents and Children'})

columns_to_consider = [
    'Survived', 'Known Age', 'Known Cabin', 'Sex', 'Embarked',  # categorical columns
    'Passenger Class', 'Age', '# Siblings and Spouses', '# Parents and Children', 'Fare',  # numerical columns
    'Name',  # textual column
]
# `Ticket` and `Cabin` are left out since they are not needed for the study
df = df_full[columns_to_consider].copy()
print(df.shape)
df.head()

(891, 11)


Unnamed: 0_level_0,Survived,Known Age,Known Cabin,Sex,Embarked,Passenger Class,Age,# Siblings and Spouses,# Parents and Children,Fare,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,No,Yes,No,male,Southampton,3,22.0,1,0,7.25,"Braund, Mr. Owen Harris"
2,Yes,Yes,Yes,female,Cherbourg,1,38.0,1,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,Yes,Yes,No,female,Southampton,3,26.0,0,0,7.925,"Heikkinen, Miss. Laina"
4,Yes,Yes,Yes,female,Southampton,1,35.0,1,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
5,No,Yes,No,male,Southampton,3,35.0,0,0,8.05,"Allen, Mr. William Henry"


## Part One: Describe the data with Patterns

Now we should describe how we treat every column in the data.

- **CategorySetPattern** is designed for categorical data. That is, every object is described by a categorical value. Then a pattern would be a subset of categories that covers rows marked by ANY of the categories of the pattern;
- **IntervalPattern** treats numerical data. Any row (marked by either a number or an interval of numbers) either lies inside a given interval pattern or not;
- **NgramSetPattern** treats textual data. Every text is represented as an ngram (i.e. a sequence of words). The task here is to find subngrams that can often be found in the data;
- **CartesianPattern** combines independent *dimensions* in the tabular data. Every dimension represents a column in the data described by its own Pattern.

With this, we'll be able to initialize custom patterns and do simple comparisons between them.

In [22]:
from paspailleur.pattern_structures import built_in_patterns as bip

class SurvivedPattern(bip.CategorySetPattern):
    """Categorical pattern used for the 'Survived' column ('No' / 'Yes')."""
    # CategorySetPattern requires the definition of the Universe of categories,
    # that is the set of all possible categories that can be found in the data
    Universe = ('No', 'Yes')

class KnownAgePattern(bip.CategorySetPattern):
    """Categorical pattern used for the 'Known Age' column.

    The column holds 'Yes' when the passenger's age is present in the data
    and 'No' when it is missing.
    """
    Universe = ('No', 'Yes')

class KnownCabinPattern(bip.CategorySetPattern):
    """Categorical pattern used for the 'Known Cabin' column.

    The column holds 'Yes' when the passenger's cabin is present in the data
    and 'No' when it is missing.
    """
    Universe = ('No', 'Yes')

class SexPattern(bip.CategorySetPattern):
    """Categorical pattern used for the 'Sex' column ('female' / 'male')."""
    Universe = ('female', 'male')

class EmbarkedPattern(bip.CategorySetPattern):
    """Categorical pattern used for the 'Embarked' column.

    The categories are the full port names that replaced the original
    one-letter codes (S, C, Q) during preprocessing.
    """
    Universe = ('Southampton', 'Cherbourg', 'Queenstown')


class PassengerClassPattern(bip.IntervalPattern):
    """Interval pattern used for the 'Passenger Class' column."""
    # PassengerClass has only 3 unique values,
    # so we do not have to do any preprocessing
    BoundsUniverse = (1, 2, 3)

class AgePattern(bip.IntervalPattern):
    """Interval pattern used for the 'Age' column, with bounds every 20 years."""
    # Age column has many unique values.
    # We should discretise the data by providing `BoundsUniverse`,
    # otherwise the software (and the math behind it) would not be able
    # to distinguish stable and noisy patterns
    BoundsUniverse = (0, 20, 40, 60, 80)  # df_full['Age'].max()==80

class NSiblingsPattern(bip.IntervalPattern):
    """Interval pattern used for the '# Siblings and Spouses' column."""
    # Number of siblings column does not have too many unique values,
    # but some of them are too rare (which is understandable).
    # So we can group rare values together to make them look more stable and less noisy
    BoundsUniverse = (0, 1, 2, 8)

class NParentsPattern(bip.IntervalPattern):
    """Interval pattern used for the '# Parents and Children' column."""
    # Similar to NSiblings, we group rare NParents values together
    BoundsUniverse = (0, 1, 2, 6)

class FarePattern(bip.IntervalPattern):
    """Interval pattern used for the 'Fare' column (ticket price of a passenger)."""
    # Similar to AgePattern, there are too many unique values in the Fare column
    # (the ticket price of a passenger), so we provide coarse interval bounds
    BoundsUniverse = (0, 30, 100, 300, 515)

class NamePattern(bip.NgramSetPattern):
    """Ngram pattern used for the textual 'Name' column; no customisation is made."""
    # NamePattern does not need any specification.
    # However, for other NgramSetPatterns one might want to
    # specify `StopWords` parameter: then the ngrams that only contain StopWords
    # would be automatically removed from the analysis
    ...

class TitanicPattern(bip.CartesianPattern):
    """Description of one passenger: one pattern type per column of the table."""
    # CartesianPattern combines Patterns for each column in the data.
    # The keys below are the column names of `df`; each value is the Pattern
    # class that describes the corresponding column.
    DimensionTypes = {
        'Survived': SurvivedPattern,
        'Known Age': KnownAgePattern,
        'Known Cabin': KnownCabinPattern,
        'Sex': SexPattern,
        'Embarked': EmbarkedPattern,
        'Passenger Class': PassengerClassPattern,
        'Age': AgePattern,
        '# Siblings and Spouses': NSiblingsPattern,
        '# Parents and Children': NParentsPattern,
        'Fare': FarePattern,
        'Name': NamePattern
    }

For the moment, `paspailleur` cannot treat `None` and `NaN` values in the data. So we should fill them somehow.

In [23]:
# Number of missing values in every column, columns with the most gaps first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
Age,177
Embarked,2
Survived,0
Known Cabin,0
Known Age,0
Sex,0
Passenger Class,0
# Siblings and Spouses,0
# Parents and Children,0
Fare,0


We fill out the missing values in the Age column with the maximal (and also the least precise) Age interval: from 0 to 80 years old.  
For the Embarked column, we replace the missing values with the least precise pattern, saying that the passenger could have embarked in any of the known ports.

In [24]:
# Observed age range in the raw data (0.42 and 80 for this CSV file)
youngest, oldest = df_full['Age'].min(), df_full['Age'].max()
print("Minimum age:", youngest, "Maximum age:", oldest)

# Missing entries receive what `get_min_pattern()` returns for the column's
# pattern type (per the notebook text: the least precise pattern of that column)
for column, pattern_type in (('Age', AgePattern), ('Embarked', EmbarkedPattern)):
    df[column] = df[column].fillna(pattern_type.get_min_pattern())

Minimum age: 0.42 Maximum age: 80.0


Now, let us create a *context* dictionary, where the keys of the dictionary are objects (the names of rows in the data) and values are patterns of these objects.

For every object there should be just one pattern.

Now we create a `PatternStructure` that will let us analyse the context.

Every pattern in the pattern structure would be created by joining atomic patterns together. So a pattern would describe objects that are covered by *all* atomic patterns it consists of.

In [25]:
%%time
from paspailleur.pattern_structures.pattern_structure import PatternStructure

# Context: row label (PassengerId) -> {column name -> value}, one entry per passenger
context = df.to_dict(orient='index')

ps = PatternStructure(TitanicPattern)
# NOTE(review): min_atom_support=0.1 presumably discards atomic patterns covering
# less than 10% of the rows -- confirm against the paspailleur documentation
ps.fit(context, min_atom_support=0.1)

CPU times: user 2 s, sys: 26.6 ms, total: 2.02 s
Wall time: 2.09 s


Here we mine stable pattern concepts where every concept can be treated as an individual cluster.

There are two important parameters to the function `ps.mine_concepts`:  
* ``min_support``: the minimum number of objects covered by the concept.  
* ``min_delta_stability``: the minimum number of objects a concept loses when it is made more precise, i.e. every more precise concept covers at least this many fewer objects.


## Part Two: Mining patterns

### Mining concepts

In [26]:
%%time
import random
# Thresholds as explained in the notebook text:
#  * min_support: minimum number of objects covered by a concept
#  * min_delta_stability: all more precise concepts cover fewer objects
concepts = ps.mine_concepts(
    algorithm='gSofia',
    min_support=80,
    min_delta_stability=20,
    use_tqdm=True,
)

gSofia algorithm:   0%|          | 0/46 [00:00<?, ?it/s]

Compute intents:   0%|          | 0/608 [00:00<?, ?it/s]

CPU times: user 1.52 s, sys: 24.8 ms, total: 1.55 s
Wall time: 2.07 s


In [27]:
# Index of the concept shown as an example; the label below is derived from it
concept_idx = 50

# f-string: without the `f` prefix the placeholder was printed literally
print(f"# concepts: {len(concepts)}")
# Every concept is a pair (extent, intent): the covered objects and their common pattern
extent, intent = concepts[concept_idx]
print(f"Concept #{concept_idx}")
print(f"* objects in the concept: {list(extent)[:10]} (some of them)")
print(f"* pattern of the concept: {intent}")

# concepts: {len(concepts)}
Concept #50
* objects in the concept: [1, 2, 3, 4, 5, 7, 9, 10, 11, 12] (some of them)
* pattern of the concept: {'# Parents and Children': [0.0, 2.0], '# Siblings and Spouses': [0.0, 2.0], 'Age': [0.0, 60.0], 'Fare': [0.0, 100.0], 'Known Age': {'Yes'}, 'Passenger Class': [1.0, 3.0]}


### Mining implications

Now for the implications


In [28]:
%%time
# Same support / stability thresholds as for the concepts mined above.
# This call took about 3 minutes of wall time in the recorded run.
implications = ps.mine_implications(
    algorithm='gSofia',
    min_support=80,
    min_delta_stability=20,
    max_key_length=None,
    reduce_conclusions=True,
    use_tqdm=True,
)

gSofia algorithm:   0%|          | 0/46 [00:00<?, ?it/s]

Compute intents:   0%|          | 0/608 [00:00<?, ?it/s]

Mine premise candidates: 0it [00:00, ?it/s]

Construct implications:   0%|          | 0/638 [00:00<?, ?it/s]

CPU times: user 2min 53s, sys: 633 ms, total: 2min 54s
Wall time: 3min


In [29]:
print("\nMined Implications:")
for antecedent, consequent in implications.items():
    # Each implication takes three lines: premise, arrow, conclusion
    print('\n'.join(str(part) for part in (antecedent, '=>', consequent)))


Mined Implications:
{}
=>
{'# Parents and Children': [0.0, 6.0], '# Siblings and Spouses': [0.0, 8.0], 'Fare': [0.0, 515.0], 'Passenger Class': [1.0, 3.0]}
{'# Parents and Children': < 6.0, 'Fare': < 515.0}
=>
{'# Parents and Children': <= 2.0, 'Fare': <= 300.0}
{'# Parents and Children': < 6.0, '# Siblings and Spouses': < 8.0, 'Fare': < 515.0}
=>
{'# Siblings and Spouses': <= 2.0}
{'# Parents and Children': < 6.0, 'Fare': < 300.0}
=>
{'Fare': <= 100.0}
{'# Parents and Children': < 6.0, '# Siblings and Spouses': < 2.0, 'Fare': < 515.0}
=>
{'# Siblings and Spouses': <= 1.0}
{'# Parents and Children': < 2.0, '# Siblings and Spouses': < 8.0, 'Fare': < 515.0}
=>
{'# Parents and Children': <= 1.0}
{'# Parents and Children': < 6.0, 'Embarked': NOT({'Cherbourg'})}
=>
{'# Parents and Children': <= 2.0, 'Fare': <= 300.0}
{'# Parents and Children': < 6.0, '# Siblings and Spouses': < 8.0, 'Embarked': NOT({'Cherbourg'})}
=>
{'# Siblings and Spouses': <= 2.0}
{'# Parents and Children': < 6.0, 'Age

### Mining subgroups

Now we mine subgroups, taking the passengers who survived as the goal objects:

In [30]:
# Goal objects: row labels (PassengerId) of the passengers who survived
survived_mask = df['Survived'] == "Yes"
goal_objects = set(df.loc[survived_mask].index)

# NOTE(review): judging by its name, `iter_subgroups` returns a lazy iterator,
# so `subG` can probably be consumed only once -- confirm in the paspailleur docs
subG = ps.iter_subgroups(
    goal_objects=goal_objects,
    quality_measure='Precision',
    quality_threshold=0.5,
    return_objects_as_bitarrays=False,
)

In [31]:
print("\nSubgroups for Survived Passengers:")
for description, members, score in subG:
    # Only the first five covered passengers are shown to keep the output short
    preview = list(members)[:5]
    print(f"Pattern: {description}, Quality: {score}, Objects: {preview}...")


Subgroups for Survived Passengers:
Pattern: {'Passenger Class': <= 2.0}, Quality: 0.5575, Objects: [2, 4, 7, 10, 12]...
Pattern: {'Survived': {'Yes'}}, Quality: 1.0, Objects: [2, 3, 4, 9, 10]...
Pattern: {'Sex': {'female'}}, Quality: 0.7420382165605095, Objects: [2, 3, 4, 9, 10]...
Pattern: {'# Siblings and Spouses': [1.0, 2.0]}, Quality: 0.5274261603375527, Objects: [1, 2, 514, 4, 519]...
Pattern: {'# Parents and Children': <= 1.0, '# Siblings and Spouses': >= 1.0, 'Embarked': NOT({'Queenstown'})}, Quality: 0.5073891625615764, Objects: [1, 2, 514, 4, 519]...
Pattern: {'# Parents and Children': <= 2.0, '# Siblings and Spouses': >= 1.0, 'Age': >= 20.0, 'Embarked': NOT({'Queenstown'})}, Quality: 0.503448275862069, Objects: [1, 2, 514, 4, 519]...
Pattern: {'Embarked': NOT({'Southampton'})}, Quality: 0.5020408163265306, Objects: [2, 514, 6, 518, 10]...
Pattern: {'Fare': >= 30.0}, Quality: 0.5875, Objects: [2, 514, 4, 516, 7]...
Pattern: {'# Parents and Children': >= 1.0}, Quality: 0.51173