In [13]:
import os
os.environ['HADOOP_HOME'] = r'C:\hadoop'
os.environ['PATH'] += r';C:\hadoop\bin'

In [14]:
# Starting the Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Titanic').getOrCreate()

# Reading the data
df = spark.read.csv('Titanic.csv',inferSchema=True, header=True)

# Showing the data
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [15]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [16]:
# Selecting the columns which are required 
# to train and test the model.
rm_columns = df.select(['Survived','Pclass',
                       'Sex','Age','SibSp',
                       'Parch','Fare','Embarked'])

# Drops the data having null values
result = rm_columns.na.drop()

# Again showing the data
result.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

In [17]:
final_data  = result

### Convert String Columns to Ordinal Columns

In [None]:
# Importing the required libraries
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

# Converting the Sex Column
sexIdx = StringIndexer(inputCol='Sex',
                               outputCol='SexIndex')
sexEncode = OneHotEncoder(inputCol='SexIndex',
                               outputCol='SexVec')

# Converting the Embarked Column
embarkIdx = StringIndexer(inputCol='Embarked',
                               outputCol='EmbarkIndex')
embarkEncode = OneHotEncoder(inputCol='EmbarkIndex',
                               outputCol='EmbarkVec')

# Vectorizing the data into a new column "features" 
# which will be our input/features class
assembler = VectorAssembler(inputCols=['Pclass',
                                       'SexVec','Age',
                                       'SibSp','Parch',
                                       'Fare','EmbarkVec'],
                                    outputCol='features')

In [19]:
# Importing Pipeline and Model
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression(featuresCol='features',
                             labelCol='Survived')

# Creating the pipeline
pipe = Pipeline(stages=[sexIdx, embarkIdx,
                            sexEncode, embarkEncode,
                            assembler, log_reg])

In [20]:
# Splitting the data into train and test
train_data, test_data = final_data.randomSplit([0.7, .3])

# Fitting the model on training data
fit_model = pipe.fit(train_data)

# Storing the results on test data
results = fit_model.transform(test_data)

# Showing the results
results.show()

+--------+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|50.0|    0|    0| 28.7125|       C|     1.0|        1.0|    (1,[],[])|(2,[1],[1.0])|(8,[0,2,5,7],[1.0...|[-2.7623545463851...|[0.05939269222948...|       1.0|
|       0|     1|  male|22.0|    0|    0|135.6333|       C|     0.0|        1.0|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,22.0,0.0...|[-1.2697359816638...|[0.21930245102316...|       1.0|
|       0|     1|  male|24.0|    0|    0|    79.2|       C|     0.0|        1.0|(1,[0

In [21]:
# Importing the evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Calling the evaluator
res = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

# Evaluating the AUC on results
ROC_AUC = res.evaluate(results)

In [22]:
print(ROC_AUC)

0.7772551433093019


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

categorical = ['Sex','Embarked']
numeric = ['Pclass','Age','SibSp','Parch','Fare']

pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough', numeric)
    ]
)

model = Pipeline([
    ('pre', pre),
    ('clf', LogisticRegression())
])


In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# Load Data
df = pd.read_csv("Titanic.csv")

# Keep required columns
df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

# Drop rows with NA
df = df.dropna()

# Split features and labels
X = df.drop("Survived", axis=1)
y = df["Survived"]

categorical = ['Sex','Embarked']
numeric = ['Pclass','Age','SibSp','Parch','Fare']

pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', 'passthrough', numeric)
    ]
)

model = Pipeline([
    ('pre', pre),
    ('clf', LogisticRegression())
])

# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fit Model
model.fit(X_train, y_train)

# Accuracy
score = model.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.8037383177570093


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
joblib.dump(model, "titanic_model.pkl")

['titanic_model.pkl']

In [50]:
model = joblib.load("titanic_model.pkl")

In [51]:
sample = pd.DataFrame([{
    "Pclass": 1,
    "Sex": "female",
    "Age": 50,
    "SibSp": 8,
    "Parch": 0,
    "Fare": 27.25,
    "Embarked": "C"
}])

print(model.predict(sample))

[1]


In [53]:
print(model)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Sex', 'Embarked']),
                                                 ('num', 'passthrough',
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare'])])),
                ('clf', LogisticRegression())])


In [54]:
pip install scikit-learn==1.4.2
pip install numpy==1.26.4
pip install pandas==2.2.2
pip install joblib
pip install streamlit

SyntaxError: invalid syntax (1773898957.py, line 1)