In [1]:
import pandas as pd
import numpy as np
import matplotlib as mtplt
import seaborn as sb

## Load all the required datasets

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
gender = pd.read_csv('gender_submission.csv')

In [5]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


---------------------------------------------------------------
## This appears to be passanger data
### - Here Qualitative data appears to be - Name, Sex, Ticket, Cabin, Embarked and PassengerID(as it provides ID) 
###   along with Pclass(as it is ordinal data used to classify the passangers).
### - And Age, SibSp, Parch, and Fare appear to be Quantitative data.
---------------------------------------------------------------

## Load the required libraries for data manipulation/ cleaning

In [6]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
encoder = LabelEncoder()
scaler = StandardScaler()
imputer = SimpleImputer(strategy='median')
def processData(dataset):
    dataset['Sex'] = encoder.fit_transform(dataset['Sex'])
    dataset[['Age','Fare']] = imputer.fit_transform(dataset[['Age','Fare']])
    dataset['Embarked'] = encoder.fit_transform(dataset['Embarked'])
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    return dataset

In [7]:
processData(train)
processData(test)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,2,2
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,1,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,2,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,27.0,0,0,A.5. 3236,8.0500,,2,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,2,1
416,1308,3,"Ware, Mr. Frederick",1,27.0,0,0,359309,8.0500,,2,1


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int64  
 12  FamilySize   891 non-null    int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 90.6+ KB


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    int64  
 11  FamilySize   418 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 39.3+ KB


### Specify the features and target for the data

In [10]:
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']
target = 'Survived'

### Load the Logistic Regression model from sklearn library

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(train[features], train[target], test_size=0.2, random_state=42)

In [13]:
model = LogisticRegression()

In [14]:
model.fit(train_X, train_Y)

In [15]:
prediction = model.predict(test_X)

### Calculate the accuracy value for the test data

In [16]:
accuracy = accuracy_score(test_Y, prediction)

In [27]:
print(f"The accuracy score is: {accuracy * 100:.4f}%")

The accuracy score is: 80.4469%


# The accuracy score is: 80.4469%

### Find the cross validation score to see if the train test split gives us close to accurate score

In [28]:
cv_scores = cross_val_score(model, train_X, train_Y, cv=5)
print("Cross-Validation Score: ",np.mean(cv_scores)*100, "%")

Cross-Validation Score:  79.34797596769428 %


# The Cross-Validation Score:  79.34797596769428 %

--------------------------------------------------------------------------------

## It is to be noted that the Accuracy Score for Logistic Regression is 80.44(~80)%

## And Cross Validation Score is 79.34(~79)%
