In [97]:
#Importing the pandas and numpy libraries
import pandas as pd
import numpy as np

In [98]:
# Load the training and test splits from their CSV files into pandas DataFrames.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [99]:
# Preview the first rows of the training data to get a general idea of its contents.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [100]:
# Summarise the training frame: each column's dtype and non-null count,
# which shows which features contain missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [101]:
# Collapse 'SibSp' and 'Parch' into a single 'family' feature on both frames,
# then remove the two source columns because 'family' is used in their place.
for frame in (train_df, test_df):
    frame['family'] = frame['SibSp'] + frame['Parch']
    # columns=... drops along the column axis; inplace=True mutates the frame itself.
    frame.drop(columns=['SibSp', 'Parch'], inplace=True)

In [102]:
# Impute missing 'Age' values with the mean age of the training data.
# The training mean is applied to BOTH frames so the test set is filled with a
# statistic learned from the training data only.
mean_age_train = train_df['Age'].mean()
# Computed for reference only; it is not used for imputation below.
mean_age_test = test_df['Age'].mean()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment through an in-place
# method, which acts on a temporary under pandas Copy-on-Write and would leave
# the frame unchanged.
train_df['Age'] = train_df['Age'].fillna(mean_age_train)
test_df['Age'] = test_df['Age'].fillna(mean_age_train)

In [103]:
# Encode 'Sex' numerically (male -> 1, female -> 0) on both frames so the
# column can be fed to the logistic regression model.
sex_codes = {'male': 1, 'female': 0}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
test_df['Sex'] = test_df['Sex'].map(sex_codes)

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [105]:
# Separate the target from the features: y is the 'Survived' column, X is the
# training frame without the target and the columns that are not used as features.
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin', 'Embarked']
y = train_df['Survived']
X = train_df.drop(columns=non_feature_cols)

In [106]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [107]:
# Check the training split: remaining feature columns, dtypes and non-null counts.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 66 to 623
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  623 non-null    int64  
 1   Sex     623 non-null    int64  
 2   Age     623 non-null    float64
 3   Fare    623 non-null    float64
 4   family  623 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 29.2 KB


In [108]:
# Logistic regression classifier; in scikit-learn, C is the inverse of the
# regularisation strength, and random_state is fixed for reproducibility.
logit = LogisticRegression(C=0.1,random_state=17)

In [109]:
%%time
# 5-fold cross-validation on the training split only, scored by ROC AUC.
cv_score = cross_val_score(logit,X_train,y_train,cv=5,scoring='roc_auc')

CPU times: total: 31.2 ms
Wall time: 91.6 ms


In [110]:
# Display the ROC AUC score of each of the 5 folds.
cv_score

array([0.83469722, 0.87272977, 0.84314805, 0.89269788, 0.82943144])

In [111]:
# Average ROC AUC across the folds.
cv_score.mean()

0.8545408716429455

In [112]:
# Fit the final model on the full labelled data (X, y), not just X_train.
# NOTE(review): X_test / y_test from the split are not used in the cells shown.
logit.fit(X,y)

In [113]:
# Build the test feature matrix by selecting exactly the columns the model was
# trained on, in the same order, instead of repeating a hand-written drop list
# that has to be kept in sync with the one used to build X. A training column
# missing from the test frame raises a KeyError here rather than surfacing
# later at predict time.
test_X = test_df[list(X.columns)]

In [114]:
# Impute missing 'Fare' values in the test features with the mean test fare.
# The mean is taken from the 'Fare' column only, so the variable holds what its
# name says, and only 'Fare' is filled so gaps in any other column are not
# silently replaced by that column's mean.
# NOTE(review): assumes 'Fare' is the only test feature still containing gaps
# at this point; the assertion below verifies that.
mean_fare_test = test_X['Fare'].mean()
Z = test_X.fillna({'Fare': mean_fare_test})
# The model cannot predict on missing values, so fail here with a clear message.
assert not Z.isna().any().any(), "unexpected missing values remain in test features"

In [115]:
# Predict the class label for every row of the imputed test features.
test_pred = logit.predict(Z)

In [116]:
# Assemble the submission frame: a 1-based running id per prediction plus the
# predicted label.
n_predictions = len(test_pred)
submission_df = pd.DataFrame(
    {
        'PassengerId': range(1, n_predictions + 1),
        'Survived': test_pred,
    }
)

In [117]:
# Use the real passenger ids from the test file rather than shifting a running
# counter by a hard-coded 891 (the number of training rows). Plain assignment
# is also safe to re-run, whereas the in-place `+= 891` added another 891 on
# every execution of the cell.
# to_numpy() assigns by position, so index alignment cannot reorder the ids.
submission_df['PassengerId'] = test_df['PassengerId'].to_numpy()

In [118]:
# Sanity-check the first rows of the submission before writing it out.
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [119]:
# Write the submission to disk; index=False keeps the row index out of the file.
submission_df.to_csv('submission.csv', index=False)