## Importing modules and functions 

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score  
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

## Loading the dataset

In [2]:
# Location of the Kaggle Titanic training file.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of train.csv before running the notebook elsewhere.
TRAIN_CSV_PATH = 'C:\\Users\\sabhila1\\Documents\\Titanic ML Problem\\all\\train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk, `df`,
# is never modified by the cleaning steps below.
X = df.copy(deep=True)

## Number of Null values present in each column

In [4]:
# Per-column count of missing values (isnull is the pandas alias of isna).
X.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Number of unique values in each column

In [5]:
# Number of distinct (non-null) values found in each column.
X.nunique(axis=0)

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

## Dropping columns that are not needed

In [6]:
# Remove identifier-like columns: Name and PassengerId are unique per row
# (891 distinct values each) and Ticket has 681 distinct values.
X = X.drop(columns=['Name', 'PassengerId', 'Ticket'])


## Filling null values in dataframe 

In [7]:
# Impute missing numeric values with the column mean. Per the null counts
# above, Age (177 missing) is the only numeric column affected.
#
# numeric_only=True is required because X still contains the string columns
# Sex, Cabin and Embarked. Older pandas silently skipped them when computing
# the mean, but pandas >= 2.0 raises TypeError instead. Passing the flag
# explicitly gives the same result on old and new versions.
#
# Missing Cabin / Embarked values are intentionally left as NaN here, exactly
# as before; they are not numeric and receive no mean.
X = X.fillna(X.mean(numeric_only=True))

In [8]:
# One-hot encode the remaining string columns; numeric columns pass through.
# NOTE(review): with the default dummy_na=False, rows whose Cabin/Embarked is
# still NaN should get zeros in every indicator of that column — confirm.
X_dummy = pd.get_dummies(data=X)

In [9]:
# Show the names of all columns produced by the one-hot encoding.
np.asarray(X_dummy.columns)

array(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Cabin_A10', 'Cabin_A14', 'Cabin_A16',
       'Cabin_A19', 'Cabin_A20', 'Cabin_A23', 'Cabin_A24', 'Cabin_A26',
       'Cabin_A31', 'Cabin_A32', 'Cabin_A34', 'Cabin_A36', 'Cabin_A5',
       'Cabin_A6', 'Cabin_A7', 'Cabin_B101', 'Cabin_B102', 'Cabin_B18',
       'Cabin_B19', 'Cabin_B20', 'Cabin_B22', 'Cabin_B28', 'Cabin_B3',
       'Cabin_B30', 'Cabin_B35', 'Cabin_B37', 'Cabin_B38', 'Cabin_B39',
       'Cabin_B4', 'Cabin_B41', 'Cabin_B42', 'Cabin_B49', 'Cabin_B5',
       'Cabin_B50', 'Cabin_B51 B53 B55', 'Cabin_B57 B59 B63 B66',
       'Cabin_B58 B60', 'Cabin_B69', 'Cabin_B71', 'Cabin_B73',
       'Cabin_B77', 'Cabin_B78', 'Cabin_B79', 'Cabin_B80',
       'Cabin_B82 B84', 'Cabin_B86', 'Cabin_B94', 'Cabin_B96 B98',
       'Cabin_C101', 'Cabin_C103', 'Cabin_C104', 'Cabin_C106',
       'Cabin_C110', 'Cabin_C111', 'Cabin_C118', 'Cabin_C123',
       'Cabin_C124', 'Cabin_C125', 'Cabin_C126', 'Cabin_C

In [10]:
# Pairwise correlation of every encoded column, written to the working
# directory for inspection outside the notebook.
correlation_matrix = X_dummy.corr()
correlation_matrix.to_csv('correlation.csv')

In [11]:
# Separate the target from the feature matrix: Y holds the Survived labels,
# and X_dummy keeps every encoded column except Survived.
Y = X.loc[:, 'Survived']
X_dummy = X_dummy.drop(columns=['Survived'])

## Applying the model 

In [12]:
# Logistic-regression classifier.
# The solver is pinned to 'liblinear', the one shown in the recorded repr of
# the fitted model. scikit-learn 0.22 changed the default solver to 'lbfgs',
# so relying on the default would train with a different algorithm on a
# current install and the recorded accuracies might no longer be reproduced.
# All other hyper-parameters are left at their defaults.
model = LogisticRegression(solver='liblinear')

In [13]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummy,
    Y,
    test_size=0.33,
    random_state=42,
)

In [14]:
# Fit the classifier on the training split; fit() returns the estimator, so
# its repr is displayed as the cell output.
model.fit(X=X_train, y=y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Accuracy on the training split, i.e. on the data the model was fitted to.
y_actual = np.asarray(y_train)
y_predict = model.predict(X_train)
accuracy = accuracy_score(y_true=y_actual, y_pred=y_predict)
accuracy

0.8070469798657718

In [16]:
# Accuracy on the held-out test split, rows the model did not see during fit.
y_actual = np.asarray(y_test)
y_predict = model.predict(X_test)
accuracy = accuracy_score(y_true=y_actual, y_pred=y_predict)
accuracy

0.8101694915254237

## Feature Selection

In [18]:
# Chi-squared statistic and p-value of every training feature against the
# Survived label, one entry per column of X_train.
# NOTE(review): the displayed statistics contain nan entries; presumably these
# belong to dummy columns that are all zero in X_train (e.g. a Cabin value
# that only occurs in the test split) — verify before ranking features.
ch_val, p_val = chi2(X=X_train, y=y_train)

In [21]:
# Display the chi-squared statistics returned by chi2(X_train, y_train).
# NOTE(review): presumably ordered like the columns of X_train — confirm
# before mapping scores back to feature names.
ch_val

array([1.67587941e+01, 1.08610206e+01, 4.55335381e+00, 4.87230976e+00,
       2.77736187e+03, 1.08467163e+02, 5.72929115e+01, 5.93582888e-01,
       5.93582888e-01, 1.68468468e+00, 5.93582888e-01,            nan,
       1.68468468e+00, 5.93582888e-01, 1.68468468e+00,            nan,
       5.93582888e-01,            nan, 5.93582888e-01,            nan,
                  nan,            nan, 1.68468468e+00, 5.93582888e-01,
       3.36936937e+00, 5.93582888e-01, 1.68468468e+00, 1.39133786e-01,
       1.68468468e+00, 1.68468468e+00,            nan, 3.36936937e+00,
       5.93582888e-01,            nan,            nan, 1.68468468e+00,
       1.68468468e+00, 1.68468468e+00, 1.68468468e+00, 3.36936937e+00,
       1.68468468e+00, 1.39133786e-01,            nan,            nan,
                  nan, 5.93582888e-01, 1.68468468e+00, 3.36936937e+00,
                  nan, 1.68468468e+00, 1.68468468e+00, 5.93582888e-01,
                  nan, 5.93582888e-01, 3.36936937e+00, 1.68468468e+00,
      