In [16]:
#importing the necessary libraries to be used
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [17]:
#importing the dataset by the help of pnadas library
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.info())
print("_________------------------__________________")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
_________------------------__________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         41

In [18]:
"""
as we can clearly see from the info that there are missing values in the train data in the columns "age"
, "cabin" and "embarked"
similarly in the test data there are missing values in the columns "age", "fare" and "cabin"
"""
print(train.head(10))
print("------------------------------____________________________________-------------------------------")
print(test.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

In [19]:
"""
as we need to process both the train and test set so we would merge them and process them together
first of all we will take care of the missing values and the we will convert the data into their 
required representation
"""
combined = [train, test]
for data in combined:
    
    #filling the missing values of tha age column with the median of age
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Age'] = data['Age'].astype(int)
    
    #filling the missing value of fair in the test data
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    
    #filling the value in embarked with the most occuring one that is "S"
    data['Embarked'] = data['Embarked'].fillna('S')
    data['Embarked'] = data['Embarked'].map({'S':0, 'C':1, 'Q':2}).astype(int)
    
    #chaning the values in sex column into numeric values
    data['Sex'] = data['Sex'].map({'female':0, 'male':1}).astype(int)
    
    #adding a new feature to the data by combining "parch" and "sibsp" and adding 1 as the person itself
    data['family'] = data['Parch'] + data['SibSp'] + 1
    

In [20]:
#rechecking the data
print(train.head(10))
print("-------------------____________________--------------------")
print(test.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex  Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1   22      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0   38      1      0   
2                             Heikkinen, Miss. Laina    0   26      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0   35      1      0   
4                           Allen, Mr. William Henry    1   35      0      0   
5                                   Moran, Mr. James    1   28      0      0   
6                            McCarthy, Mr. 

In [21]:
print(train.info())
print("-------------------------_______________________---------------------------")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int32
Age            891 non-null int32
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null int32
family         891 non-null int64
dtypes: float64(1), int32(3), int64(6), object(3)
memory usage: 80.1+ KB
None
-------------------------_______________________---------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int32
Age            418 non-null int32
SibSp          418

In [32]:
"""
creating the target variable and the train features from the train data
creating the test features from the test data
"""
target = train['Survived']
train_features = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'family']]
test_features = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'family']]

In [36]:
#visualising the created variables
print(train_features.head(10))
print("________________________------------------------_____________________________________")
print(train_features.head(10))
print("________________________------------------------_____________________________________")
print(train_features.info())

   Pclass  Sex  Age  SibSp  Parch     Fare  Embarked  family
0       3    1   22      1      0   7.2500         0       2
1       1    0   38      1      0  71.2833         1       2
2       3    0   26      0      0   7.9250         0       1
3       1    0   35      1      0  53.1000         0       2
4       3    1   35      0      0   8.0500         0       1
5       3    1   28      0      0   8.4583         2       1
6       1    1   54      0      0  51.8625         0       1
7       3    1    2      3      1  21.0750         0       5
8       3    0   27      0      2  11.1333         0       3
9       2    0   14      1      0  30.0708         1       2
________________________------------------------_____________________________________
   Pclass  Sex  Age  SibSp  Parch     Fare  Embarked  family
0       3    1   22      1      0   7.2500         0       2
1       1    0   38      1      0  71.2833         1       2
2       3    0   26      0      0   7.9250         0       1

In [56]:
#fitting the model to our data
classifier =RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1)
model = classifier.fit(train_features, target)

In [57]:
#making the prediction through help of our created model and testing it on test data
prediction = model.predict(test_features)

In [58]:
submission = pd.DataFrame({'PassengerId':data['PassengerId'], "Survived":prediction})
submission.to_csv('submission.csv', index = False)

In [59]:
submission = pd.read_csv('submission.csv')
print(submission.head(10))

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
5          897         0
6          898         0
7          899         0
8          900         1
9          901         0
