In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Data

In [None]:
# Read the training and test CSV files into DataFrames, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

# Exploratory Data Analysis

**Raw Data:**

In [None]:
# Show the first five rows of the training frame as the cell output.
train_data.head(n=5)

**Male vs Female Survivors:**

In [None]:
# Survival flags for each sex, selected with a single label-based lookup.
male = train_data.loc[train_data['Sex'] == 'male', 'Survived']
female = train_data.loc[train_data['Sex'] == 'female', 'Survived']

# Survivors divided by group size, expressed as a percentage.
percent_male = (male.sum() / male.size) * 100
percent_female = (female.sum() / female.size) * 100

sex_labels = ['Male', 'Female']
sex_data = np.array([percent_male, percent_female])

# Bar chart comparing the two survival percentages.
plt.bar(sex_labels, sex_data)
plt.title('Percent of Survivors By Sex')
plt.xlabel('Sex')
plt.ylabel('Percent Survivors')
plt.show()

# Text summary of the same numbers, rounded to two decimals.
for pct, group in ((percent_male, 'males'), (percent_female, 'females')):
    print('%' + str(round(pct, 2)) + ' of ' + group + ' survived.')

**Ages of Survivors:**

In [None]:
# Survival flags split into three non-overlapping age bands.
# Rows with a missing Age fail every comparison below, so they fall in no band.
twenty_or_less = train_data.loc[train_data.Age <= 20, 'Survived']
# Middle band: strictly older than 20 and at most 50. The previous mask used
# `Age < 20`, which selected passengers under 20 instead of the 21-50 band.
fifty_or_less = train_data.loc[(train_data.Age > 20) & (train_data.Age <= 50), 'Survived']
older_than_fifty = train_data.loc[train_data.Age > 50, 'Survived']

# Fraction of survivors in each band (sum of Survived over band size).
age_data = [(sum(twenty_or_less)/len(twenty_or_less)), (sum(fifty_or_less)/len(fifty_or_less)), 
           (sum(older_than_fifty)/len(older_than_fifty))]

# Convert the fractions to percentages for plotting and printing.
age_data = np.array(age_data) * 100
age_labels = ['0-20', '21-50', '50+']

plt.bar(age_labels, age_data)
plt.xlabel('Ages')
plt.ylabel('Percent Survivors')
plt.title('Percent of Survivors By Age Group')
plt.show()

print('%' + str(round(age_data[0], 2)) + ' of ages 20 or younger survived.')
print('%' + str(round(age_data[1], 2)) + ' of ages younger than 50 and older than 20 survived.')
print('%' + str(round(age_data[2], 2)) + ' of ages over 50 survived.')

****

**Class of Survivors**

In [None]:
# Survival flags keyed by passenger class (1, 2, 3).
survived_by_class = {
    pclass: train_data.loc[train_data['Pclass'] == pclass, 'Survived']
    for pclass in (1, 2, 3)
}
first_class = survived_by_class[1]
second_class = survived_by_class[2]
third_class = survived_by_class[3]

# Survivors divided by group size for each class, scaled to percentages.
class_data = np.array(
    [sum(group) / len(group) for group in (first_class, second_class, third_class)]
) * 100
class_labels = ['1st Class', '2nd Class', '3rd Class']

# Bar chart of the survival percentage per class.
plt.bar(class_labels, class_data)
plt.title('Percent of Survivors By Passenger Class')
plt.xlabel('Class')
plt.ylabel('Percent Survivors')
plt.show()

# Text summary of the same numbers, rounded to two decimals.
for rate, ordinal in zip(class_data, ('1st', '2nd', '3rd')):
    print('%' + str(round(rate, 2)) + ' of ' + ordinal + ' class survived.')

# Building the Models
(Using Age, Sex, Class, and PassengerId)

**Finding and filling null values:**

In [None]:
# Per-column count of missing values in each frame.
train_nulls = train_data.isna().sum()
test_nulls = test_data.isna().sum()

# One print call: each header is followed by its counts, with a blank line
# separating the training report from the testing report.
print('NaN training data:\n', train_nulls, '', 'NaN testing data:\n', test_nulls, sep='\n')

In [None]:
# Impute missing ages with the mean age of the same frame.
# Only the Age column is filled: a frame-wide fillna would write the mean age
# into every other column that has gaps, silently corrupting those columns.
# Assigning the filled column back is safe to re-run (a second run finds no gaps).
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

# Re-check the null counts; columns other than Age keep their original gaps.
print('NaN training data:\n')
print(train_data.isnull().sum())
print('NaN testing data:\n')
print(test_data.isnull().sum())

**Splitting data into testing/training and creating models:**

In [None]:
# Target vector and feature columns.
# NOTE(review): PassengerId is a row identifier and is kept as a feature here only
# because the submission cell reuses `targets`; consider dropping it from the model inputs.
y = train_data['Survived']
targets = ['Pclass', 'Sex', 'Age', 'PassengerId']

# One-hot encode the categorical column(s) of the training features.
X = pd.get_dummies(train_data[targets])
# Build the held-out matrix from the TEST frame (it was previously a copy of the
# training features) and align it to the training columns so both frames expose
# the same features in the same order even if a category is absent from one of them.
X_test = pd.get_dummies(test_data[targets]).reindex(columns=X.columns, fill_value=0)

# Fixed seeds make the stochastic estimators reproducible across kernel restarts.
KNN_model = KNeighborsClassifier()
SGD_model = SGDClassifier(random_state=42)
RFC_model = RandomForestClassifier(random_state=42)

# Fit each model on the full training set and predict on the test features.
KNN_model.fit(X, y)
KNN_predict = KNN_model.predict(X_test)

SGD_model.fit(X, y)
SGD_predict = SGD_model.predict(X_test)

RFC_model.fit(X, y)
RFC_predict = RFC_model.predict(X_test)

# Evaluating the Models

In [None]:
# 10-fold cross-validation scores for each estimator, evaluated in KNN, SGD, RFC order.
cv1, cv2, cv3 = (
    cross_val_score(estimator, X, y, cv=10)
    for estimator in (KNN_model, SGD_model, RFC_model)
)

# Mean score across the folds for each model.
KNN_model_score, SGD_model_score, RFC_model_score = (
    fold_scores.mean() for fold_scores in (cv1, cv2, cv3)
)

# Report the averaged scores.
for model_label, mean_score in (
    ('KNN', KNN_model_score),
    ('SGD', SGD_model_score),
    ('RFC', RFC_model_score),
):
    print(model_label + ' model score is ', mean_score)

# Submission:
(RFC is the best model after evaluation)

In [None]:
# Encode the test features into a new variable instead of overwriting `test_data`,
# so this cell can be re-run without failing on the already-encoded frame.
X_submission = pd.get_dummies(test_data[targets])

# Predict with the random forest fitted earlier and pair each prediction with its passenger id.
prediction = RFC_model.predict(X_submission)
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': prediction})
output.to_csv('submission.csv', index=False)

print("Submission generated.")
# Read the file back to confirm what was written.
pd.read_csv('submission.csv').head()