# My first try at Data analytics

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

## Loading dependencies

In [None]:
# Libraries for data preparation, mathematics and os related tasks
import numpy as np
import os
import pandas as pd

# Libraries for visualization
import seaborn as sns
import matplotlib.pyplot as plt

# For use in Jupyter notebook visualizations
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever spelling this installation
# provides. If neither is available, the current default style is left in place.
for _style_name in ('seaborn-v0_8-notebook', 'seaborn-notebook'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Loading datasets and basic overview

In [None]:
# Important disclaimer:

# The following code is almost entirely based on and/or copied from work by Manav Sehgal and PandaBrenda published on Kaggle.
# I have made only minor tweaks to suit the code to my purposes.
# All credit goes to them. Check out their work here:
# Manav Sehgal: https://www.kaggle.com/startupsci/titanic-data-science-solutions
# PandaBrenda: https://www.kaggle.com/brendan45774

# Current version: 1.00
# Accuracy: 77%
# Model: Linear Regression

# There will be a lot of bugs - all suggestions are welcome!

In [None]:
# Directory that holds the Titanic competition files on Kaggle
dataset_path = '/kaggle/input/titanic/'

# File names of the train and test datasets inside dataset_path
dataset_file = 'train.csv'
dataset_test_file = 'test.csv'

# The train and test datasets are loaded below and kept together in one list (`combine`),
# so that the same operations can be performed on both of them, preventing mismatches.

# Function for loading the dataset
def load_dataset(dataset_path, dataset_file):
    """Read a CSV file into a pandas DataFrame.

    Args:
        dataset_path: Directory that contains the file.
        dataset_file: Name of the CSV file inside that directory.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(dataset_path, dataset_file))

# Read the train and the test CSV files (in that order) with the same loader
titanic, titanic_test = (
    load_dataset(dataset_path, csv_name)
    for csv_name in (dataset_file, dataset_test_file)
)

# Keep both frames in one list so every cleaning step can be applied to each of them
combine = [titanic, titanic_test]

In [None]:
# Let's review the Titanic dataset features - data types, null or empty values etc.
# info() prints a summary of the columns of each frame: first the train data, then the test data.
titanic.info()
print('_'*40) # Print a 40-character line to separate the two tables
titanic_test.info()

In [None]:
# Basic statistical description of the training Titanic dataset.
# Left as the last expression of the cell so the notebook displays the resulting table.
titanic.describe()

In [None]:
# Summary restricted to the object-typed (text) columns of the training data
titanic.describe(include=['O']) # 'O' selects the object dtype, i.e. the categorical/text features

In [None]:
# Remove the "Ticket", "Cabin" and "PassengerId" columns from both frames held in `combine`
# (author's rationale: too many missing values in the first two, and "PassengerId" is not used).
# drop() returns new frames, so the `titanic` / `titanic_test` variables still keep these columns.
combine = [
    frame.drop(["Ticket", "Cabin", "PassengerId"], axis=1)
    for frame in combine
]

# Analysis, visualization and data cleaning

## What percentage of people survived based on sex

In [None]:
# Survival rate per sex, computed by hand: number of survivors divided by number of passengers
women = titanic.loc[titanic["Sex"] == 'female', "Survived"]
rate_women = women.sum() / women.size
print("% of women who survived:", round(rate_women * 100, 2))

men = titanic.loc[titanic["Sex"] == 'male', "Survived"]
rate_men = men.sum() / men.size
print("% of men who survived:", round(rate_men * 100, 2))

# The same figures in one step (approach by Manav Sehgal): mean of "Survived" per sex,
# highest rate first. Kept as the last expression so the notebook displays the table.
titanic[["Sex", "Survived"]].groupby('Sex', as_index=False).mean().sort_values('Survived', ascending=False)

## Clean and analyze age data

In [None]:
# Round every age in the `titanic` frame to the nearest whole number.
# Note: np.rint rounds exact halves to the nearest even value, e.g. 1.5 -> 2 but 10.5 -> 10.

titanic["Age"] = np.rint(titanic["Age"])

In [None]:
# Survival rate per age bracket (both bounds inclusive):
# 0-5, 6-15, 16-21, 22-30, 31-40, 41-50, 51-60, 61-70, 71-80

age_brackets = [[0,5], [6,15], [16,21], [22,30], [31,40], [41,50], [51,60], [61,70], [71,80]]

for bracket in age_brackets:
    youngest, oldest = bracket

    # "Survived" values of the passengers whose age lies inside the bracket
    in_bracket = (titanic["Age"] >= youngest) & (titanic["Age"] <= oldest)
    outcomes = titanic.loc[in_bracket, "Survived"]
    group_size = outcomes.count()  # number of non-null "Survived" entries in this bracket
    print("There are:", group_size, "people in the", bracket, "age group.")

    # Share of those passengers whose "Survived" value equals 1
    survivors = (outcomes == 1).sum()
    survival_rate = survivors / group_size
    print("Survival for this age group is {}%".format(round(survival_rate * 100, 2)))
    # TODO: also report the survival rate of females and of males within each bracket

## Survival by age, split by sex

In [None]:
# Age histograms of survivors (green) and non-survivors (red), one panel per sex
survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
women = titanic[titanic['Sex'] == 'female']
men = titanic[titanic['Sex'] == 'male']

for ax, group, panel_title in ((axes[0], women, 'Female'), (axes[1], men, 'Male')):
    # Survivors are drawn first, then non-survivors on top, on the same axes
    for outcome, outcome_label, bar_color in ((1, survived, "green"), (0, not_survived, "red")):
        ages = group.loc[group['Survived'] == outcome, 'Age'].dropna()
        sns.histplot(ages, bins=40, label=outcome_label, ax=ax, kde=False, color=bar_color)
    ax.legend()
    ax.set_title(panel_title)

## Correlation matrix

In [None]:
# Create a correlation matrix from the numeric columns only.
# Since pandas 2.0, corr() no longer skips text columns by default and raises on them,
# and `titanic` still contains text columns here, so the numeric ones are selected explicitly.
corr_matrix = titanic.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

## Dataset state so far

In [None]:
# Column summary of the first frame in `combine` (derived from the train data)
combine[0].info()

In [None]:
# Column summary of the second frame in `combine` (derived from the test data)
combine[1].info()

## Fix missing Age, Embarked and Fare data, and encode Embarked, Sex and Age as integers

In [None]:
# Let's start with the Embarked data column

# Stack both datasets into one frame so that statistics over all passengers can be computed.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
complete_dataset = pd.concat([combine[0], combine[1]])
complete_dataset.info()

# Most frequent port of embarkation, taken from the `titanic` (train) frame with NaN values dropped
freq_port = titanic.Embarked.dropna().mode()[0]
print(40*"_")
print ("Most frequent port is: {}".format(freq_port))

# Only the train frame is filled here.
# NOTE(review): assumes the test frame has no missing "Embarked" values - confirm.
combine[0]["Embarked"] = combine[0]["Embarked"].fillna(freq_port)

In [None]:
# Encode the port of embarkation as an integer: 0 = Southampton, 1 = Cherbourg, 2 = Queenstown
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in combine:
    frame["Embarked"] = frame["Embarked"].map(port_codes).astype(int)

In [None]:
# Inspect both frames after the "Embarked" clean-up
combine[0].info() # Check if data type changed to integer and all missing values have been filled
combine[1].info() # Same check for the second (test) frame

In [None]:
# Encode "Sex" as an integer in both frames: "female" = 0, any other value (i.e. male) = 1

for frame in combine:
    # Comparing the whole column at once replaces the per-row lambda
    frame["Sex"] = (frame["Sex"] != "female").astype("int64")

In [None]:
# Fill missing age data

# Mean and standard deviation of "Age" over the combined dataset
age_mean = complete_dataset["Age"].mean()
age_std = complete_dataset["Age"].std()

print("The mean of the population is: {}".format(age_mean))
print("The standard deviation of the titanic population age is: {}".format(age_std))

for frame in combine:
    # Work on a copy of the column and find the rows that have no age recorded
    ages = frame["Age"].copy()
    age_is_missing = ages.isna()
    missing_age_values = age_is_missing.sum()
    print("Number of missing age values is: {}".format(missing_age_values))

    # One random whole-number age per missing row, within one standard deviation of the mean.
    # randint draws integers uniformly ("discrete uniform") and excludes the upper bound.
    rand_age = np.random.randint(age_mean - age_std, age_mean + age_std, missing_age_values)
    print("The ages generated are: {}".format(rand_age))

    # Put the generated ages into the gaps, then store the column back as integers
    ages[age_is_missing] = rand_age
    frame["Age"] = ages.astype(int)

In [None]:
# Replace missing "Fare" values in the second (test) frame with the mean fare of the combined dataset
fare_mean = complete_dataset["Fare"].mean()
print("The fare mean of the whole dataset is {}:".format(fare_mean))

test_frame = combine[1]
test_frame["Fare"] = test_frame["Fare"].fillna(fare_mean)

## Drop "Name" data and distribute datasets back to original variables

In [None]:
# Remove the "Name" column from both frames; the new frames replace the old ones in `combine`
combine = [frame.drop(columns=["Name"]) for frame in combine]

In [None]:
# Hand the cleaned frames back to the original variable names (train first, test second)
titanic, titanic_test = combine

## Create machine learning models and predict survival

In [None]:
# Target column and feature columns of the train data; the test frame is copied as-is
Y_train = titanic["Survived"]
X_train = titanic.drop(columns="Survived")
X_test = titanic_test.copy()

# Last expression of the cell: display the three shapes
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Linear Regression
# The model is fitted on the "Survived" column as a numeric target, so its predictions
# are continuous numbers rather than class labels; they are rounded in a later cell.

lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)

In [None]:
# Predict on the copy of the test frame that was prepared together with the training matrices
results = lin_reg.predict(X_test)

In [None]:
# Show the raw (unrounded) model outputs for the test passengers
print(results)

In [None]:
# Example of the linear regression model prediction:
# the features of one test passenger next to the model's raw output for the same row

example_row = 0
print("Will the following passenger of the Titanic survive: \n{}".format(titanic_test.loc[example_row]))
print("The model predicts the following survival chance: \n{}".format(results[example_row]))

In [None]:
# Another example from the linear regression model prediction, this time for row 22

example_row = 22
print("Will the following passenger of the Titanic survive: \n{}".format(titanic_test.loc[example_row]))
print("The model predicts the following survival chance: \n{}".format(results[example_row]))

In [None]:
# This is experimental - it is too ambiguous for values around 0.4 to 0.6 - should be revisited
# Turn the raw model outputs into labels: 0 = dead, 1 = survived.
# Plain rounding is not safe here: the model output is not bounded to [0, 1], so a value
# below -0.5 or above 1.5 would round to -1 or 2, which are not valid labels.
# Thresholding at 0.5 gives the same labels as rounding for every value between -0.5 and 1.5
# (an exact 0.5 maps to 0 either way) and keeps out-of-range values inside {0, 1}.

rounded_results = (results > 0.5).astype(int)

print(rounded_results)

In [None]:
# Compare our rounded integers to the original floats from model prediction

def compare(float_number, integer_number):
    """Print each raw prediction next to its rounded counterpart.

    Args:
        float_number: Sequence of raw (float) values.
        integer_number: Sequence of the corresponding rounded values, in the same order.

    The two sequences are walked in lockstep; if their lengths differ, the
    surplus items of the longer one are not printed.
    """
    # Iterate over the arguments themselves rather than the notebook-level
    # `results` array, so the function works for any pair of sequences.
    for raw_value, rounded_value in zip(float_number, integer_number):
        print("The float result number {} compares to: {}".format(raw_value, rounded_value))
    
# Print every raw prediction alongside its rounded value
compare(results, rounded_results)

In [None]:
# Count the number of survived and dead passengers

def count_dead_survived(rounded_result_array):
    """Print how many times each distinct value occurs in the given array.

    Args:
        rounded_result_array: Array-like of values; one line is printed per
            distinct value, in the sorted order returned by ``np.unique``.
    """
    values, occurrences = np.unique(rounded_result_array, return_counts=True)
    for value, occurrence in zip(values, occurrences):
        print("The value of {} repeats this many times: {}".format(value, occurrence))
    
# Report how often each predicted label occurs among the rounded predictions
count_dead_survived(rounded_results)

In [None]:
# Create submission file
# The 'gender_submission.csv' file from the dataset directory is loaded and used as a
# template: its "Survived" column is overwritten with the rounded predictions.
# NOTE(review): assumes the template rows are in the same order as the test data - confirm.

submission_file = 'gender_submission.csv'
submission = load_dataset(dataset_path, submission_file)

submission['Survived'] = rounded_results
submission.to_csv('submission_titanic.csv', index=False) # Save the submission to a csv file (without the index column) in the folder where the program is running
