Hello Kaggle! This is my first competition.

# Imports

In [None]:
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import KNNImputer
import os
import re
# Print the full path of every file available under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

def findall_len(string, pattern):
    """Count the non-overlapping matches of a compiled pattern in a text value.

    Parameters
    ----------
    string : str or bytes
        Text to search. Any other value (e.g. NaN or None coming from a
        pandas column with missing entries) is treated as having no matches.
    pattern : compiled regular expression
        Object exposing ``findall``, e.g. the result of ``re.compile``.

    Returns
    -------
    int
        Number of matches found; 0 for missing / non-text input.
    """
    # Missing values reach this function as float NaN / None when it is used
    # through Series.apply; findall would raise TypeError on them.
    if not isinstance(string, (str, bytes)):
        return 0
    return len(pattern.findall(string))

# Read the train and test tables (in that order) from the competition input
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv",
                     "/kaggle/input/titanic/test.csv")
)
data_train

# Data understanding


## Histograms

In [None]:
# Columns compared between the two 'Survived' outcomes, one subplot row each
hist_vars = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
fig, ax = plt.subplots(len(hist_vars), 2, figsize=(20, 30))
classes = set(data_train['Survived'])
for class_ in classes:
    # Rows belonging to the current outcome
    class_df = data_train[data_train['Survived'] == class_]
    # Each row of axes is (linear-scale panel, log-scale panel) for one column
    for (linear_ax, log_ax), column in zip(ax, hist_vars):
        values = class_df[column]
        linear_ax.hist(values, bins=25, alpha=0.5, label=class_)
        log_ax.hist(values, bins=25, alpha=0.5, label=class_, log=True)
        linear_ax.set_xlabel(column)
        linear_ax.legend()
        log_ax.set_xlabel(column + '_log')
        log_ax.legend()
#Embarked
# New 2x2 figure: the top row is used here; the bottom row is filled by the Cabin section
fig, ax = plt.subplots(2, 2, figsize=(20, 15))
classes = set(data_train['Survived'])
for class_ in classes:
    outcome_rows = data_train[data_train['Survived'] == class_]
    # Passenger count per embarkation port for this outcome
    port_counts = outcome_rows.groupby('Embarked')['PassengerId'].count()
    # Left panel: linear scale; right panel: log scale
    for panel, use_log in zip(ax[0], (False, True)):
        panel.bar(x=port_counts.index, height=port_counts,
                  alpha=0.5, label=class_, log=use_log)
        panel.legend()
        panel.set_xlabel('Embarked_log' if use_log else 'Embarked')
#Cabin
# Strip the cabin numbers so only the letter part(s) remain; missing cabins become ''
cut = data_train['Cabin'].fillna('').str.replace(r'[0-9]+', '', regex = True)
# Count every label explicitly and draw bars on one fixed label order.
# A histogram of string labels bins their category positions (10 bins by
# default, range chosen per call), which merges neighbouring labels and
# misaligns the bars of the two outcomes.
cabin_labels = sorted(cut.unique())
for class_ in classes:
    cabin_counts = (cut[data_train['Survived'] == class_]
                    .value_counts()
                    .reindex(cabin_labels, fill_value = 0))
    ax[1][0].bar(x = cabin_labels, height = cabin_counts,
                 alpha = 0.5, label = class_)
    ax[1][1].bar(x = cabin_labels, height = cabin_counts,
                 alpha = 0.5, label = class_, log = True)
# Axis labels and legends only need to be set once, after all bars exist
ax[1][0].set_xlabel('Cabin')
ax[1][0].legend()
ax[1][1].set_xlabel('Cabin_log')
ax[1][1].legend()
#Mr, Miss, Mrs and names containing quotes/brackets "()"
# One compiled pattern per name feature; each gets its own subplot row below.
filters = {'Mr':re.compile(r"Mr\."),
           'Miss':re.compile(r"Miss\."),
           'Mrs':re.compile(r"Mrs\."),
           # opening quote/bracket, any text, closing quote/bracket.
           # ']' has to be escaped inside the closing class: unescaped it ends
           # the class early and the pattern degenerates into
           # "<quoted span> OR a lone ']'".
           'quote':re.compile(r'["(<{[|].*[")>}\]|]')}
fig, ax = plt.subplots(len(filters), 2, figsize=(20, 30))
classes = data_train['Survived'].unique()
for class_ in classes:
    # Names of the passengers with the current outcome
    text_class = data_train.query('Survived == @class_')['Name']
    for i, (feature_name, pattern) in enumerate(filters.items()):
        # Number of pattern matches in each name
        hist_ = text_class.apply(findall_len, args = (pattern,))
        ax[i][1].hist(hist_, label = class_, alpha = 0.5, log = True)
        ax[i][1].set_xlabel(str(feature_name) + '_log')
        ax[i][1].legend()
        ax[i][0].hist(hist_, label = class_, alpha = 0.5, log = False)
        ax[i][0].set_xlabel(str(feature_name))
        ax[i][0].legend()

## NaN maps

In [None]:
# Missing-value maps: one heatmap per dataset, drawn from the boolean isna() mask
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
for panel, (panel_title, frame) in zip(ax, (('Train: missing values', data_train),
                                            ('Test: missing values', data_test))):
    sns.heatmap(frame.isna(), ax=panel)
    # Title each panel so the two datasets can be told apart when skimming
    panel.set_title(panel_title)

## Conclusions

1. **Pclass.** Most of the third-class passengers died; only a third survived;
2. **Sex.** Men died more often than women;
3. **Age.** People aged 15 to 50 died more often than others. Children were more likely to be saved;
4. **SibSp.** People with more than 3 relatives were more likely to die;
5. **Parch.** People who had more than 3 parents or children mostly died;
6. **Fare.** People whose tickets cost more than 100 survived more often than the others;
7. **Embarked.** More than half of the people who embarked at S died;
8. **Cabin.** It seems that people from some cabins survived more often;
9. **Mr, Miss, Mrs.** Definitely important;
10. **Brackets.** A dubious feature, but it seems that many more people with brackets in their name survived.

Removed:
1. **Cabin** has a lot of missing values and will be removed;
2. **Ticket** is unique and does not carry any useful information;
3. It was decided not to use the presence of **brackets** in the name as a feature.

The remaining features will be included in the model.

# Data definition and filling NaNs

## One hot encoding

In [None]:
#variables used in modeling
vars_ = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Categorical columns expanded into one indicator column per category
dummy_cols = ['Sex', 'Embarked']
X_train = pd.get_dummies(data_train[vars_], columns = dummy_cols)
y_train = data_train['Survived']
# The test set is encoded on its own, so its indicator columns depend on the
# categories it happens to contain. Force the train layout: categories absent
# from test become all-zero columns, categories unseen in train are dropped.
X_test = (pd.get_dummies(data_test[vars_], columns = dummy_cols)
          .reindex(columns = X_train.columns, fill_value = 0))
X_train

In [None]:
#add miss, mr, mrs
# Binary bag-of-words over the names, limited to the 3 most frequent tokens
# of the training set (expected to be the titles - check the printed vocabulary).
vectorizer = CountVectorizer(binary = True, max_features = 3)
text_train = vectorizer.fit_transform(data_train['Name'])
text_test = vectorizer.transform(data_test['Name'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(list(vectorizer.get_feature_names_out()))
else:
    print(vectorizer.get_feature_names())
text_train.toarray()

## Filling NaNs

In [None]:
# Append the name-token indicator columns to the tabular features.
# ignore_index=True renumbers the columns 0..n-1 in both frames.
title_train = pd.DataFrame(text_train.toarray())
title_test = pd.DataFrame(text_test.toarray())
X_train = pd.concat([X_train, title_train], axis=1, ignore_index=True)
X_test = pd.concat([X_test, title_test], axis=1, ignore_index=True)
# Fill the remaining NaNs with a KNN imputer (5 neighbours):
# fitted on the training matrix only, then applied to the test matrix
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Random forest with max_depth tuned by 10-fold cross-validated grid search
model = RandomForestClassifier(n_estimators=10, random_state=0)
depth_grid = {'max_depth': range(2, 6)}
grid = GridSearchCV(estimator=model, param_grid=depth_grid, cv=10)
grid.fit(X_train, y_train)

# Predict the test set and write the submission file
predict = grid.predict(X_test)
output = pd.DataFrame(
    {'PassengerId': data_test.PassengerId, 'Survived': predict}
).astype({'Survived': 'int32'})
output.to_csv('pashasherst_submission.csv', index=False)

In [None]:
# Mean cross-validated score of the best parameter setting found by the grid search
grid.best_score_

Thank you for watching