In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Titanic training set, fetched directly from the raw GitHub URL (needs network access).
url = 'https://raw.githubusercontent.com/rahgirrafi/kaggle/master/titanic/train.csv'
df = pd.read_csv(filepath_or_buffer=url)
# Optional quick looks at the raw frame, left disabled:
#df.describe()
#df.head()

In [None]:
# Structural overview of the raw frame.
# A notebook only echoes the *last* expression of a cell, so a bare `df.shape`
# placed before `df.info()` is silently discarded; print it explicitly instead.
print(df.shape)
df.info()

In [None]:
#df['Sex'] = df['Sex'].replace('male',0)
#df['Sex'] = df['Sex'].replace('female',1)
#df['Embarked'] = df['Embarked'] .replace('C',0)
#df['Embarked']  = df['Embarked'].replace('Q',1)
#df['Embarked']  = df['Embarked'].replace('S',2)
#df['Embarked']  = df['Embarked'].replace(np.nan,3)
#df['Embarked'].unique()

In [3]:
# Hold out 20% of the rows; the split is stratified on the combination of
# Sex, Pclass and Survived and seeded so it is reproducible.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=0)

# With n_splits=1 the generator yields exactly one (train, test) pair.
train_idx, test_idx = next(splitter.split(df, df[['Sex', 'Pclass', 'Survived']]))

# split() returns *positional* indices, so select with .iloc (label-based .loc
# only works while df keeps its default RangeIndex). Copies keep later
# column assignments from touching or warning about the original frame.
train = df.iloc[train_idx].copy()
test = df.iloc[test_idx].copy()


In [None]:
# Total survivors per sex (Survived is summed within each Sex group).
survived_by_gender = df.groupby('Sex')['Survived'].sum()
print(survived_by_gender)
gender_labels = survived_by_gender.index
survivor_counts = survived_by_gender.values

# Same bar chart, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(gender_labels, survivor_counts, color=['blue', 'pink'])
ax.set_xlabel('Sex')
ax.set_ylabel('Count of Survivors')
ax.set_title('Survivors on the Titanic by Gender')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame also holds string columns (e.g. 'Sex', 'Embarked' are compared to
# string values elsewhere in this notebook); calling corr() on those raises in
# pandas >= 2.0, so they are filtered out first.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# One survival-split count plot per low-cardinality feature, each on its own figure.
for feature in ['Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked']:
  sns.countplot(x=df[feature], hue=df['Survived'])
  plt.show()

# Age gets a much wider canvas than the plots above.
fig, ax = plt.subplots(figsize=(35, 5))
sns.countplot(data=df, x=df['Age'], hue=df['Survived'], ax=ax)
plt.show()

# Fare against PassengerId, coloured by survival.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=df, x=df['PassengerId'], y=df['Fare'], hue=df['Survived'], ax=ax)
plt.show()


In [4]:
class featureEncoder(BaseEstimator, TransformerMixin):
  """One-hot encode ``Embarked`` and ``Sex`` into a fixed set of indicator columns.

  Adds the float columns ``C``, ``Q``, ``S`` (embarkation port), ``N``
  (``Embarked`` missing), ``female`` and ``male``. The source columns are
  left in place.

  The indicator columns are derived from fixed category lists rather than
  from an encoder re-fitted on whatever frame is passed in. As a result:

  * the output columns are the same for every input, even one that has no
    missing ``Embarked`` or lacks one of the ports (the previous version
    raised ``IndexError`` in that case);
  * nothing is learned from the data being transformed;
  * the caller's frame is not modified.
  """

  # Known Embarked values; rows with a missing value are flagged in EMBARKED_MISSING.
  EMBARKED_PORTS = ('C', 'Q', 'S')
  EMBARKED_MISSING = 'N'
  SEX_LEVELS = ('female', 'male')

  def fit(self, X, y=None):
    """No state is learned; present to satisfy the scikit-learn transformer API."""
    return self

  def transform(self, X):
    """Return a copy of ``X`` with the indicator columns appended.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``Embarked`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` plus ``C``, ``Q``, ``S``, ``N``, ``female``, ``male``
        (each 0.0 / 1.0).

    Raises
    ------
    KeyError
        If ``Embarked`` or ``Sex`` is missing from ``X``.
    """
    encoded = X.copy()  # never write into the caller's frame

    embarked = encoded['Embarked']
    for port in self.EMBARKED_PORTS:
      encoded[port] = (embarked == port).astype(float)
    encoded[self.EMBARKED_MISSING] = embarked.isna().astype(float)

    sex = encoded['Sex']
    for level in self.SEX_LEVELS:
      encoded[level] = (sex == level).astype(float)

    return encoded

In [6]:
#dropping unnecessary columns
class featureDropper(BaseEstimator, TransformerMixin):
  """Transformer that removes a fixed list of columns from the frame."""

  def fit(self, X, y=None):
    """Nothing to learn; returns the transformer itself."""
    return self

  def transform(self, X):
    """Return ``X`` without the columns listed below (``X`` itself is not modified)."""
    unwanted = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked']
    return X.drop(unwanted, axis='columns')


In [7]:
#handling missing values
class featureImputer(BaseEstimator, TransformerMixin):
  """Fill missing ``Age`` values with the mean age seen during ``fit``.

  The mean is learned once in ``fit`` and reused by every later ``transform``
  call, so a held-out frame is imputed with the training mean instead of its
  own. The caller's frame is not modified.
  """

  def fit(self, X, y=None):
    """Store the mean of ``X['Age']`` (missing values ignored) as ``age_mean_``."""
    self.age_mean_ = X['Age'].mean()
    return self

  def transform(self, X):
    """Return a copy of ``X`` whose ``Age`` column is float with gaps filled.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an ``Age`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with missing ``Age`` replaced by the fitted mean.
    """
    fill_value = getattr(self, 'age_mean_', None)
    if fill_value is None:
      # Backward compatibility: transform() used to work without a prior
      # fit(); in that case fall back to the mean of the frame at hand.
      fill_value = X['Age'].mean()

    imputed = X.copy()  # never write into the caller's frame
    imputed['Age'] = imputed['Age'].astype(float).fillna(fill_value)
    return imputed


In [8]:
# Preprocessing chain, applied in order: impute Age, add the one-hot
# indicator columns, then drop the columns listed in featureDropper.
preprocessing_steps = [
    ('imputer', featureImputer()),
    ('encoder', featureEncoder()),
    ('dropper', featureDropper()),
]
pipeline = Pipeline(steps=preprocessing_steps)
preprocessed_train = pipeline.fit_transform(train)




In [9]:
# Scaling / normalization: separate the target, then standardise the features.
y_train = preprocessed_train['Survived'].to_numpy()

# Everything except the target column is a model input.
scaler = StandardScaler()
X_train = scaler.fit_transform(preprocessed_train.drop(columns=['Survived']))


In [11]:
#model selection

# Seeded (same seed as the train/test splitter) so that repeated runs of the
# search build the same forests and report the same scores.
tree = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [100, 500, 1000],
    # 'log_loss' is left out: scikit-learn documents it as the same Shannon
    # information-gain criterion as 'entropy', so it only repeated a third of
    # the (expensive) fits without adding a distinct candidate.
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 3, 4],
}

# 5-fold cross-validated exhaustive search, scored on accuracy, using all cores;
# training scores are kept alongside the validation scores.
gridSearch = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1, return_train_score=True)

gridSearch.fit(X_train, y_train)


In [None]:
# Estimator selected by the grid search above (its best-scoring candidate).
chosen_tree = gridSearch.best_estimator_


In [None]:
# Cross-validation results recorded by the grid search for every candidate.
# NOTE(review): `resuts` looks like a typo for `results` — rename once any later uses are confirmed.
resuts = gridSearch.cv_results_