In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

In [None]:
# Location of the Titanic train.csv used throughout this notebook.
url = 'https://media.githubusercontent.com/media/rahgirrafi/kaggle/master/titanic/train.csv'
# Read the CSV at `url` into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=url)


In [None]:
# Display the DataFrame loaded from the training CSV.
df

In [None]:
# First 50 rows of the Pclass / Cabin / Survived columns.
df[['Pclass', 'Cabin', 'Survived']].head(50)

In [None]:
# Last 50 rows of the Pclass / Cabin / Survived columns.
df[['Pclass', 'Cabin', 'Survived']].tail(50)

In [None]:
# Sex and Embarked stay as raw strings here; the featureEncoder transformer
# defined later in the notebook one-hot encodes them.

# Mark missing values with the sentinel category 'N'.
# NOTE(review): once Embarked has no NaN left, the 'most_frequent' Embarked
# imputation in featureImputer has nothing to fill — confirm the sentinel is
# what is wanted here.
df['Embarked'] = df['Embarked'].fillna('N')
# Fixed: Cabin was previously overwritten with the Embarked column, which
# destroyed the cabin data instead of filling its gaps.
df['Cabin'] = df['Cabin'].fillna('N')

# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(df.shape)
df.info()

In [None]:
# Fare divided by (Parch + SibSp + 1).
df['farePerPerson'] = df['Fare'].div(df['Parch'].add(df['SibSp']).add(1))
# Fare divided by the ticket class number.
df['cllasFareRatio'] = df['Fare'].div(df['Pclass'])

# Confirm the two new columns are present.
df.info()


In [None]:
# One shuffled 80/20 split, stratified on the Sex / Pclass / Survived combination.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=0)

# n_splits=1, so take the single (train, test) index pair directly.
train_idx, test_idx = next(splitter.split(df, df[['Sex', 'Pclass', 'Survived']]))

# split() yields *positional* indices, so select with .iloc; the previous .loc
# lookup was only right while df kept its default 0..n-1 index.
# .copy() makes each split an independent frame, so later column assignments
# neither warn nor touch df.
train = df.iloc[train_idx].copy()
test = df.iloc[test_idx].copy()


In [None]:
# Number of survivors per sex (Survived is summed within each Sex group).
survived_by_gender = df.groupby('Sex')['Survived'].sum()
print(survived_by_gender)
gender_labels = survived_by_gender.index
survivor_counts = survived_by_gender.values

# Same bar chart, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(gender_labels, survivor_counts, color=['blue', 'pink'])
ax.set_xlabel('Sex')
ax.set_ylabel('Count of Survivors')
ax.set_title('Survivors on the Titanic by Gender')
plt.show()

In [None]:
# Correlate the numeric columns only: df also holds string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 (older versions dropped
# them silently, so the plotted matrix is unchanged there).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# One count plot per column of df, with bars split by the Survived value.
cols = df.columns[:]
survival_flag = df['Survived']
for column_name in cols:
  sns.countplot(x=df[column_name], hue=survival_flag)
  plt.show()



In [None]:
class featureEncoder(BaseEstimator, TransformerMixin):
  """One-hot encode ``Embarked`` and ``Sex`` into fixed indicator columns.

  Adds the float columns ``C``, ``Q``, ``S`` (from ``Embarked``) and
  ``female``, ``male`` (from ``Sex``) and returns the augmented frame.
  The source columns are left in place.

  Changes from the previous version:
    * The category list is pinned per column instead of being inferred from
      the data. Previously any extra Embarked value (e.g. the 'N' sentinel)
      shifted the encoder output, so the hard-coded names ``C``/``Q``/``S``
      were attached to the wrong indicator columns. Values outside the pinned
      list now simply get all-zero indicators.
    * ``OneHotEncoder(sparse=False)`` is no longer used; that keyword was
      removed from scikit-learn. The default sparse result is densified with
      ``.toarray()``, which works on old and new releases alike.
    * The input frame is copied rather than mutated.
  """

  # Source column -> indicator columns to emit, in output order.
  _CATEGORIES = {
      'Embarked': ['C', 'Q', 'S'],
      'Sex': ['female', 'male'],
  }

  def fit(self, X, y=None):
    """Nothing is learned; the categories are fixed. Returns ``self``."""
    return self

  def transform(self, X):
    """Return a copy of ``X`` with the five indicator columns appended.

    X must be a DataFrame containing ``Embarked`` and ``Sex`` columns.
    """
    X = X.copy()
    for source, categories in self._CATEGORIES.items():
      encoder = OneHotEncoder(categories=[categories], handle_unknown='ignore')
      # fit_transform expects a 2D input, hence the double brackets.
      mat = encoder.fit_transform(X[[source]]).toarray()
      for position, name in enumerate(categories):
        X[name] = mat[:, position]
    return X

In [None]:
#dropping unnecessary columns
class featureDropper(BaseEstimator, TransformerMixin):
  """Transformer that removes a fixed list of columns from a DataFrame."""

  def fit(self, X, y=None):
    """No fitting is needed; returns ``self``."""
    return self

  def transform(self, X):
    """Return ``X`` without the six columns listed below."""
    unwanted = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked']
    return X.drop(columns=unwanted)


In [None]:
#handling missing values
class featureImputer(BaseEstimator, TransformerMixin):
  """Fill missing ``Age`` / ``Embarked`` values and bin ``Age`` by decade.

  ``fit`` learns the mean of ``Age`` and the most frequent ``Embarked`` value;
  ``transform`` applies them and then replaces ``Age`` with ``Age // 10``.

  Changes from the previous version:
    * The input frame is copied. It used to be modified in place, so running
      the same frame through the transformer twice binned ``Age`` twice.
    * The fill values are learned in ``fit`` rather than re-derived inside
      ``transform``, so a fitted instance can apply the same values to another
      frame. ``fit_transform`` on a single frame gives the same result as before.
  """

  def fit(self, X, y=None):
    """Learn the fill values from ``X``; returns ``self``."""
    self.ageImputer_ = SimpleImputer(strategy='mean').fit(X[['Age']])
    self.embarkedImputer_ = SimpleImputer(strategy='most_frequent').fit(X[['Embarked']])
    return self

  def transform(self, X):
    """Return a copy of ``X`` with ``Age`` and ``Embarked`` imputed and ``Age`` binned."""
    if not hasattr(self, 'ageImputer_'):
      # Backward compatibility: transform() used to work on an unfitted
      # instance by deriving the fill values from X itself.
      self.fit(X)
    X = X.copy()
    # The imputers return an (n, 1) array; take the single column.
    X['Age'] = self.ageImputer_.transform(X[['Age']])[:, 0]
    X['Age'] = X['Age'].floordiv(10)
    X['Embarked'] = self.embarkedImputer_.transform(X[['Embarked']])[:, 0]
    return X


In [None]:
# Preprocessing chain: impute, then one-hot encode, then drop unused columns.
pipeline = Pipeline(steps=[
    ('imputer', featureImputer()),
    ('encoder', featureEncoder()),
    ('dropper', featureDropper()),
])

# Each frame goes through its own fit_transform call, in this order; after
# this cell the pipeline's most recent fit is therefore the one on df.
# NOTE(review): the held-out frame is re-fitted rather than transformed with
# the train-fitted pipeline — confirm that is intended.
preprocessed_train, preprocessed_test, preprocessed_Df = (
    pipeline.fit_transform(frame) for frame in (train, test, df)
)


In [None]:
# Preview the first 50 rows of the preprocessed full dataset.
preprocessed_Df.head(50)

In [None]:
#scaling/ normalization of Training set
# Target as a NumPy array, features as everything except the target column.
y_train = preprocessed_train['Survived'].to_numpy()
X_train = preprocessed_train.drop(columns=['Survived'])

# Fit the scaler on the training features and standardise them.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
#scaling/ normalization of Testset
X_test = preprocessed_test.drop(columns=['Survived'])
y_test = preprocessed_test['Survived']
# Fixed: reuse the mean/std learned from the training features. Re-fitting the
# scaler here put the held-out rows on a different scale from the one the
# model was trained on.
X_test = scaler.transform(X_test)
y_test = y_test.to_numpy()


In [None]:
#scaling/ normalization of Full Dataset
y_final = preprocessed_Df['Survived'].to_numpy()
# Re-fits the shared `scaler` object on the full feature set.
X_final = scaler.fit_transform(preprocessed_Df.drop(columns=['Survived']))
X_final

In [None]:
#model selection

# Seeded so that the search, and the forest it selects, are reproducible
# from one run of the notebook to the next.
tree = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [100, 500, 1000],
    # 'log_loss' is left out: scikit-learn documents it as the same Shannon
    # information gain criterion as 'entropy', so it only repeated a third of
    # the grid (and is rejected by releases that predate the alias).
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 3, 4],
}

# 5-fold cross-validated accuracy over every grid combination, on all cores.
gridSearch = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1, return_train_score=True)

gridSearch.fit(X_train, y_train)

In [None]:
# Use the model the grid search actually selected. The previous hard-coded
# forest (n_estimators=800, min_samples_split=6) used values that are not in
# the searched grid, so the search result was silently discarded.
chosen_tree = gridSearch.best_estimator_
print(gridSearch.best_params_)

# GridSearchCV is constructed with its default refit setting, so
# best_estimator_ has already been re-fitted on the whole of X_train and
# needs no further fit call.
tree = chosen_tree
tree

In [None]:
# Score the fitted forest on the held-out split.
tree.score(X_test,y_test)

In [None]:
#final model selection
# Seeded for reproducibility.
final_tree = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [100, 500, 1000],
    # 'log_loss' is left out: scikit-learn documents it as the same criterion
    # as 'entropy', so it only duplicated grid points.
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 3, 4],
}

gridSearch = GridSearchCV(final_tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1, return_train_score=True)

# Fixed: search on the full labelled set. This cell previously fitted on
# X_train / y_train, which merely repeated the earlier model-selection search.
gridSearch.fit(X_final, y_final)
final_chosen_tree = gridSearch.best_estimator_

In [None]:
# Build the final forest from the hyper-parameters found by the most recent
# grid search instead of hard-coded values that are not in the searched grid
# (n_estimators=800, min_samples_split=6). Seeded for reproducibility.
final_tree = RandomForestClassifier(**gridSearch.best_params_, random_state=0)
# Train on the full labelled dataset.
final_tree.fit(X_final, y_final)

In [None]:
# Load the Titanic test.csv and add the same two engineered fare columns
# that were added to the training frame.
test_url = 'https://media.githubusercontent.com/media/rahgirrafi/kaggle/master/titanic/test.csv'
X_test_Data_ = pd.read_csv(test_url)
X_test_Data_['farePerPerson'] = X_test_Data_['Fare'].div(X_test_Data_['Parch'].add(X_test_Data_['SibSp']).add(1))
X_test_Data_['cllasFareRatio'] = X_test_Data_['Fare'].div(X_test_Data_['Pclass'])

# NOTE(review): this re-fits the preprocessing pipeline on the test file.
X_test_Data = pipeline.fit_transform(X_test_Data_)

# Fill any remaining gaps in the fare-derived columns with that column's own mean.
for fare_column in ('Fare', 'farePerPerson', 'cllasFareRatio'):
  X_test_Data[fare_column] = X_test_Data[fare_column].fillna(X_test_Data[fare_column].mean())



In [None]:
# Summary statistics of the preprocessed test features.
X_test_Data.describe()

In [None]:
# Shell escape: stage everything under the current working directory with git.
# NOTE(review): this is a side effect unrelated to the analysis — consider
# removing it before sharing the notebook.
!git add .

In [None]:
# Fixed: apply the scaling already fitted on the full training features
# (X_final) rather than re-fitting on the prediction set, so the inputs given
# to final_tree are on the same scale it was trained on.
X_test_final = scaler.transform(X_test_Data)


In [None]:
# Predict with the final forest on the scaled test features.
preidctions = final_tree.predict(X_test_final)

# Two-column submission frame: passenger id plus predicted label.
final_df = pd.DataFrame({
    'PassengerId': X_test_Data_['PassengerId'],
    'Survived': preidctions,
})
final_df.to_csv('predictions.csv', index=False)