In [None]:
from google.colab import files

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the Titanic training CSV straight from GitHub.
url = 'https://raw.githubusercontent.com/rahgirrafi/kaggle/master/titanic/train.csv'
df = pd.read_csv(url)

# Quick look at ticket class next to fare (up to the first 50 rows).
sub_df = df[['Pclass', 'Fare']]
sub_df.head(50)

Unnamed: 0,Pclass,Fare
0,3,7.25
1,1,71.2833
2,3,7.925
3,1,53.1
4,3,8.05
5,3,8.4583
6,1,51.8625
7,3,21.075
8,3,11.1333
9,2,30.0708


In [None]:
# Rows with no recorded port of embarkation get the sentinel value 'N'.
# From here on 'N' is an ordinary value in df['Embarked'], not a missing one.
df['Embarked'] = df['Embarked'].fillna('N')
df.shape

(891, 12)

In [None]:
# Hold out 20% of the rows, stratifying jointly on sex, class and outcome.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=0)

# n_splits=1, so take the single (train, test) index pair directly instead
# of looping over a one-element generator.
train_idx, test_idx = next(splitter.split(df, df[['Sex', 'Pclass', 'Survived']]))

# split() yields positional indices, so select with .iloc rather than the
# label-based .loc; .copy() gives each split a frame independent of df.
train = df.iloc[train_idx].copy()
test = df.iloc[test_idx].copy()


In [None]:
# Sum of the 'Survived' column per sex, printed and then drawn as a bar chart.
survived_by_gender = df.groupby('Sex')['Survived'].sum()
print(survived_by_gender)
gender_labels = survived_by_gender.index
survivor_counts = survived_by_gender.values

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.bar(gender_labels, survivor_counts, color=['blue', 'pink'])
ax.set_xlabel('Sex')
ax.set_ylabel('Count of Survivors')
ax.set_title('Survivors on the Titanic by Gender')
plt.show()

In [None]:
# Correlate the numeric columns only: under pandas >= 2.0, DataFrame.corr()
# raises on string columns such as Name, Sex and Ticket instead of silently
# skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# One count plot per feature, each split by the 'Survived' value.
for column in ['Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked']:
  sns.countplot(x=df[column], hue=df['Survived'])
  plt.show()

# Age is drawn on a much wider canvas than the default figure size.
fig, ax = plt.subplots(figsize=(35, 5))
sns.countplot(data=df, x=df['Age'], hue=df['Survived'])
plt.show()

# Fare against PassengerId, coloured by the 'Survived' value.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=df, x=df['PassengerId'], y=df['Fare'], hue=df['Survived'])
plt.show()


In [None]:
class featureEncoder(BaseEstimator, TransformerMixin):
  """One-hot encode 'Embarked' and 'Sex' into fixed indicator columns.

  Adds the float columns 'C', 'Q', 'S' (from 'Embarked') and 'female', 'male'
  (from 'Sex'). The source columns are left in place.

  The category lists are pinned rather than learned from the data, so every
  output column always means the same thing. A value outside the list (for
  example a placeholder used for a missing port) produces an all-zero row
  instead of shifting the remaining indicators under the wrong column names,
  and a category that is absent from one dataset still gets its column.
  """

  # source column -> categories, in the order the indicator columns are added
  CATEGORIES = {
      'Embarked': ['C', 'Q', 'S'],
      'Sex': ['female', 'male'],
  }

  def fit(self, X, y=None):
    """Nothing is learned (the categories are fixed); returns self."""
    return self

  def transform(self, X):
    """Return a copy of X with the indicator columns appended.

    X must be a DataFrame containing 'Embarked' and 'Sex'. The input frame
    itself is not modified.
    """
    X = X.copy()
    for source, categories in self.CATEGORIES.items():
      encoder = OneHotEncoder(categories=[categories], handle_unknown='ignore')
      # fit_transform expects 2D input, hence the double brackets. The default
      # sparse result is densified explicitly, which avoids the `sparse=` /
      # `sparse_output=` keyword that differs between scikit-learn versions.
      mat = encoder.fit_transform(X[[source]]).toarray()
      for i, name in enumerate(categories):
        X[name] = mat[:, i]
    return X

In [None]:
#dropping unnecessary columns
class featureDropper(BaseEstimator, TransformerMixin):
  """Stateless pipeline step that removes a fixed list of columns."""

  def fit(self, X, y=None):
    """No-op; returns self."""
    return self

  def transform(self, X):
    """Return X without the six columns listed below; X itself is untouched."""
    unwanted = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked']
    return X.drop(columns=unwanted)


In [None]:
#handling missing values
class featureImputer(BaseEstimator, TransformerMixin):
  """Fill missing 'Age' (mean) and 'Embarked' (most frequent value).

  The fill values are learned in fit() and reused by transform(), so data
  transformed after fitting is filled with the statistics of the data the
  step was fitted on rather than its own.
  """

  def fit(self, X, y=None):
    """Learn the fill values from X (a DataFrame with 'Age' and 'Embarked')."""
    self.age_imputer_ = SimpleImputer(strategy='mean').fit(X[['Age']])
    self.embarked_imputer_ = SimpleImputer(strategy='most_frequent').fit(X[['Embarked']])
    return self

  def transform(self, X):
    """Return a copy of X with 'Age' and 'Embarked' filled in.

    The input frame itself is not modified.
    """
    if not hasattr(self, 'age_imputer_'):
      # Backward compatibility: transform() used to work without a prior
      # fit(); in that case fall back to the statistics of X itself.
      self.fit(X)
    X = X.copy()
    # SimpleImputer takes 2D input and returns an (n, 1) array; ravel() turns
    # it back into a single column.
    X['Age'] = self.age_imputer_.transform(X[['Age']]).ravel()
    X['Embarked'] = self.embarked_imputer_.transform(X[['Embarked']]).ravel()
    return X


In [None]:
# Preprocessing chain: impute -> one-hot encode -> drop unused columns.
pipeline = Pipeline([
    ('imputer', featureImputer() ),
    ('encoder', featureEncoder() ),
    ('dropper', featureDropper() )
])

# Fit on the training split only and reuse that fit for the held-out split,
# so nothing is learned from the rows used for evaluation. Copies are passed
# so the steps cannot modify train / test / df themselves.
preprocessed_train = pipeline.fit_transform(train.copy())
preprocessed_test = pipeline.transform(test.copy())

# Separate pass over the full labelled data for the final model. This refits
# the same pipeline object, so it must come after the split is processed.
preprocessed_Df = pipeline.fit_transform(df.copy())



In [None]:
#scaling/ normalization of Training set
# Target as a plain numpy array.
y_train = preprocessed_train['Survived'].to_numpy()

# Standardise every remaining column; the scaler is fitted on these features.
scaler = StandardScaler()
X_train = scaler.fit_transform(preprocessed_train.drop(columns=['Survived']))


In [None]:
#scaling/ normalization of Testset
X_test = preprocessed_test.drop(columns=['Survived'])
y_test = preprocessed_test['Survived'].to_numpy()

# Reuse the scaler as already fitted (transform, not fit_transform).
# Refitting on the held-out features would standardise them with their own
# mean/std, i.e. on a different basis from the data the model is trained on.
X_test = scaler.transform(X_test)


In [None]:
#scaling/ normalization of Full Dataset
# Target as a plain numpy array.
y_final = preprocessed_Df['Survived'].to_numpy()

# Note: this refits `scaler` on the full dataset, replacing its previous fit.
X_final = scaler.fit_transform(preprocessed_Df.drop(columns=['Survived']))

In [None]:
#model selection

# Seeded so the forest, and therefore the search result, is reproducible
# from one run of the notebook to the next.
tree = RandomForestClassifier(random_state=0)

# 3 * 3 * 4 * 3 = 108 candidates; with cv=5 that is 540 fits plus the final
# refit, some with 1000 trees, so this cell is slow.
param_grid = {
    'n_estimators': [100,500,1000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None,5,10,20],
    'min_samples_split':[2,3,4],

}

gridSearch = GridSearchCV(tree, param_grid, cv = 5, scoring = 'accuracy', n_jobs= -1, return_train_score = True)

gridSearch.fit(X_train, y_train)


In [None]:
# Best estimator found by the grid search fitted on (X_train, y_train).
chosen_tree = gridSearch.best_estimator_
chosen_tree

In [None]:
# Raw cross-validation results of the search (the full cv_results_ object is displayed).
# NOTE(review): the name `resuts` is a misspelling of "results"; it is kept
# unchanged here in case other cells refer to it.
resuts = gridSearch.cv_results_
resuts

In [None]:
# Mean accuracy of the selected model on the held-out split.
chosen_tree.score(X_test,y_test)

0.8100558659217877

In [None]:
#final_training

# Grid search over the full labelled dataset for the production model.
# Seeded so the resulting model is reproducible between notebook runs.
prod_tree = RandomForestClassifier(random_state=0)

# 3 * 3 * 4 * 3 = 108 candidates, each cross-validated 5-fold.
param_grid = {
    'n_estimators': [100,500,1000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None,5,10,20],
    'min_samples_split':[2,3,4],

}

# NOTE: this rebinds `gridSearch`; any earlier search object is no longer
# reachable under that name after this cell runs.
gridSearch = GridSearchCV(prod_tree, param_grid, cv = 5, scoring = 'accuracy', n_jobs= -1, return_train_score = True)

gridSearch.fit(X_final, y_final)

In [None]:
# Best estimator from the search fitted on (X_final, y_final); used for the submission.
final_tree = gridSearch.best_estimator_
final_tree

In [None]:
# Unlabelled rows to predict for the submission.
test_url = 'https://raw.githubusercontent.com/rahgirrafi/kaggle/master/titanic/test.csv'
X_test_Data_ = pd.read_csv(test_url)

# Apply the pipeline as already fitted (transform, not fit_transform) so the
# submission rows go through the same preprocessing as the training data.
# A copy is passed so the raw frame stays untouched.
X_test_Data = pipeline.transform(X_test_Data_.copy())

# Fill missing fares with the mean fare of the labelled training data (df)
# rather than with a statistic of the rows being predicted.
X_test_Data['Fare'] = X_test_Data['Fare'].fillna(df['Fare'].mean())
X_test_Data



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,C,Q,S,female,male
0,3,34.50000,0,0,7.8292,0.0,1.0,0.0,0.0,1.0
1,3,47.00000,1,0,7.0000,0.0,0.0,1.0,1.0,0.0
2,2,62.00000,0,0,9.6875,0.0,1.0,0.0,0.0,1.0
3,3,27.00000,0,0,8.6625,0.0,0.0,1.0,0.0,1.0
4,3,22.00000,1,1,12.2875,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
413,3,30.27259,0,0,8.0500,0.0,0.0,1.0,0.0,1.0
414,1,39.00000,0,0,108.9000,1.0,0.0,0.0,1.0,0.0
415,3,38.50000,0,0,7.2500,0.0,0.0,1.0,0.0,1.0
416,3,30.27259,0,0,8.0500,0.0,0.0,1.0,0.0,1.0


In [None]:
# Scale with the scaler as last fitted (on the full labelled dataset).
# fit_transform here would instead standardise the submission rows by their
# own mean/std, on a different basis from the data the model was trained on.
X_test_final = scaler.transform(X_test_Data)


In [None]:
# Predict for the submission rows and write PassengerId + Survived to CSV.
preidctions = final_tree.predict(X_test_final)

final_df = X_test_Data_[['PassengerId']].assign(Survived=preidctions)
final_df.to_csv('predictions.csv', index=False)

# New Section