<a href="https://colab.research.google.com/github/pritiyadav888/pritiyadav888.github.io/blob/master/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# Silence library warnings so they do not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below.
warnings.filterwarnings('ignore')

# Display every column when a DataFrame is rendered (None lifts the column cap).
# Call the documented top-level `pd.set_option`; `pd.pandas` is not part of the
# public pandas API.
pd.set_option('display.max_columns', None)

In [6]:
# Read the training and test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test passenger ids: the cleaning step drops this column and the
# submission frame built at the end needs it.
test_ids = test.PassengerId

In [7]:
# Missing values per column of the raw training frame: absolute count and
# percentage of rows, both sorted from most to least affected.
null_flags = train.isnull()
null_counts = null_flags.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
year_of_death,282,14.3
race,199,10.1
age_group,15,0.8
symptom66,1,0.1
symptom13,1,0.1


In [None]:
def clean(data):
    """Return a cleaned copy of a passenger frame.

    Drops the ``Ticket``, ``PassengerId``, ``Name`` and ``Cabin`` columns,
    fills missing values in ``SibSp``, ``Parch``, ``Fare`` and ``Age`` with the
    median of that column (computed on the frame being cleaned), and fills
    missing ``Embarked`` values with ``"U"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If any of the columns to drop is absent, e.g. when the function is
        applied a second time to an already cleaned frame.
    """
    # drop() returns a new object, so the caller's frame stays untouched.
    data = data.drop(columns=["Ticket", "PassengerId", "Name", "Cabin"])

    # Assign the filled column back rather than calling
    # `data[col].fillna(..., inplace=True)`: the in-place form works on a
    # temporary Series and does not update `data` under pandas Copy-on-Write.
    for col in ["SibSp", "Parch", "Fare", "Age"]:
        data[col] = data[col].fillna(data[col].median())

    data["Embarked"] = data["Embarked"].fillna("U")
    return data

# Clean both splits with the same routine.
# NOTE: this cell rebinds `train` and `test`, so it cannot be re-run without
# reloading the CSVs first (clean() would not find the columns it drops).
train = clean(train)
test = clean(test)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print('\n Train file dataset info')
train.info()
print('\n\n Test file dataset info')
test.info()

In [None]:
# Repeat the missing-value summary on the cleaned training frame.
is_missing = train.isnull()

total = is_missing.sum().sort_values(ascending=False)
percent_1 = 100 * (is_missing.sum() / is_missing.count())
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(5)

In [None]:
# Survival rate by sex. The mean of `Survived` within a group is the share of
# survivors (assumes `Survived` is coded 0/1 -- TODO confirm).
female_survivor = train.loc[train.Sex == 'female', "Survived"]
rate_female = female_survivor.mean()
male_survivor = train.loc[train.Sex == 'male', "Survived"]
rate_male = male_survivor.mean()

# The labels announce a percentage, so scale the 0-1 fraction by 100 for
# display; `rate_female` / `rate_male` themselves stay fractions.
print("% of female who survived:", round(rate_female * 100, 2))
print("% of male who survived:", round(rate_male * 100, 2))

In [None]:
# Separate the columns into a numeric group and a categorical group.
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']

df_num = train[numeric_columns]
df_cat = train[categorical_columns]

In [None]:
# One histogram per numeric column, each shown as its own figure.
for column_name, column_values in df_num.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as a heatmap.
numeric_corr = df_num.corr()
sns.heatmap(numeric_corr)

In [None]:
# Aggregate each numeric column per `Survived` value
# (pivot_table uses the mean when no aggfunc is given).
train.pivot_table(index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])


In [None]:
# Bar chart of value frequencies for each categorical column.
for i in df_cat.columns:
    # Count once and reuse the result for both axes.
    counts = df_cat[i].value_counts()
    # Pass x and y by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the positional form raises a TypeError there.
    sns.barplot(x=counts.index, y=counts.values).set_title(i)
    plt.show()

In [None]:
# Row counts per `Survived` value, broken down by one categorical column at a
# time (each cell counts the non-null `Age` entries).
for position, breakdown in enumerate(['Pclass', 'Sex', 'Embarked']):
    if position:
        print()  # blank line between consecutive tables
    print(train.pivot_table(index='Survived', columns=breakdown,
                            values='Age', aggfunc='count'))

In [None]:
from sklearn import preprocessing

# Integer-encode the listed columns. For each column the encoder is fitted on
# the training split and that same mapping is applied to the test split, so a
# value present only in the test split makes transform() raise.
columns = ["Sex", "Embarked"]
le = preprocessing.LabelEncoder()

for feature in columns:
    le.fit(train[feature])
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])
    print(le.classes_)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Feature matrix and target vector.
X = train.drop(columns="Survived")
y = train.Survived

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

Logistic Regression

In [None]:
# Fit the logistic-regression model on the training split.
# fit() returns the estimator itself, so `clf` ends up bound to the fitted model.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf = clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the logistic-regression model on the hold-out split.
predictions = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# 5-fold cross-validation on the training split: per-fold scores, then their mean.
cv = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(cv, cv.mean(), sep="\n")

K Nearest Neighbour

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Score the KNN model itself on the hold-out split. Predicting with `clf`
# here would report the accuracy of whatever model `clf` is bound to
# (the logistic regression on a top-to-bottom run), not of `knn`.
predictions = knn.predict(X_test)
print(accuracy_score(y_test, predictions))

# 5-fold cross-validation on the training split: per-fold scores, then their mean.
cv = cross_val_score(knn, X_train, y_train, cv=5)
print(cv)
print(cv.mean())


Support Vector Classifier

In [None]:
# Fit a support-vector classifier on the training split.
# `probability=True` is kept from the original; nothing in this cell reads
# class probabilities.
svc = SVC(probability = True).fit(X_train, y_train)

# Score the SVC itself on the hold-out split. Predicting with `clf` here
# would report the accuracy of a different model.
predictions = svc.predict(X_test)
print(accuracy_score(y_test, predictions))

# 5-fold cross-validation on the training split: per-fold scores, then their mean.
cv = cross_val_score(svc, X_train, y_train, cv=5)
print(cv)
print(cv.mean())

Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf = rf_clf.fit(X_train, y_train)

# Accuracy on the hold-out split.
predictions = rf_clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))

# 5-fold cross-validation on the training split: per-fold scores, then their mean.
cv = cross_val_score(estimator=rf_clf, X=X_train, y=y_train, cv=5)
print(cv, cv.mean(), sep="\n")


Further model analysis: tree-based classifiers

In [None]:
from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with no depth limit. It gets its own name (`clf_dt`) so that
# it does not overwrite `clf`, which holds the logistic-regression model
# fitted earlier in the notebook.
clf_dt = DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                                random_state=0).fit(X_train, y_train)

# Accuracy on the hold-out split.
predictions = clf_dt.predict(X_test)
print(accuracy_score(y_test, predictions))

# Mean score over 5-fold cross-validation on the training split.
scores = cross_val_score(clf_dt, X_train, y_train, cv=5)
print(scores.mean())

In [None]:
# Random forest limited to 20 trees with a maximum depth of 10.
rf_params = dict(n_estimators=20, max_depth=10, min_samples_split=2, random_state=0)
clf_rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Accuracy on the hold-out split.
predictions = clf_rf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))

# Mean score over 5-fold cross-validation on the training split.
scores = cross_val_score(estimator=clf_rf, X=X_train, y=y_train, cv=5)
print(scores.mean())

In [None]:
# Extra-trees ensemble with the same size/depth settings as `clf_rf`. It gets
# its own name (`clf_et`) instead of rebinding `clf`, so models fitted in
# earlier cells stay reachable under their original names.
clf_et = ExtraTreesClassifier(n_estimators=20, max_depth=10,
                              min_samples_split=2, random_state=0).fit(X_train, y_train)

# Accuracy on the hold-out split.
predictions = clf_et.predict(X_test)
print(accuracy_score(y_test, predictions))

# Mean score over 5-fold cross-validation on the training split.
scores = cross_val_score(clf_et, X_train, y_train, cv=5)
print(scores.mean())

In [None]:
# Preview the first 25 rows of the cleaned, encoded test split.
test.head(n=25)

The best performer is the RandomForestClassifier without fine-tuning.

In [None]:
# Predict labels for the test split with the 20-tree random forest.
# NOTE(review): the notebook defines two forests, `rf_clf` (default settings)
# and `clf_rf` (n_estimators=20, max_depth=10) -- confirm `clf_rf` is the one
# intended here.
submission_preds = clf_rf.predict(X=test)
submission_preds

In [None]:
# Assemble the submission table: the saved passenger ids next to the
# predicted labels.
submission_columns = {
    "PassengerId": test_ids.values,
    "Survived": submission_preds,
}
df = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file, leaving out the DataFrame index.
df.to_csv(path_or_buf="submission.csv", index=False)

In [8]:
# Shell escape: create a Git repository in the notebook's working directory.
# NOTE(review): unrelated to the analysis above -- confirm it belongs in this notebook.
!git init

Initialized empty Git repository in /content/.git/
