<img src="titanic.jpg" alt="titanic" height="100">


## Titanic - Machine Learning from Disaster

## ***Top 6%***

In [None]:
# import necessary modules 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns
# Default canvas size (width, height in inches) applied to every seaborn/matplotlib figure below
FIGURE_SIZE = (40, 15)
sns.set(rc={'figure.figsize': FIGURE_SIZE})

### Training Data

In [None]:
# Load the Titanic training set from the competition input folder and display it
TRAIN_CSV = "../input/titanic/train.csv"
data_train = pd.read_csv(TRAIN_CSV)
data_train

In [None]:
# Distribution of the training fares: 5-bin density histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is its
# documented replacement and draws the same kind of figure.
sns.histplot(x=data_train['Fare'], bins=5, kde=True, stat="density")

In [None]:
# Number of training passengers per embarkation port.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='Embarked', data=data_train)

In [None]:
# Missing-value overview of the raw training frame, drawn as a missingno matrix
import missingno as msno
msno.matrix(df=data_train)

In [None]:
# Missing-value heat map for the raw training frame
msno.heatmap(df=data_train)

In [None]:
# Pairwise correlation between the numeric training columns.
# Text columns are filtered out first: recent pandas versions no longer drop
# them silently in `DataFrame.corr()` and raise on string data instead.
corr = data_train.select_dtypes(include="number").corr()
sns.heatmap(corr)

In [None]:
# Per-column count of missing values in the training frame
data_train.isna().sum()

In [None]:
# Numeric gap: replace missing ages with the mean age (written back onto data_train)
mean_age = data_train['Age'].mean()
data_train['Age'] = data_train['Age'].fillna(mean_age)

# Remaining gaps: fill every column with its most frequently occurring value
df_most_common_imputed = data_train.apply(
    lambda col: col.fillna(col.value_counts().index[0])
)

# Set the target aside in `df`, then re-attach it so that Survived becomes the last column
df = df_most_common_imputed[["Survived"]].copy()
df_most_common_imputed = df_most_common_imputed.drop(columns=["Survived"])
df_most_common_imputed["Survived"] = df["Survived"]

# Null count per column after imputation
df_most_common_imputed.isna().sum()

In [None]:
# Imputation is done; redraw the missing-value matrix to confirm that no NA value is left
import missingno as msno
msno.matrix(df=df_most_common_imputed)

In [None]:
# Numeric confirmation: per-column null count of the imputed training frame
df_most_common_imputed.isna().sum()

In [None]:
# Encode object (string) columns as integers so the frame can be fitted to a model,
# then rank the features against the target with a chi-squared test.
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

le = preprocessing.LabelEncoder()
for column_name in df_most_common_imputed.columns:
    if df_most_common_imputed[column_name].dtype == object:
        # fit_transform re-fits the shared encoder on each column in turn
        df_most_common_imputed[column_name] = le.fit_transform(df_most_common_imputed[column_name])

# Features are every column except the last one; the target (Survived) is the last column.
# The target must not be part of X, otherwise it is scored against itself.
X = df_most_common_imputed.iloc[:, :-1]
y = df_most_common_imputed.iloc[:, -1]

# Score all features according to importance (k='all' keeps every column, whatever their number)
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# concat the two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Features', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(len(featureScores), 'Score'))
df_most_common_imputed

In [None]:
# Outlier check: one box per column of the encoded training frame
train_box_ax = sns.boxplot(data=df_most_common_imputed)
train_box_ax

In [None]:
# Pairwise scatter/distribution grid over the columns of the encoded training frame
sns.pairplot(data=df_most_common_imputed)

#### The next step would be outlier removal, but it may lead to lower accuracy in this case

In [None]:
# Disabled on purpose: z-score outlier filter that would keep only the rows whose
# every column lies within 3 standard deviations (|z| < 3) and store them in new_df.
# It is left commented out because removing outliers may lower accuracy in this case.
# df_most_common_imputed
# import pandas as pd
# from scipy import stats
# z_scores = stats.zscore(df_most_common_imputed)
# abs_z_scores = np.abs(z_scores)
# filtered_entries = (abs_z_scores < 3).all(axis=1)
# new_df = df_most_common_imputed[filtered_entries]
# new_df

## Testing Data

In [None]:
# Load the Titanic test set from the competition input folder and preview its first rows
TEST_CSV = "../input/titanic/test.csv"
data_test = pd.read_csv(TEST_CSV)
data_test.head()

In [None]:
# Per-column count of missing values in the test frame
data_test.isna().sum()

In [None]:
# Distribution of the test fares: 5-bin density histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is its
# documented replacement and draws the same kind of figure.
sns.histplot(x=data_test['Fare'], bins=5, kde=True, stat="density")

In [None]:
# Number of test passengers per embarkation port.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x='Embarked', data=data_test)

In [None]:
# Missing-value overview of the raw test frame, drawn as a missingno matrix
import missingno as msno
msno.matrix(df=data_test)

In [None]:
# Missing-value heat map for the raw test frame
msno.heatmap(df=data_test)

In [None]:
# Pairwise correlation between the numeric test columns.
# Text columns are filtered out first: recent pandas versions no longer drop
# them silently in `DataFrame.corr()` and raise on string data instead.
corr = data_test.select_dtypes(include="number").corr()
sns.heatmap(corr)

In [None]:
# Impute the test set with statistics learned from the TRAINING data, so that both
# frames are pre-processed with the same fill values and nothing is estimated from
# the rows we are about to predict. Works on a copy; data_test itself stays raw.
dt_most_common_imputed = data_test.copy()

# Numerical null values: filled with the training mean of the same column
for numeric_column in ('Age', 'Fare'):
    dt_most_common_imputed[numeric_column] = dt_most_common_imputed[numeric_column].fillna(
        data_train[numeric_column].mean()
    )

# Remaining (categorical) null values: filled with the most frequent training value
train_most_common = {
    name: data_train[name].value_counts().index[0]
    for name in dt_most_common_imputed.columns
}
dt_most_common_imputed = dt_most_common_imputed.fillna(value=train_most_common)
dt_most_common_imputed.isnull().sum()

In [None]:
# Imputation is done; redraw the missing-value matrix to confirm that no NA value is left
import missingno as msno
msno.matrix(df=dt_most_common_imputed)

In [None]:
# Numeric confirmation: per-column null count of the imputed test frame
dt_most_common_imputed.isna().sum()

In [None]:
# Outlier check: one box per column of the imputed test frame
test_box_ax = sns.boxplot(data=dt_most_common_imputed)
test_box_ax

In [None]:
# Pairwise scatter/distribution grid over the columns of the imputed test frame
sns.pairplot(data=dt_most_common_imputed)

In [None]:
# Dataframe for submission.
# It starts with the test PassengerIds only. The Survived column is added once the
# model has produced its predictions; seeding it with training labels would pair
# test passengers with the outcomes of unrelated training rows (index alignment).
dt = dt_most_common_imputed[['PassengerId']].copy()
dt

In [None]:
# Encode the object (string) columns of the test frame with the SAME integer codes
# that the training frame received. Fitting a fresh encoder on the test set would
# number the categories independently, so equal codes would not mean equal values.

# Rebuild the training frame as it looked right before it was label-encoded:
# every remaining gap filled with the column's most frequent value.
train_reference = data_train.apply(lambda col: col.fillna(col.value_counts().index[0]))

for column_name in dt_most_common_imputed.columns:
    if dt_most_common_imputed[column_name].dtype == object:
        # LabelEncoder numbers the classes in sorted order; reproduce that numbering
        train_classes = sorted(train_reference[column_name].unique())
        test_codes = pd.Categorical(
            dt_most_common_imputed[column_name], categories=train_classes
        ).codes
        # values never seen in training get the code -1
        dt_most_common_imputed[column_name] = test_codes.astype(int)

# Splitting data for training and testing: the target is the last training column
x_train = df_most_common_imputed.iloc[:, 0:-1]
y_train = df_most_common_imputed.iloc[:, -1]
x_test = dt_most_common_imputed

# The model is fitted on x_train and applied to x_test, so both must expose the
# same features in the same order
assert list(x_train.columns) == list(x_test.columns), "train/test feature columns differ"
x_test

In [None]:
# Applying the CatBoost classifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

# Honest accuracy estimate: hold out 20% of the labelled training rows (stratified
# on the target, fixed seed) and score a model fitted on the remaining 80%.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
val_clf = CatBoostClassifier(iterations=9, learning_rate=0.5)
val_clf.fit(x_fit, y_fit)
val_pred = val_clf.predict(x_val)
accuracy = accuracy_score(y_val, val_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(confusion_matrix(y_val, val_pred))

# Final model: fitted on every labelled row, then used to predict the test set
clf = CatBoostClassifier(iterations=9,learning_rate=0.5)
clf.fit(x_train, y_train)
cat = clf.predict(x_test)
dt["Survived"]=cat

# NOTE(review): gender_submission.csv appears to be the competition's sample
# submission rather than the true test labels - confirm. The figure below is
# therefore reported as agreement with that baseline, not as model accuracy.
sub=pd.read_csv("../input/titanic/gender_submission.csv")
baseline_agreement = accuracy_score(sub["Survived"], cat)
print("Agreement with gender_submission baseline: %.2f%%" % (baseline_agreement * 100.0))
print(confusion_matrix(sub["Survived"],cat))
dt.head()

In [None]:
# Scatter of the predicted Survived value against PassengerId for the submission frame
sns.scatterplot(x="PassengerId", y="Survived", data=dt)
plt.show()

In [None]:
# Summary of the submission frame (columns, non-null counts, dtypes) as a final check
dt.info()

In [None]:
# Uncomment to write the submission frame to cat_boost.csv (without the index column)
#dt.to_csv("cat_boost.csv",index=False)