In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the passenger data from `tested.csv` (path is relative to the notebook's
# working directory) into `df`, then preview its first rows.
df = pd.read_csv("tested.csv")
df.head()

In [None]:
# Preview the first rows of `df` (same call as at the end of the load cell).
df.head()

In [None]:
# Show the dtype of every column in `df`.
df.dtypes

In [None]:
# Share of missing values in each column, expressed as a percentage.
df.isna().mean().mul(100)

In [None]:
# Class-balance table: one row per `Survived` value with its absolute count
# and its share of all rows in `df` (as a percentage).
survival_count = df["Survived"].value_counts()
survival_rate = survival_count.mul(100).div(len(df))
survival_data = (
    pd.concat([survival_count, survival_rate], axis=1)
    .reset_index()
    .set_axis(['Survived', 'count', 'Percentage'], axis=1)
)

survival_data

In [None]:
# Bar chart of the class balance stored in `survival_data`.
# The tick labels are derived from each row's `Survived` code rather than being
# hard-coded in positional order, so they stay correct whichever class happens
# to be listed first (value_counts orders rows by frequency). This also fixes
# the misspelled 'Not-Survivied' label.
status_labels = survival_data['Survived'].map({0: 'Not-Survived', 1: 'Survived'})
plt.figure(figsize=(5,3))
sns.barplot(survival_data.assign(Status=status_labels), x='Status', y='Percentage', width=0.35)
# Each bar is the share of all passengers in that class, not a survival rate.
plt.ylabel('Percentage of Passengers')
plt.xlabel('Survived Status')
plt.title('Percentage Survival of the Titanic Passengers');

In [None]:
# Helper: show how survival outcomes are distributed across the groups of a column.
def survival_acc_to_grp(df,colname):
    """Plot passenger counts per ``colname`` group, split by ``Survived``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Survived`` column and the column named by ``colname``.
    colname : str
        Name of the grouping column shown on the x-axis.

    Returns
    -------
    None
        The grouped bar chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(5,3))
    # Name the counts column explicitly so the plot does not depend on the
    # pandas version's default name for the value_counts() result.
    group_counts = df[['Survived',colname]].value_counts().reset_index(name='count')
    sns.barplot(group_counts,x=colname, y='count', hue='Survived')
    # Bars count passengers for both outcomes (hue), not only survivors.
    plt.ylabel('Number of Passengers')
    plt.title(f'Survival According to {colname}')

In [None]:
# Survival counts grouped by the `Pclass` column.
survival_acc_to_grp(df,'Pclass')

In [None]:
# Survival counts grouped by the `Sex` column.
survival_acc_to_grp(df,'Sex')

In [None]:
# Survival counts grouped by the `Embarked` column.
survival_acc_to_grp(df,'Embarked')

In [None]:
def vars_acc_to_surv(df,colname):
    """Draw kernel-density curves of ``colname``, one curve per ``Survived`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Survived`` column and the column named by ``colname``.
    colname : str
        Name of the numeric column whose distribution is plotted.

    Returns
    -------
    None
        The density plot is drawn on a new matplotlib figure.
    """
    figure_size = (5, 3)
    plt.figure(figsize=figure_size)
    # kdeplot returns the Axes it drew on, so the title can be set on it directly.
    density_ax = sns.kdeplot(data=df, x=colname, hue='Survived')
    density_ax.set_title(f'{colname} distribution according to the survival')

In [None]:
# Density of `Age` for each survival outcome.
vars_acc_to_surv(df, 'Age')

In [None]:
# Density of `Fare` for each survival outcome.
vars_acc_to_surv(df, 'Fare')

In [None]:
def count_acc_to_surv(df, colname):
    """Plot the number of passengers per ``colname`` value, split by ``Survived``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Survived`` column and the column named by ``colname``.
    colname : str
        Name of the discrete column shown on the x-axis (e.g. ``'SibSp'``).

    Returns
    -------
    None
        The grouped bar chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(5,3))
    # Group by the requested column (previously hard-coded to 'SibSp') and count
    # rows with size(), which does not depend on a 'PassengerId' column existing.
    group_counts = df.groupby([colname, 'Survived']).size().reset_index(name='count')
    sns.barplot(group_counts, x=colname, y='count', hue='Survived')
    plt.title(f'{colname} according to survival')
    plt.xlabel(f'{colname}')
    plt.ylabel('count')

In [None]:
# Call count_acc_to_surv for the `SibSp` column.
count_acc_to_surv(df,'SibSp')

In [None]:
# Call count_acc_to_surv for the `Parch` column.
count_acc_to_surv(df, 'Parch')

In [None]:
# Reshape the four numeric columns to long form (one row per variable/value
# pair) so they can be drawn as side-by-side box plots on a shared axis.
count_vars = pd.melt(df,value_vars=['Age','Fare','SibSp','Parch'])
plt.figure(figsize=(5,3))
sns.boxplot(count_vars, x='variable', y='value')
plt.xlabel('Continuous Variables')
plt.ylabel('Values')
# Title typo fixed ('thr' -> 'the').
plt.title('Comparison of the continuous variables');

In [None]:
# Remove the columns that are not used as model features.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution would otherwise raise KeyError.
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'], errors='ignore')
df.head()

In [None]:
# Discard the rows whose `Fare` is missing.
df = df.dropna(subset=['Fare'])

In [None]:
# Replace missing ages with the median age of the current frame.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [None]:
# Number of missing values remaining in each column.
df.isna().sum()

In [None]:
# Balance the two classes by oversampling: draw `Survived == 1` rows with
# replacement until there are as many as `Survived == 0` rows, then shuffle
# the combined frame. Both random draws are seeded.
# NOTE(review): the resampling happens before the train/test split further
# down, so copies of the same row may end up in both train and test and make
# the evaluation look better than it is — confirm this is intended.
is_survivor = df['Survived'] == 1
df_survived = df.loc[is_survivor]
df_not_survived = df.loc[df['Survived'] == 0]
df_survived_resampled = df_survived.sample(
    n=len(df_not_survived),
    replace=True,
    random_state=101,
)
df_balanced = pd.concat([df_not_survived, df_survived_resampled])
df_balanced = df_balanced.sample(frac=1, random_state=26)
df_balanced

In [None]:
# Row count per `Survived` value in the resampled frame.
df_balanced['Survived'].value_counts()

In [None]:
# Separate the target column (`Survived`) from the feature columns.
y = df_balanced['Survived']
x = df_balanced.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the rows for evaluation (seeded for reproducibility).
# train_test_split returns the pieces in the order
# (x_train, x_test, y_train, y_test); the names must be unpacked in that same
# order, otherwise the train and test labels are swapped.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1055)
print(f'''
x_train: {x_train.shape}
x_test: {x_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}''')

In [None]:
# Preprocessing + logistic-regression pipeline, fitted on the training split.
count_cols = ['Age', 'Fare']            # continuous features -> standardised
disc_cols = ['SibSp', 'Parch']          # small integer counts -> scaled to [0, 1]
cat_cols = ['Pclass','Sex','Embarked']  # categorical features -> one-hot encoded

# Every Pipeline step must be a (name, estimator) tuple.
continuous_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
discrete_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])
# String-valued categories such as `Sex` cannot go through a numeric scaler,
# so the encoder belongs on the categorical columns. drop='first' removes one
# dummy per feature to avoid perfectly collinear columns.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('count', continuous_transformer, count_cols),
        ('disc', discrete_transformer, disc_cols),
        ('cat', categorical_transformer, cat_cols)])
# The classifier step needs an estimator instance, not the class itself.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])
model.fit(x_train, y_train)

In [None]:
# Predict a label for every row of the held-out features and display the result.
y_pred = model.predict(x_test)
y_pred

In [None]:
# Text report comparing the held-out labels with the predictions; printed so
# the report's line breaks render instead of showing as escaped '\n'.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# confusion_matrix takes (y_true, y_pred): compare the held-out labels, not the
# feature matrix, against the predictions.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,3))
# fmt='d' prints the cell counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d')
plt.title('confusion matrix of the classification')
# scikit-learn lays the matrix out with true labels on the rows (heatmap
# y-axis) and predicted labels on the columns (heatmap x-axis).
plt.xlabel('predicted value')
plt.ylabel('True value');

In [None]:
# Serialise the fitted pipeline to disk with pickle.
# NOTE(review): the extension is '.pk1' (digit one), possibly a typo for
# '.pkl' — confirm against whatever loads this file before renaming it.
with open('./model_logistic.pk1', 'wb') as model_file:
    pickle.dump(model, model_file)