# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import skew,kurtosis,ttest_ind
import squarify
import statistics as st

# Getting Data 

In [None]:
# Read the competition's training split into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-apr-2021/train.csv"
)
df

# Cleaning Data 

In [None]:
def DataCleaning(df):
    """Return a cleaned copy of a passenger frame.

    All work happens on a copy, so the frame passed in is left untouched.
    Plain assignment is used instead of ``df[col].fillna(..., inplace=True)``
    because that chained in-place form does not write back to the frame when
    pandas copy-on-write is active.

    Steps applied:
      * ``Age``: missing values are replaced by the column mean, then the
        column is cast to ``int64`` (fractional ages are truncated).
      * ``Fare``: missing values are replaced by the column mean.
      * ``Pclass``: cast to ``str``.
      * ``CabinLetter``: new column holding the first character of ``Cabin``;
        rows without a cabin get the letter ``'C'``.
      * ``Embarked``: missing values are replaced by ``'S'``.
      * ``Ticket``: dropped.

    No outlier removal is performed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Age``, ``Fare``, ``Pclass``,
        ``Cabin``, ``Embarked`` and ``Ticket``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    cleaned = df.copy()

    # Means are computed on the frame being cleaned, ignoring missing values.
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].mean()).astype('int64')
    cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].mean())
    cleaned['Pclass'] = cleaned['Pclass'].astype('str')

    # na_action='ignore' leaves missing cabins missing so they can be filled
    # with the default letter afterwards.
    cabin_letter = cleaned['Cabin'].map(lambda cabin: cabin[0], na_action='ignore')
    cleaned['CabinLetter'] = cabin_letter.fillna('C')

    cleaned['Embarked'] = cleaned['Embarked'].fillna('S')
    return cleaned.drop(columns=['Ticket'])

# Clean the training frame `df` and display the cleaned result.
df_cleaned=DataCleaning(df)
df_cleaned
# Disabled alternative: fill missing CabinLetter values with the column mode.
#df_cleaned['CabinLetter'].fillna(df_cleaned['CabinLetter'].mode(),inplace=True)

# Data Overview

In [None]:
# Show six randomly chosen rows of the cleaned frame.
df_cleaned.sample(n=6)

In [None]:
def _sample_survived(group):
    """Return six randomly chosen 'Survived' rows of one group."""
    return group[['Survived']].sample(n=6)


# Six random 'Survived' values drawn from each 'Sex' group.
df_cleaned.groupby('Sex').apply(_sample_survived)

# Descriptive Analysis

#### Frequency Table

In [None]:
# Relative frequency of each 'Sex' value, rounded to two decimals.
sex_column = df_cleaned['Sex']
sex_column.value_counts().div(sex_column.count()).round(2)

In [None]:
# Gender vs Survived: mean of 'Survived' within each 'Sex' group.
df_cleaned.groupby('Sex')['Survived'].mean().round(2)

In [None]:
# Mean 'Survived' per 'Sex' group, drawn as a pie chart.
# Slice labels are read from the group index rather than a hard-coded list, so
# every slice is labelled with the group it was computed from.
survival_by_sex = df_cleaned.groupby('Sex')['Survived'].mean().round(2)
data = survival_by_sex.to_numpy()
# NOTE: autopct prints each slice's share of the summed rates, not the
# survival rate of the group itself.
plt.pie(
    data.tolist(),
    labels=[str(label).capitalize() for label in survival_by_sex.index],
    autopct='%.0f%%',
)

In [None]:
# Pclass vs Survived: mean of 'Survived' within each passenger class.
df_cleaned.groupby('Pclass')['Survived'].mean().round(2)

In [None]:
# Age vs Survived: distribution of 'Age' for each 'Survived' value.
sns.violinplot(data=df_cleaned, x='Survived', y='Age')

In [None]:
# SibSp vs Survived: mean of 'Survived' for each 'SibSp' value.
df_cleaned.groupby('SibSp')['Survived'].mean().round(2)

In [None]:
# Bar plot of 'Survived' against 'SibSp'; columns are referenced by name
# through the `data` frame.
sns.barplot(data=df_cleaned, x='SibSp', y='Survived')

In [None]:
# Bar plot of 'Survived' against 'Parch'; columns are referenced by name
# through the `data` frame.
sns.barplot(data=df_cleaned, x='Parch', y='Survived')

In [None]:
# Fare vs Survived: distribution of 'Fare' for each 'Survived' value.
sns.violinplot(data=df_cleaned, x='Survived', y='Fare')

In [None]:
# CabinLetter vs Survived: count of each 'Survived' value per cabin letter.
df_cleaned.groupby('CabinLetter')['Survived'].value_counts()

In [None]:
# Embarked vs Survived: mean of 'Survived' for each 'Embarked' value.
df_cleaned.groupby('Embarked')['Survived'].mean().round(2)

### Descriptive Statistics

#### Measure of Central Tendency

In [None]:
# Summary statistics produced by describe(), rounded to two decimals.
df_cleaned.describe().round(decimals=2)

#### Measure of Variation

In [None]:
# Variation ratio: 1 minus the share of rows that carry the most frequent
# cabin letter.
cabin_letters = df_cleaned['CabinLetter']
modal_count = cabin_letters.value_counts().max()
v = 1 - modal_count / cabin_letters.count()
v
# Author's reading: CabinLetter looks roughly normally distributed.
# NOTE(review): a variation ratio only measures how dominant the mode is —
# confirm this reading against a frequency plot.

#### Skewness & Kurtosis and Boxplot 

In [None]:
# Disabled numeric checks (a negative skewness would mean most values sit at
# the high end of the range):
#   skew(df_cleaned['Fare'])
#   kurtosis(df_cleaned['Age'])
# Box plot of the 'Fare' column.
sns.boxplot(data=df_cleaned, x='Fare')

## Test of Homogeneity

In [None]:
# Contingency table of counts: rows are 'Sex', columns are 'Survived'.
dataset = pd.crosstab(index=df_cleaned['Sex'], columns=df_cleaned['Survived'])
print(dataset)

In [None]:
# Observed cell counts of the contingency table as a plain array.
Observed_Values = dataset.to_numpy()
print("Observed Values :-\n", Observed_Values)

In [None]:
# Run SciPy's chi-square test on the contingency table and display the result.
# NOTE(review): SciPy applies Yates' continuity correction by default when the
# table has one degree of freedom, so the statistic reported here can differ
# from an uncorrected hand computation — confirm which one is intended.
val = stats.chi2_contingency(observed=dataset)
val

In [None]:
# Element 3 of the chi2_contingency result: the expected cell counts.
Expected_Values=val[3]

In [None]:
# Degrees of freedom of the chi-square test: (rows - 1) * (columns - 1).
# The table's own shape is used rather than assuming a 2x2 table, so the value
# stays correct for contingency tables of any size.
no_of_rows, no_of_columns = dataset.shape
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom:-", ddof)
# Significance level used for the test decision.
alpha = 0.05

In [None]:
from scipy.stats import chi2

# Pearson chi-square statistic: sum over every cell of (O - E)^2 / E.
# `chi_square` holds the per-column contributions; adding up all of them
# (instead of exactly two entries) keeps the statistic correct for tables
# with any number of columns.
_observed = np.asarray(Observed_Values, dtype=float)
_expected = np.asarray(Expected_Values, dtype=float)
chi_square = ((_observed - _expected) ** 2 / _expected).sum(axis=0)
chi_square_statistic = chi_square.sum()

In [None]:
# Report the hand-computed chi-square statistic.
print("chi-square statistic:-",chi_square_statistic)

In [None]:
# Rejection threshold: the (1 - alpha) quantile of the chi-square distribution
# with `ddof` degrees of freedom.
confidence_level = 1 - alpha
critical_value = chi2.ppf(confidence_level, ddof)
print('critical_value:', critical_value)

In [None]:
# p-value: upper-tail probability of the chi-square distribution at the
# observed statistic. The survival function is used instead of 1 - cdf because
# that subtraction rounds to exactly 0.0 once the cdf is within floating-point
# precision of 1, which happens for large statistics.
p_value = chi2.sf(chi_square_statistic, ddof)
print('Significance level: ', alpha)
print('Degree of Freedom: ', ddof)
print('p-value:', p_value)

In [None]:
# Decision rule: reject H0 when the statistic reaches the critical value.
reject_null = chi_square_statistic >= critical_value
print(
    "Reject H0,There is a relationship between 2 categorical variables"
    if reject_null
    else "Retain H0,There is no relationship between 2 categorical variables"
)

### Correlation

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are removed first: the cleaned frame holds string columns
# (for example 'Pclass' is cast to str during cleaning), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently skipping them.
numeric_columns = df_cleaned.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True)

# Preparing Data To Be Trained

### Encoding categorical data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the cleaned rows for evaluation; the fixed random_state
# keeps the split reproducible.
df_train, df_test = train_test_split(
    df_cleaned,
    test_size=0.25,
    random_state=0,
)


#Cat_Ft=df_train[['Pclass','Sex','CabinLetter','Embarked']]
#Num_Ft=df_train[['Age','SibSp','Parch','Fare']]

#from sklearn.preprocessing import OneHotEncoder

#one_hot_encoder = OneHotEncoder(drop='first')
#Cat_Features = one_hot_encoder.fit_transform(Cat_Ft).todense()
#Cat_Features=pd.DataFrame(Cat_Features, columns=one_hot_encoder.get_feature_names())
#Cat_Features

### Scaling data

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#Num_Features = scaler.fit_transform(Num_Ft)
#Num_Features=pd.DataFrame(Num_Features, columns=Num_Ft.columns)
#Num_Features

In [None]:
def features_transform(d, encoder=None, scaler=None):
    """Build the model's feature matrix from a cleaned passenger frame.

    Categorical columns (``Pclass``, ``Sex``, ``CabinLetter``, ``Embarked``)
    are one-hot encoded and numeric columns (``Age``, ``SibSp``, ``Parch``,
    ``Fare``) are scaled. The two parts are joined side by side, numeric
    columns first.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing the eight columns listed above.
    encoder : optional
        Already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out`` (or the legacy ``get_feature_names``).
        When omitted, a new ``OneHotEncoder(drop='first')`` is fitted on ``d``.
    scaler : optional
        Already-fitted scaler exposing ``transform``. When omitted, a new
        ``StandardScaler`` is fitted on ``d``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``d``, with a fresh 0..n-1 index (the index of
        ``d`` is not carried over).

    Notes
    -----
    With the defaults, every call fits on the frame it receives, so two calls
    on different frames can produce different columns and scalings. Pass the
    same fitted ``encoder`` and ``scaler`` to keep the outputs comparable.
    """
    Cat_Ft = d[['Pclass', 'Sex', 'CabinLetter', 'Embarked']]
    Num_Ft = d[['Age', 'SibSp', 'Parch', 'Fare']]

    if encoder is None:
        from sklearn.preprocessing import OneHotEncoder

        encoder = OneHotEncoder(drop='first')
        encoded = encoder.fit_transform(Cat_Ft)
    else:
        encoded = encoder.transform(Cat_Ft)
    # Densify sparse output into a regular ndarray (not the legacy np.matrix).
    encoded = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on releases that predate it.
    if hasattr(encoder, 'get_feature_names_out'):
        cat_names = encoder.get_feature_names_out()
    else:
        cat_names = encoder.get_feature_names()
    Cat_Features = pd.DataFrame(encoded, columns=cat_names)

    if scaler is None:
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        scaled = scaler.fit_transform(Num_Ft)
    else:
        scaled = scaler.transform(Num_Ft)
    Num_Features = pd.DataFrame(scaled, columns=Num_Ft.columns)

    # Both parts carry a default RangeIndex, so they align row by row.
    return pd.concat([Num_Features, Cat_Features], axis=1)

In [None]:
# Feature matrices and 'Survived' targets for the training and hold-out splits.
# NOTE(review): each features_transform call fits its own encoder and scaler on
# the frame it is given, so the hold-out features are encoded and scaled
# independently of the training ones.
X_train, y_train = features_transform(df_train), df_train['Survived']
X_test, y_test = features_transform(df_test), df_test['Survived']


# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
#from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier

# Fit the logistic-regression model and a most-frequent-class baseline on the
# same training data (fit() returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)
baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

#from sklearn.svm import SVC
#model = SVC(kernel = 'linear', random_state = 0)
#model.fit(X_train, y_train)



In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
)

# Classification report of the fitted model on the hold-out split.
test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))
#print(classification_report(y_test, baseline.predict(X_test)))

In [None]:
# Hard class predictions, kept for the (disabled) accuracy check.
y_pred = model.predict(X_test)
#print(accuracy_score(y_test, y_pred))

# ROC AUC is a ranking metric, so it is scored on the predicted probability of
# the positive class (column 1 of predict_proba). Feeding it hard 0/1 labels
# leaves the curve with a single operating point and understates the score.
y_score = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))

## Submission

In [None]:
# Refit the model on every labelled row before predicting for the submission.
X_train, y_train = features_transform(df_cleaned), df_cleaned['Survived']

model.fit(X_train, y_train)

In [None]:
# Load the competition test split and clean it with the same routine as the
# training data.
dd = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
df_test_cleaned = DataCleaning(dd)

# NOTE(review): features_transform fits a fresh encoder and scaler on this
# frame, so its columns match the training matrix only if the test split has
# the same category levels — verify.
X_test = features_transform(df_test_cleaned)

# Submission frame: passenger ids paired with the model's predicted labels.
sumbission_df = df_test_cleaned[['PassengerId']].assign(
    Survived=model.predict(X_test)
)
sumbission_df

In [None]:
# Write the submission frame to CSV without the index column.
sumbission_df.to_csv('Titanic_Submission.csv',index=False)