In [None]:
# import libraries
import numpy as np                 #data processing :  numpy , pandas
import pandas as pd                 
import matplotlib.pyplot as plt # data visualization: matplotlib,seaborn
import seaborn as sns

In [None]:
import os

# Location of the Titanic CSV. The default is the original author's local path;
# set the TITANIC_CSV environment variable to load the file from somewhere else
# without editing this cell.
path = os.environ.get('TITANIC_CSV', 'C:/Users/vinod.sharma1/Desktop/titanic.csv')
titanic = pd.read_csv(path)

In [None]:
titanic.head() # head() is a method: shows the first rows. Column notes --
               #   Pclass: passenger class; SibSp: number of siblings or spouses travelling
               #   along; Parch: number of parents/children travelling along;
               #   NaN: "not a number", i.e. a missing value;
               #   Embarked: port where the passenger boarded the ship
               #   (C = Cherbourg; Q = Queenstown; S = Southampton)

In [None]:
titanic.tail()# tail() is a method: shows the last rows of the DataFrame

In [None]:
titanic.shape # shape is an attribute (no parentheses): gives the (rows, columns) counts

In [None]:
titanic.info() # column summary; author's note: Age, Cabin and Embarked have missing values

In [None]:
titanic.isna()  # boolean mask -- True: null (no value), False: not null (contains a value)

In [None]:
titanic.isna().sum()  # number of missing values in each column

In [None]:
type(titanic.isna().sum())  # type of the per-column missing-value counts

In [None]:
# Data visualization: draw the null mask as a heatmap, one row of the plot per row of data.
# Cells drawn in the contrasting colour mark the missing values.
sns.heatmap(data=titanic.isna())

In [None]:
# A heat map (or heatmap) is a data visualization technique that shows magnitude of a 
# phenomenon as color in two dimensions. The variation in color may be by hue or intensity, 
# giving obvious visual cues to the reader about how the phenomenon is clustered or varies
# over space

In [None]:
# In web analytics, a heatmap is a graphical way to visualize visitor behavior data in the form
# of hot and cold spots, employing a warm-to-cool color scheme. The warm colors indicate sections
# with the most visitor interaction, red being the area of highest interaction, and the cool
# colors indicate sections with the lowest interaction.

In [None]:
# Same missing-value heatmap with a different colour scheme (cmap selects the colormap)
sns.heatmap(data=titanic.isna(), cmap='viridis')

In [None]:
# Missing-value heatmap using the 'plasma' colormap
sns.heatmap(data=titanic.isna(), cmap='plasma')

In [None]:
# Missing-value heatmap using the 'cividis' colormap
sns.heatmap(data=titanic.isna(), cmap='cividis')

In [None]:
# countplot: Show the counts of observations in each categorical bin using bars

In [None]:
# Count of passengers per Survived value; author's reading: more people died than survived
sns.countplot(data=titanic, x='Survived')

In [None]:
# hue is used to split each bar on the basis of another attribute.
# x ('Survived') -- a numeric 0/1 column in the plots below
# hue -- a column with few distinct values, typically non-numeric (e.g. 'Sex')

In [None]:
# Survival counts split by sex. Author's reading of the plot:
#   among those who died, more were men than women;
#   among those who survived, more were women than men.
sns.countplot(data=titanic, x='Survived', hue='Sex')

In [None]:
# Survival counts split by passenger class. Author's reading of the plot:
#   died: mostly passengers from 3rd class;
#   survived: mostly passengers from 1st class.
sns.countplot(data=titanic, x='Survived', hue='Pclass')

In [None]:
# Passenger count per sex; author's reading: more men than women were on the ship
sns.countplot(data=titanic, x='Sex')

In [None]:
# Survival counts split by port of embarkation; author's reading: passengers who
# boarded at 'S' account for the largest number of deaths.
sns.countplot(data=titanic, x='Survived', hue='Embarked')

In [None]:
# Passenger count per SibSp value. Author's reading: most people travelled alone; the
# other bars show how many took siblings or a spouse along with them.
sns.countplot(data=titanic, x='SibSp')

In [None]:
# Passenger count per Parch value. Author's reading: most people travelled alone; the
# other bars show how many took parents or children along with them.
sns.countplot(data=titanic, x='Parch')

In [None]:
# distplot lets you show a histogram with a smooth density line drawn over it. A histogram is
# used when we want to show how the values of an attribute are distributed over ranges (bins).
# To remove the line that is drawn along with the histogram, turn off the kernel density
# estimate (KDE) by passing kde=False. (distplot is deprecated in newer seaborn releases;
# histplot is its replacement for histograms.)

In [None]:
# Age histogram with a KDE line. sns.distplot is deprecated (seaborn >= 0.11); histplot is the
# replacement. stat='density' keeps the density-scaled y-axis that distplot used with a KDE.
sns.histplot(titanic['Age'], kde=True, stat='density')

In [None]:
# Age histogram without the KDE line (histplot replaces the deprecated distplot; seaborn >= 0.11)
sns.histplot(titanic['Age'], kde=False)

In [None]:
# Fare histogram with 30 bins (histplot replaces the deprecated distplot; seaborn >= 0.11)
sns.histplot(titanic['Fare'], kde=False, bins=30)
# bins sets the number of vertical bars; the author suggests 25-50 as a general range

In [None]:
# Fare histogram with only 10 bins, for comparison with the 30-bin version
# (histplot replaces the deprecated distplot; seaborn >= 0.11)
sns.histplot(titanic['Fare'], kde=False, bins=10)

In [None]:
# Data cleaning means filtering and modifying your data such that it is easier to explore, 
# understand, and model. Filtering out the parts you don't want or need so that you don't need
# to look at or process them.
# The main aim of Data Cleaning is to identify and remove errors & duplicate data, in order to 
# create a reliable dataset. This improves the quality of the training data for analytics and 
# enables accurate decision-making.

In [None]:
# Missing values can be imputed (filled in) with a fixed constant, or with a statistic
# (mean, median or most frequent value) of the column that contains the gaps.
# Age imputation: look at the Age distribution within each passenger class first.
sns.boxplot(data=titanic, x='Pclass', y='Age')

In [None]:
#A box plot is a method for graphically depicting groups of numerical data through their 
#quartiles. The box extends from the Q1 to Q3 quartile values of the data, with a line at the 
#median (Q2). The whiskers extend from the edges of box to show the range of the data

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value based on passenger class.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Values in the order (age, pclass), e.g. one row of
        ``titanic[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The given age when it is present; otherwise 37 for class 1, 29 for class 2
    and 27 for any other class.
    """
    # Unpack by position through iteration instead of cols[0] / cols[1]:
    # integer-key lookup on a label-indexed Series is deprecated in pandas 2.x.
    age, pclass, *_ = cols

    if not pd.isnull(age):
        return age

    # Fill-in ages chosen by the original author per class
    # (presumably read off the Age-by-Pclass box plot -- TODO confirm).
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 27

In [None]:
# Run impute_age on every row's (Age, Pclass) pair and write the result back to Age
titanic['Age'] = titanic[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
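# Age imputation

# The function impute_age is applied to the two-column DataFrame titanic[['Age','Pclass']].
# axis=1 means the function is called once per row, so `cols` holds that row's Age and Pclass
# values; without it (axis=0, the default) the function would be called once per column.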


In [None]:
# Re-draw the missing-value heatmap after the Age imputation step
sns.heatmap(data=titanic.isna(), cmap='viridis')

In [None]:
# Remove the Cabin column. The result is assigned back to `titanic` so the change persists;
# errors='ignore' keeps the cell safe to re-run after Cabin has already been dropped
# (the in-place version raised KeyError on a second run).
titanic = titanic.drop(columns='Cabin', errors='ignore')

In [None]:
# Missing-value heatmap again, now that the Cabin column has been dropped
sns.heatmap(data=titanic.isna(), cmap='viridis')

In [None]:
# Passenger count per port of embarkation
sns.countplot(data=titanic, x='Embarked')

In [None]:
def emb_imput(cols):
    """Return the port of embarkation, filling a missing value with 'S'.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Values whose first element is the Embarked code, e.g. one row of
        ``titanic[['Embarked', 'Pclass']]`` as passed by
        ``DataFrame.apply(..., axis=1)``. Any further elements are ignored.

    Returns
    -------
    The given code when it is present, otherwise 'S'.
    """
    # Take the first element by iteration rather than cols[0]: integer-key lookup
    # on a label-indexed Series is deprecated in pandas 2.x.
    emb, *_ = cols

    # The original returned 'S' for classes 1, 2 and 3 alike and silently fell
    # through (returning None) for any other class value; the class therefore
    # carries no information and every missing value is filled with 'S'.
    if pd.isnull(emb):
        return 'S'
    return emb

In [None]:
# Run emb_imput on every row's (Embarked, Pclass) pair and write the result back to Embarked
titanic['Embarked'] = titanic[['Embarked','Pclass']].apply(emb_imput,axis=1)

In [None]:
# Final missing-value heatmap, drawn after the Embarked imputation step
sns.heatmap(data=titanic.isna(), cmap='viridis')

In [None]:
# Row/column counts after dropping Cabin and imputing Age and Embarked
titanic.shape

In [None]:
# Column summary after cleaning, to re-check the non-null counts
titanic.info()

In [None]:
# Preview the first rows of the cleaned data
titanic.head()

In [None]:
# Drop the columns which are not contributing in the analysis and are of no use 

In [None]:
# Remove columns that are not used in the analysis. Assigning the result back (instead of
# inplace=True) with errors='ignore' lets this cell be re-run without a KeyError.
titanic = titanic.drop(columns=['Name', 'Ticket', 'PassengerId'], errors='ignore')

In [None]:
# Preview after removing Name, Ticket and PassengerId
titanic.head()

In [None]:
# Sex and Embarked (S, C, Q) are stored as text, so convert them into numbers:
# first create their dummy (indicator) columns, then remove the original text columns from
# the data, and finally combine the data with the dummy columns.

In [None]:
# One-hot encode the two text columns as 0/1 integers.
# dtype=int is explicit because recent pandas versions return True/False columns by default.
sex = pd.get_dummies(titanic['Sex'], drop_first=True, dtype=int)
embark = pd.get_dummies(titanic['Embarked'], drop_first=True, dtype=int)
# drop_first: bool, default False.
# Whether to get k-1 dummies out of k categorical levels by removing the first level.

In [None]:
# titanic itself is unchanged at this point: Sex and Embarked are still text columns
titanic.head()

In [None]:
# Remove the original text columns now that their dummy columns exist.
# columns=... replaces the original axis=True, which only worked because True equals 1;
# errors='ignore' plus reassignment keeps the cell safe to re-run.
titanic = titanic.drop(columns=['Sex', 'Embarked'], errors='ignore')

In [None]:
# Preview after removing the Sex and Embarked text columns
titanic.head()

In [None]:
# Append the dummy columns side by side to the remaining data
titanic = pd.concat(objs=[titanic, sex, embark], axis='columns')

In [None]:
# First rows of the final table, including the dummy columns
titanic.head()

In [None]:
# Last rows of the final table
titanic.tail()

In [None]:
# Machine Learning: separate the target column (Y) from the feature columns (X)

Y = titanic['Survived']
X = titanic.drop(columns='Survived')
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state fixes the shuffle so that re-running
# the notebook produces the same split and therefore comparable metrics.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# X_train is the training data set (that includes the features). Y_train is the set of labels
# for all the data in X_train.

# test_size — This parameter decides the size of the data that has to be split as the test 
# dataset. This is given as a fraction. For example, if you pass 0.5 as the value, the dataset 
# will be split 50% as the test dataset.

# The training set is a subset of the data set used to train a model.
# X_train is the training data set. Y_train is the set of labels for all the data in X_train.
# The test set is a subset of the data set that you use to test your model after the model has
# gone through initial vetting by the validation set.

# X_test is the test data set. Y_test is the set of labels for all the data in X_test.
# The validation set is a subset of the data set (separate from the training set) that you use
# to adjust hyperparameters. (This notebook does not create a validation set.)

# A hyperparameter is a parameter whose value is set before the learning process begins.

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the library default: with unscaled features such as Fare the
# solver may stop before converging and emit a ConvergenceWarning.
# NOTE(review): confirm whether the warning appears with the default on this data.
model = LogisticRegression(max_iter=1000)

In [None]:
# Train the logistic regression model on the training features and labels
model.fit(X_train, Y_train)

In [None]:
# Predict the Survived label for every row of the held-out test features
predictions = model.predict(X_test)

In [109]:
# Display the predicted labels (an array of 0/1 values, one per test row)
predictions

array([1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1], dtype=int64)

In [112]:
from sklearn.metrics import classification_report

# Compare the predictions with the true test labels: per-class precision, recall, F1-score
# and support, plus overall accuracy. print() is used so the text table renders line by line.
print(classification_report(Y_test,predictions))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       178
           1       0.69      0.71      0.70        90

    accuracy                           0.79       268
   macro avg       0.77      0.77      0.77       268
weighted avg       0.80      0.79      0.80       268

